diff --git a/BUILD.bazel b/BUILD.bazel index f31cf1417e1..7f6ec2d1db7 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -233,7 +233,6 @@ MICROKERNEL_HDRS = [ "src/xnnpack/pad.h", "src/xnnpack/pavgpool.h", "src/xnnpack/ppmm.h", - "src/xnnpack/prelu.h", "src/xnnpack/quantization.h", "src/xnnpack/raddexpminusmax.h", "src/xnnpack/raddextexp.h", @@ -244,9 +243,6 @@ MICROKERNEL_HDRS = [ "src/xnnpack/unpool.h", "src/xnnpack/vbinary.h", "src/xnnpack/vcvt.h", - "src/xnnpack/vhswish.h", - "src/xnnpack/vlog.h", - "src/xnnpack/vlrelu.h", "src/xnnpack/vmulcaddc.h", "src/xnnpack/vscaleexpminusmax.h", "src/xnnpack/vscaleextexp.h", @@ -323,6 +319,7 @@ xnnpack_cc_library( deps = [ ":common", ":config_hdrs", + ":fp16", ":math", ":memory", ":microparams", @@ -1247,6 +1244,18 @@ config_setting( define_values = {"xnn_enable_avx512skx": "false"}, ) +# Enables usage of Intel AVX512VBMI (evex512) kernels. +config_setting( + name = "xnn_enable_avx512vbmi_explicit_true", + define_values = {"xnn_enable_avx512vbmi": "true"}, +) + +# Disables usage of Intel AVX512VBMI (evex512) kernels. +config_setting( + name = "xnn_enable_avx512vbmi_explicit_false", + define_values = {"xnn_enable_avx512vbmi": "false"}, +) + # Enables usage of Intel AVX512VNNI (evex512) kernels. config_setting( name = "xnn_enable_avx512vnni_explicit_true", @@ -1704,6 +1713,22 @@ alias( }), ) +selects.config_setting_group( + name = "avx512vbmi_enabled_by_default", + match_any = [ + "//build_config:x86", + ], +) + +alias( + name = "avx512vbmi_enabled", + actual = select({ + ":xnn_enable_avx512vbmi_explicit_true": ":xnn_enable_avx512vbmi_explicit_true", + ":xnn_enable_avx512vbmi_explicit_false": ":xnn_enable_avx512vbmi_explicit_true", + "//conditions:default": ":avx512vbmi_enabled_by_default", + }), +) + selects.config_setting_group( name = "avx512vnni_enabled_by_default", match_any = [ diff --git a/CMakeLists.txt b/CMakeLists.txt index 039de1be556..00147e0d6de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,16 @@ ELSEIF(CMAKE_C_COMPILER_ID STREQUAL "Clang") SET(XNNPACK_ENABLE_AVX512SKX OFF) ENDIF() ENDIF() +OPTION(XNNPACK_ENABLE_AVX512VBMI "Build XNNPACK with AVX512VBMI micro-kernels" ON) +IF(CMAKE_C_COMPILER_ID STREQUAL "GNU") + IF(CMAKE_C_COMPILER_VERSION VERSION_LESS "8") + SET(XNNPACK_ENABLE_AVX512VBMI OFF) + ENDIF() +ELSEIF(CMAKE_C_COMPILER_ID STREQUAL "Clang") + IF(CMAKE_C_COMPILER_VERSION VERSION_LESS "6") + SET(XNNPACK_ENABLE_AVX512VBMI OFF) + ENDIF() +ENDIF() OPTION(XNNPACK_ENABLE_AVX512VNNI "Build XNNPACK with AVX512VNNI micro-kernels" ON) IF(CMAKE_C_COMPILER_ID STREQUAL "GNU") IF(CMAKE_C_COMPILER_VERSION VERSION_LESS "8") @@ -278,6 +288,7 @@ ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX256VNNI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512F=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512SKX=$") +ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512VBMI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512VNNI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512VNNIGFNI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512AMX=$") @@ -418,7 +429,6 @@ SET(OPERATOR_SRCS src/operators/global-average-pooling-nwc.c src/operators/lut-elementwise-nc.c src/operators/max-pooling-nhwc.c - src/operators/prelu-nc.c src/operators/reduce-nd.c src/operators/resize-bilinear-nchw.c src/operators/resize-bilinear-nhwc.c @@ -464,7 +474,6 @@ SET(SUBGRAPH_SRCS src/subgraph/log.c src/subgraph/max-pooling-2d.c src/subgraph/negate.c - src/subgraph/prelu.c src/subgraph/reciprocal-square-root.c src/subgraph/reshape-helpers.c src/subgraph/scaled-dot-product-attention.c @@ -508,7 +517,6 @@ 
SET(XNNPACK_SRCS src/configs/lut32norm-config.c src/configs/maxpool-config.c src/configs/pavgpool-config.c - src/configs/prelu-config.c src/configs/raddstoreexpminusmax-config.c src/configs/reduce-config.c src/configs/rmax-config.c @@ -657,7 +665,6 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_F16C_MICROKERNEL_SRCS}) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_FMA3_MICROKERNEL_SRCS}) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX2_MICROKERNEL_SRCS}) - LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512VBMI_MICROKERNEL_SRCS}) IF(XNNPACK_ENABLE_AVX512AMX) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512AMX_MICROKERNEL_SRCS}) ENDIF() @@ -685,6 +692,9 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") IF(XNNPACK_ENABLE_AVX512SKX) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512SKX_MICROKERNEL_SRCS}) ENDIF() + IF(XNNPACK_ENABLE_AVX512VBMI) + LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512VBMI_MICROKERNEL_SRCS}) + ENDIF() IF(XNNPACK_ENABLE_AVX512VNNI) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512VNNI_MICROKERNEL_SRCS}) ENDIF() @@ -702,7 +712,6 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_F16C_MICROKERNEL_SRCS}) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_FMA3_MICROKERNEL_SRCS}) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX2_MICROKERNEL_SRCS}) - LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512VBMI_MICROKERNEL_SRCS}) IF(XNNPACK_ENABLE_AVX512AMX) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512AMX_MICROKERNEL_SRCS}) ENDIF() @@ -730,6 +739,9 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") IF(XNNPACK_ENABLE_AVX512SKX) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512SKX_MICROKERNEL_SRCS}) ENDIF() + IF(XNNPACK_ENABLE_AVX512VBMI) + LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512VBMI_MICROKERNEL_SRCS}) + ENDIF() IF(XNNPACK_ENABLE_AVX512VNNI) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512VNNI_MICROKERNEL_SRCS}) ENDIF() @@ -836,8 +848,8 @@ IF(XNNPACK_BUILD_LIBRARY) TARGET_LINK_LIBRARIES(operator-run PRIVATE xnnpack-base logging) TARGET_LINK_LIBRARIES(operator-utils PRIVATE xnnpack-base logging) TARGET_LINK_LIBRARIES(subgraph PRIVATE xnnpack-base allocator logging memory mutex operators operator-run) - TARGET_LINK_LIBRARIES(XNNPACK PRIVATE xnnpack-base allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph) - TARGET_LINK_LIBRARIES(XNNPACK PUBLIC pthreadpool) + TARGET_LINK_LIBRARIES(XNNPACK PRIVATE xnnpack-base allocator cache hardware-config indirection memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph) + TARGET_LINK_LIBRARIES(XNNPACK PUBLIC pthreadpool logging) SET_TARGET_PROPERTIES(XNNPACK PROPERTIES C_EXTENSIONS YES) ENDIF() IF(NOT MSVC) @@ -984,7 +996,7 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") ENDIF() # Set `XNN_LOG_LEVEL` transitively for all targets that depend on `logging`. -TARGET_COMPILE_DEFINITIONS(logging PUBLIC "XNN_LOG_LEVEL=$<$:4>$<$>:0>") +TARGET_COMPILE_DEFINITIONS(logging PUBLIC "XNN_LOG_LEVEL=$<$:5>$<$>:0>") IF(MSVC) # Even though MSVC has __restrict, it can't be used in all the same contexts as the C99 restrict keyword @@ -1033,9 +1045,9 @@ ELSE() ENDIF() IF(XNNPACK_BUILD_ALL_MICROKERNELS) - TARGET_INCLUDE_DIRECTORIES(microkernels-all PRIVATE . 
include src) + TARGET_INCLUDE_DIRECTORIES(microkernels-all PRIVATE include src) ENDIF() -TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE . include src) +TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src ${CPUINFO_SOURCE_DIR}/include) TARGET_INCLUDE_DIRECTORIES(indirection PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(microparams-init PRIVATE include src) @@ -1044,13 +1056,13 @@ TARGET_INCLUDE_DIRECTORIES(packing PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(logging PRIVATE include src) IF(XNNPACK_BUILD_LIBRARY) TARGET_INCLUDE_DIRECTORIES(XNNPACK PUBLIC include) - TARGET_INCLUDE_DIRECTORIES(XNNPACK PRIVATE . src) + TARGET_INCLUDE_DIRECTORIES(XNNPACK PRIVATE src) TARGET_INCLUDE_DIRECTORIES(allocator PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(cache PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(microkernel-utils PRIVATE include src) - TARGET_INCLUDE_DIRECTORIES(subgraph PRIVATE . include src) - TARGET_INCLUDE_DIRECTORIES(operators PRIVATE . include src) - TARGET_INCLUDE_DIRECTORIES(operator-run PRIVATE . include src) + TARGET_INCLUDE_DIRECTORIES(subgraph PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(operators PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(operator-run PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(operator-utils PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(memory PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(mutex PRIVATE include src) @@ -1214,7 +1226,7 @@ IF(XNNPACK_BUILD_TESTS) ADD_LIBRARY(next-prime STATIC test/next_prime.cc) ADD_LIBRARY(gemm-microkernel-tester STATIC test/gemm-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(gemm-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(gemm-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(gemm-microkernel-tester PRIVATE xnnpack-base pthreadpool GTest::gtest) TARGET_LINK_LIBRARIES(gemm-microkernel-tester PRIVATE packing) IF(XNNPACK_ENABLE_KLEIDIAI) @@ -1223,25 +1235,25 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(gemm-microkernel-tester PUBLIC next-prime) ADD_LIBRARY(unary-operator-tester STATIC test/unary-operator-tester.cc) - TARGET_INCLUDE_DIRECTORIES(unary-operator-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(unary-operator-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(unary-operator-tester PRIVATE XNNPACK pthreadpool GTest::gtest) ADD_LIBRARY(dwconv-microkernel-tester STATIC test/dwconv-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(dwconv-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(dwconv-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(dwconv-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) TARGET_LINK_LIBRARIES(dwconv-microkernel-tester PUBLIC next-prime) ADD_LIBRARY(vbinary-microkernel-tester STATIC test/vbinary-microkernel-tester.cc) SET_TARGET_PROPERTIES(vbinary-microkernel-tester PROPERTIES CXX_EXTENSIONS YES) - TARGET_INCLUDE_DIRECTORIES(vbinary-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(vbinary-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(vbinary-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) ADD_LIBRARY(vcvt-microkernel-tester STATIC test/vcvt-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(vcvt-microkernel-tester PRIVATE . 
include src test) + TARGET_INCLUDE_DIRECTORIES(vcvt-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(vcvt-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) ADD_LIBRARY(vunary-microkernel-tester STATIC test/vunary-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(vunary-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(vunary-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(vunary-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) TARGET_LINK_LIBRARIES(vunary-microkernel-tester PUBLIC next-prime) @@ -1250,7 +1262,7 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(convolution-test-helpers PRIVATE xnnpack-base) ADD_LIBRARY(packq-microkernel-tester STATIC test/packq-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(packq-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(packq-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(packq-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) IF(XNNPACK_ENABLE_KLEIDIAI) TARGET_LINK_LIBRARIES(packq-microkernel-tester PRIVATE kleidiai) @@ -1269,7 +1281,7 @@ IF(XNNPACK_BUILD_TESTS) ) FOREACH(TEST ${SHARDED_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE GTest::gtest GTest::gtest_main @@ -1295,7 +1307,7 @@ IF(XNNPACK_BUILD_TESTS) ) FOREACH(TEST ${LIBRARY_SHARDED_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE GTest::gmock GTest::gtest @@ -1305,9 +1317,6 @@ IF(XNNPACK_BUILD_TESTS) ENDFOREACH() # ---[ Build size tests - ADD_EXECUTABLE(operator-size-test test/operator-size.c) - TARGET_LINK_LIBRARIES(operator-size-test PRIVATE XNNPACK) - ADD_EXECUTABLE(subgraph-size-test test/subgraph-size.c) TARGET_LINK_LIBRARIES(subgraph-size-test PRIVATE XNNPACK) @@ -1402,7 +1411,6 @@ IF(XNNPACK_BUILD_TESTS) log max-pooling-2d negate - prelu reciprocal-square-root reshape-helpers sigmoid @@ -1473,7 +1481,6 @@ IF(XNNPACK_BUILD_TESTS) f16-gavgpool-minmax f16-ibilinear-chw f16-ibilinear - f16-prelu f16-raddstoreexpminusmax f16-rmax f16-rsum @@ -1486,7 +1493,6 @@ IF(XNNPACK_BUILD_TESTS) f32-gavgpool-minmax f32-ibilinear-chw f32-ibilinear - f32-prelu f32-raddexpminusmax f32-raddextexp f32-raddstoreexpminusmax @@ -1534,7 +1540,7 @@ IF(XNNPACK_BUILD_TESTS) xx-pad) FOREACH(TEST ${MICROKERNEL_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE GTest::gmock GTest::gtest @@ -1573,7 +1579,7 @@ IF(XNNPACK_BUILD_TESTS) qu8-dwconv-minmax-unipass-rndnu) FOREACH(TEST ${MICROKERNEL_DWCONV_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE dwconv-microkernel-tester GTest::gmock @@ -1627,7 +1633,7 @@ IF(XNNPACK_BUILD_TESTS) FILE(GLOB TEST_SOURCES "test/${TEST}*.cc") IF(TEST_SOURCES) ADD_EXECUTABLE(${TEST}-test ${TEST_SOURCES}) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . 
include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE gemm-microkernel-tester GTest::gmock @@ -1648,7 +1654,7 @@ IF(XNNPACK_BUILD_TESTS) x8-packq) FOREACH(TEST ${MICROKERNEL_PACKQ_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE packq-microkernel-tester GTest::gmock @@ -1720,7 +1726,7 @@ IF(XNNPACK_BUILD_TESTS) s32-vmulc) FOREACH(TEST ${MICROKERNEL_VBINARY_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE vbinary-microkernel-tester GTest::gmock @@ -1752,7 +1758,7 @@ IF(XNNPACK_BUILD_TESTS) u32-f32-vcvt) FOREACH(TEST ${MICROKERNEL_VCVT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE vcvt-microkernel-tester GTest::gmock @@ -1805,7 +1811,7 @@ IF(XNNPACK_BUILD_TESTS) u8-vclamp) FOREACH(TEST ${MICROKERNEL_VUNARY_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE vunary-microkernel-tester GTest::gmock @@ -1885,7 +1891,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) ENDIF() ADD_LIBRARY(bench-utils STATIC bench/utils.cc) - TARGET_INCLUDE_DIRECTORIES(bench-utils PRIVATE .) TARGET_INCLUDE_DIRECTORIES(bench-utils PUBLIC include src) TARGET_LINK_LIBRARIES(bench-utils PRIVATE benchmark::benchmark cpuinfo pthreadpool) TARGET_LINK_LIBRARIES(bench-utils PRIVATE xnnpack-base hardware-config) @@ -1895,14 +1900,14 @@ IF(XNNPACK_BUILD_BENCHMARKS) # Helper libraries ADD_LIBRARY(packq-benchmark STATIC bench/packq-benchmark.cc) - TARGET_INCLUDE_DIRECTORIES(packq-benchmark PRIVATE . include src bench) + TARGET_INCLUDE_DIRECTORIES(packq-benchmark PRIVATE include src bench) TARGET_LINK_LIBRARIES(packq-benchmark PRIVATE XNNPACK benchmark::benchmark bench-utils) IF(XNNPACK_ENABLE_KLEIDIAI) TARGET_LINK_LIBRARIES(packq-benchmark PRIVATE kleidiai) ENDIF() ADD_LIBRARY(gemm-benchmark STATIC bench/gemm-benchmark.cc) - TARGET_INCLUDE_DIRECTORIES(gemm-benchmark PRIVATE . include src bench) + TARGET_INCLUDE_DIRECTORIES(gemm-benchmark PRIVATE include src bench) TARGET_LINK_LIBRARIES(gemm-benchmark PRIVATE XNNPACK benchmark::benchmark bench-utils) IF(XNNPACK_ENABLE_KLEIDIAI) TARGET_LINK_LIBRARIES(gemm-benchmark PUBLIC kleidiai) @@ -1921,11 +1926,10 @@ IF(XNNPACK_BUILD_BENCHMARKS) bench/models/fp32-mobilenet-v3-small.cc bench/models/qs8-mobilenet-v2.cc) SET_TARGET_PROPERTIES(models PROPERTIES CXX_EXTENSIONS YES) - TARGET_INCLUDE_DIRECTORIES(models PRIVATE .) TARGET_LINK_LIBRARIES(models PRIVATE XNNPACK) ADD_EXECUTABLE(bench-models bench/models/benchmark.cc) - TARGET_INCLUDE_DIRECTORIES(bench-models PRIVATE .) 
+ TARGET_INCLUDE_DIRECTORIES(bench-models PRIVATE bench) TARGET_LINK_LIBRARIES(bench-models PRIVATE bench-utils benchmark::benchmark @@ -1949,7 +1953,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) leaky-relu max-pooling negate - prelu reciprocal-square-root sigmoid softmax @@ -1959,7 +1962,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) tanh) FOREACH(BENCH ${LIBRARY_OPERATOR_BENCHMARKS}) ADD_EXECUTABLE(${BENCH}-bench bench/${BENCH}.cc) - TARGET_INCLUDE_DIRECTORIES(${BENCH}-bench PRIVATE .) TARGET_LINK_LIBRARIES(${BENCH}-bench PRIVATE bench-utils benchmark::benchmark @@ -2055,7 +2057,7 @@ IF(XNNPACK_BUILD_BENCHMARKS) xx-transposev) FOREACH(BENCH ${MICROKERNEL_BENCHMARKS}) ADD_EXECUTABLE(${BENCH}-bench bench/${BENCH}.cc) - TARGET_INCLUDE_DIRECTORIES(${BENCH}-bench PRIVATE . include src) + TARGET_INCLUDE_DIRECTORIES(${BENCH}-bench PRIVATE include src) TARGET_LINK_LIBRARIES(${BENCH}-bench PRIVATE bench-utils benchmark::benchmark diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index 9e0a6862e39..bb6e122d1e9 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -51,6 +51,8 @@ OPERATOR_BENCHMARK_DEPS = [ "//:math", ] +############################### Helper libraries ############################### + xnnpack_cxx_library( name = "bench_utils", srcs = ["utils.cc"], @@ -98,8 +100,6 @@ cc_library( ], ) -######################### Benchmarks for micro-kernels ######################### - xnnpack_cxx_library( name = "gemm_benchmark", srcs = [ @@ -115,6 +115,29 @@ xnnpack_cxx_library( ], ) +xnnpack_cxx_library( + name = "packw_benchmark", + hdrs = [ + "packw-benchmark.h", + ], + deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + "@com_google_benchmark//:benchmark", + ], +) + +xnnpack_cxx_library( + name = "bgemm", + hdrs = [ + "bgemm.h", + ], + deps = MICROKERNEL_BENCHMARK_DEPS + [ + "@com_google_benchmark//:benchmark", + ], +) + +######################### Benchmarks for micro-kernels ######################### + [xnnpack_benchmark( name = "%s_bench" % kernel, srcs = [ @@ -167,12 +190,12 @@ xnnpack_cxx_library( xnnpack_benchmark( name = "f32_bgemm_bench", srcs = [ - "bgemm.h", "f32-bgemm.cc", ], copts = xnnpack_optional_ruy_copts(), tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", "//:allocator", ] + xnnpack_optional_ruy_deps(), ) @@ -192,6 +215,19 @@ xnnpack_benchmark( ]), ) +xnnpack_benchmark( + name = "qp8_f32_qb4w_gemm", + srcs = ["qp8-f32-qb4w-gemm.cc"], + defines = xnnpack_kleidiai_defines(), + tags = xnnpack_slow_benchmark_tags(), + deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":gemm_benchmark", + "//:isa_checks", + ] + xnnpack_if_kleidiai_enabled([ + "@KleidiAI//kai/ukernels/matmul", + ]), +) + [xnnpack_benchmark( name = "%s_bench" % kernel, srcs = [ @@ -292,6 +328,7 @@ xnnpack_benchmark( srcs = [ "qs8-dwconv.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ ":dwconv", "//:indirection", @@ -470,11 +507,11 @@ xnnpack_benchmark( xnnpack_cxx_library( name = "packq_benchmark", srcs = [ - "bgemm.h", "packq-benchmark.cc", ], hdrs = ["packq-benchmark.h"], deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", "@com_google_benchmark//:benchmark", ], ) @@ -482,10 +519,11 @@ xnnpack_cxx_library( xnnpack_benchmark( name = "x8_packq_bench", srcs = [ - "bgemm.h", "x8-packq.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", ":packq_benchmark", "//:allocator", ], @@ -494,11 +532,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "x8_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "x8-packw.cc", ], + tags = 
xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) @@ -506,11 +545,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "qs8_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "qs8-packw.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) @@ -518,11 +558,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "x16_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "x16-packw.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) @@ -530,11 +571,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "x32_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "x32-packw.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) diff --git a/bench/abs.cc b/bench/abs.cc index c33171ddc14..03864613091 100644 --- a/bench/abs.cc +++ b/bench/abs.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/average-pooling.cc b/bench/average-pooling.cc index fa939165fc3..6c3deca8616 100644 --- a/bench/average-pooling.cc +++ b/bench/average-pooling.cc @@ -27,7 +27,7 @@ #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE -#include "bench/utils.h" +#include "utils.h" static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net) { const size_t batch_size = state.range(0); diff --git a/bench/bankers-rounding.cc b/bench/bankers-rounding.cc index 69c94384c35..870de837061 100644 --- a/bench/bankers-rounding.cc +++ b/bench/bankers-rounding.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/batch-matrix-multiply.cc b/bench/batch-matrix-multiply.cc index e45eccd46e8..b09cdebc34b 100644 --- a/bench/batch-matrix-multiply.cc +++ b/bench/batch-matrix-multiply.cc @@ -15,7 +15,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffers.h" diff --git a/bench/bf16-gemm.cc b/bench/bf16-gemm.cc index 98289df55b5..01ef4a8f475 100644 --- a/bench/bf16-gemm.cc +++ b/bench/bf16-gemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/ceiling.cc b/bench/ceiling.cc index a377ae9289a..da08f7bb6d7 100644 --- a/bench/ceiling.cc +++ b/bench/ceiling.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/channel-shuffle.cc b/bench/channel-shuffle.cc index 61d138849c2..0a9f820ba3a 100644 --- a/bench/channel-shuffle.cc +++ b/bench/channel-shuffle.cc @@ -16,7 +16,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" diff --git a/bench/convert.cc b/bench/convert.cc index 77e6f2f7558..7c1725dc875 100644 --- a/bench/convert.cc +++ b/bench/convert.cc @@ -3,12 +3,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffers.h" @@ -33,9 +33,7 @@ void xnnpack_convert_f32_qs8(benchmark::State& state) { benchmark_unary_operator( [](uint32_t flags, xnn_operator_t* op) { return xnn_create_convert_nc_f32_qs8( - 1.0f / 128.0f /* scale */, 1 /* zero point */, - std::numeric_limits::min(), - std::numeric_limits::max(), flags, op); + 1.0f / 128.0f /* scale */, 1 /* zero point */, flags, op); }, xnn_reshape_convert_nc_f32_qs8, xnn_setup_convert_nc_f32_qs8, state); } @@ -44,9 +42,7 @@ void xnnpack_convert_f32_qu8(benchmark::State& state) { benchmark_unary_operator( [](uint32_t flags, xnn_operator_t* op) { return xnn_create_convert_nc_f32_qu8( - 1.0f / 128.0f /* scale */, 127 /* zero point */, - std::numeric_limits::min(), - std::numeric_limits::max(), flags, op); + 1.0f / 128.0f /* scale */, 127 /* zero point */, flags, op); }, xnn_reshape_convert_nc_f32_qu8, xnn_setup_convert_nc_f32_qu8, state); } diff --git a/bench/convolution.cc b/bench/convolution.cc index 0011ed3545c..f6670b91351 100644 --- a/bench/convolution.cc +++ b/bench/convolution.cc @@ -28,7 +28,7 @@ #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_convolution_qu8(benchmark::State& state, const char* net) { diff --git a/bench/deconvolution.cc b/bench/deconvolution.cc index 63e3f02b78e..3fb970c3149 100644 --- a/bench/deconvolution.cc +++ b/bench/deconvolution.cc @@ -25,7 +25,7 @@ #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE */ -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) { diff --git a/bench/elu.cc b/bench/elu.cc index d2823c97e2b..7912502bc91 100644 --- a/bench/elu.cc +++ b/bench/elu.cc @@ -3,13 +3,13 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffer_builder.h" diff --git a/bench/f16-conv-hwc2chw.cc b/bench/f16-conv-hwc2chw.cc index 32ace527234..693871262bf 100644 --- a/bench/f16-conv-hwc2chw.cc +++ b/bench/f16-conv-hwc2chw.cc @@ -10,8 +10,8 @@ #include #include -#include "bench/dconv.h" -#include "bench/utils.h" +#include "dconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/conv.h" diff --git a/bench/f16-dwconv.cc b/bench/f16-dwconv.cc index ad08b6929e0..9733466d699 100644 --- a/bench/f16-dwconv.cc +++ b/bench/f16-dwconv.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" diff --git a/bench/f16-dwconv2d-chw.cc b/bench/f16-dwconv2d-chw.cc index 60d819f4f8b..a9f90ae24b5 100644 --- a/bench/f16-dwconv2d-chw.cc +++ b/bench/f16-dwconv2d-chw.cc @@ -10,8 +10,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" diff --git a/bench/f16-f32-vcvt.cc b/bench/f16-f32-vcvt.cc index 7eca850e69f..07dacd35854 100644 --- a/bench/f16-f32-vcvt.cc +++ b/bench/f16-f32-vcvt.cc @@ -3,16 +3,17 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/vcvt.h" +#include static void f16_f32_vcvt( benchmark::State& state, @@ -28,7 +29,7 @@ static void f16_f32_vcvt( BENCHMARK_CAPTURE(f16_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-f32-vcvt/f16-f32-vcvt.h" +#include "f16-f32-vcvt/f16-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f16-f32acc-gemm.cc b/bench/f16-f32acc-gemm.cc index f81faef4c27..3678fcadf0e 100644 --- a/bench/f16-f32acc-gemm.cc +++ b/bench/f16-f32acc-gemm.cc @@ -14,8 +14,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f16-f32acc-igemm.cc b/bench/f16-f32acc-igemm.cc index 0138241f467..5377556af6e 100644 --- a/bench/f16-f32acc-igemm.cc +++ b/bench/f16-f32acc-igemm.cc @@ -10,8 +10,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/igemm.h" diff --git a/bench/f16-f32acc-rdsum.cc b/bench/f16-f32acc-rdsum.cc index e9d7734bee4..367e9a7e766 100644 --- a/bench/f16-f32acc-rdsum.cc +++ b/bench/f16-f32acc-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/f16-f32acc-rdsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" 
+#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f16-f32acc-rsum.cc b/bench/f16-f32acc-rsum.cc index aa4f3411b5b..f087668d615 100644 --- a/bench/f16-f32acc-rsum.cc +++ b/bench/f16-f32acc-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/f16-f32acc-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f16-gavgpool-cw.cc b/bench/f16-gavgpool-cw.cc index 2091dbf7879..b8913fffb10 100644 --- a/bench/f16-gavgpool-cw.cc +++ b/bench/f16-gavgpool-cw.cc @@ -9,16 +9,15 @@ #include #include -#include "bench/utils.h" -#include - +#include "utils.h" #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" #include "xnnpack/gavgpool.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" - +#include void f16_gavgpool_cw( benchmark::State& state, diff --git a/bench/f16-gemm-minmax.cc b/bench/f16-gemm-minmax.cc index c0b491ec36c..c1834ba6117 100644 --- a/bench/f16-gemm-minmax.cc +++ b/bench/f16-gemm-minmax.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/f16-gemm.cc b/bench/f16-gemm.cc index 15196e54cc9..53de4dcd497 100644 --- a/bench/f16-gemm.cc +++ b/bench/f16-gemm.cc @@ -13,8 +13,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f16-igemm.cc b/bench/f16-igemm.cc index abed264c5a8..8269b73165f 100644 --- a/bench/f16-igemm.cc +++ b/bench/f16-igemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/igemm.h" diff --git a/bench/f16-qs8-vcvt.cc b/bench/f16-qs8-vcvt.cc index 0727b2eab6b..71f00b7f807 100644 --- a/bench/f16-qs8-vcvt.cc +++ b/bench/f16-qs8-vcvt.cc @@ -7,8 +7,8 @@ #include #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -27,9 +27,7 @@ static void f16_qs8_vcvt( xnn_f16_qs8_cvt_params params; init_params(¶ms, 1.0f /* scale */, - 1 /* output zero point */, - std::numeric_limits::min() + 1 /* output min */, - std::numeric_limits::max() - 1 /* output max */); + 1 /* output zero point */); cvt_benchmark(state, arch_flags, cvt, ¶ms); } @@ -39,7 +37,7 @@ static void f16_qs8_vcvt( BENCHMARK_CAPTURE(f16_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-qs8-vcvt/f16-qs8-vcvt.h" +#include "f16-qs8-vcvt/f16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f16-raddstoreexpminusmax.cc b/bench/f16-raddstoreexpminusmax.cc index 407cad1ee3a..f3cf127c743 100644 --- a/bench/f16-raddstoreexpminusmax.cc +++ b/bench/f16-raddstoreexpminusmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" diff --git a/bench/f16-rmax.cc b/bench/f16-rmax.cc index 
4343c6d6386..a2d55817ecf 100644 --- a/bench/f16-rmax.cc +++ b/bench/f16-rmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" @@ -128,7 +128,7 @@ static void f16_rmax( ->UseRealTime(); #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rmax, avx512skx_u16, xnn_f16_rmax_ukernel__avx512skx_u16, /*init_params=*/nullptr, @@ -159,7 +159,9 @@ static void f16_rmax( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_rmax, f16c_u32, xnn_f16_rmax_ukernel__f16c_u32, /*init_params=*/nullptr, diff --git a/bench/f16-rmin.cc b/bench/f16-rmin.cc index 09d14b09553..87888553d5b 100644 --- a/bench/f16-rmin.cc +++ b/bench/f16-rmin.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" @@ -128,7 +128,7 @@ static void f16_rmin( ->UseRealTime(); #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rmin, avx512skx_u16, xnn_f16_rmin_ukernel__avx512skx_u16, /*init_params=*/nullptr, @@ -159,7 +159,7 @@ static void f16_rmin( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rmin, scalar_u1, xnn_f16_rmin_ukernel__scalar_u1) diff --git a/bench/f16-rminmax.cc b/bench/f16-rminmax.cc index 1d931d6de88..32866e1df62 100644 --- a/bench/f16-rminmax.cc +++ b/bench/f16-rminmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" @@ -128,7 +128,7 @@ static void f16_rminmax( ->UseRealTime(); #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rminmax, avx512skx_u16, xnn_f16_rminmax_ukernel__avx512skx_u16, /*init_params=*/nullptr, @@ -159,7 +159,7 @@ static void f16_rminmax( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rminmax, scalar_u1, xnn_f16_rminmax_ukernel__scalar_u1) diff --git a/bench/f16-rsum.cc b/bench/f16-rsum.cc index 230016a5cfe..89fcafcf996 100644 --- a/bench/f16-rsum.cc +++ b/bench/f16-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/f16-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f16-spmm.cc b/bench/f16-spmm.cc index dfe95839b79..a9cc6ae6d36 100644 --- a/bench/f16-spmm.cc +++ b/bench/f16-spmm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/spmm.h" -#include "bench/utils.h" +#include "spmm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" 
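Note on the pattern applied above (f16-rmax, f16-rmin, f16-rminmax) and in the f32-dwconv, f32-gemm, f32-igemm, f32-qc4w-gemm and f32-qc8w-gemm hunks below: AVX512 benchmark registrations are moved under the matching XNN_ENABLE_* preprocessor guard, so they are compiled only when the corresponding kernels are enabled in the build. A minimal sketch of the guard, with the flag names taken from the build changes earlier in this patch (illustrative only, not part of the patch itself):

    // XNN_ARCH_* macros come from xnnpack/common.h; XNN_ENABLE_AVX512SKX is injected by the
    // build (ADD_COMPILE_DEFINITIONS in CMake, the config_setting/define machinery in Bazel).
    #if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
    // ... AVX512SKX benchmark registrations; this block is skipped entirely when building
    // with -DXNNPACK_ENABLE_AVX512SKX=OFF (CMake) or --define=xnn_enable_avx512skx=false (Bazel).
    #endif  // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)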
diff --git a/bench/f16-vcmul.cc b/bench/f16-vcmul.cc index f72a0473818..26ef075c606 100644 --- a/bench/f16-vcmul.cc +++ b/bench/f16-vcmul.cc @@ -10,14 +10,15 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/vbinary.h" -#include "xnnpack/buffer.h" #include static void f16_vcmul(benchmark::State& state, uint64_t arch_flags, @@ -72,7 +73,7 @@ static void f16_vcmul(benchmark::State& state, uint64_t arch_flags, benchmark::utils::BinaryElementwiseParameters, \ std::complex>) \ ->UseRealTime(); -#include "src/f16-vbinary/f16-vcmul.h" +#include "f16-vbinary/f16-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/f32-bgemm.cc b/bench/f32-bgemm.cc index 2c210d00bf2..f078bca6aae 100644 --- a/bench/f32-bgemm.cc +++ b/bench/f32-bgemm.cc @@ -17,8 +17,8 @@ #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY -#include "bench/bgemm.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f32-conv-hwc.cc b/bench/f32-conv-hwc.cc index 07048f96e23..86aad770c86 100644 --- a/bench/f32-conv-hwc.cc +++ b/bench/f32-conv-hwc.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dconv.h" -#include "bench/utils.h" +#include "dconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/conv.h" diff --git a/bench/f32-conv-hwc2chw.cc b/bench/f32-conv-hwc2chw.cc index d77c8cbab6f..ddeabbf47df 100644 --- a/bench/f32-conv-hwc2chw.cc +++ b/bench/f32-conv-hwc2chw.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dconv.h" -#include "bench/utils.h" +#include "dconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/conv.h" diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc index 6ab991c2a88..fa21809f074 100644 --- a/bench/f32-dwconv.cc +++ b/bench/f32-dwconv.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" @@ -825,9 +825,69 @@ static void f32_dwconv( BENCHMARK_DWCONV(f32_dwconv_8f8m9l4c4s4r__neon_acc2) BENCHMARK_DWCONV(f32_dwconv_8f8m9l8c4s4r__neon) BENCHMARK_DWCONV(f32_dwconv_8f8m9l8c4s4r__neon_acc2) - #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + static void f32_dwconv_25p16c__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f, + xnn_init_f32_minmax_scalar_params, + 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_25p16c__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2, + xnn_init_f32_minmax_scalar_params, + 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_25p32c__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f, + xnn_init_f32_minmax_scalar_params, + 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + static void 
f32_dwconv_25p32c__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2, + xnn_init_f32_minmax_scalar_params, + 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + + static void f32_dwconv_5f5m5l16c16s1r__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_5f5m5l16c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_5f5m5l32c16s1r__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_5f5m5l32c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + + BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f) + BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f_acc2) + BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f) + BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f_acc2) + + BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f) + BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f_acc2) + BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f) + BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f_acc2) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_dwconv_4p4c__sse(benchmark::State& state, const char* net) { @@ -1195,56 +1255,6 @@ static void f32_dwconv( benchmark::utils::CheckFMA3); } - static void f32_dwconv_25p16c__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p16c__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p32c__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p32c__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - 
xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - - static void f32_dwconv_5f5m5l16c16s1r__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l16c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l32c16s1r__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l32c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - BENCHMARK_DWCONV(f32_dwconv_4p4c__sse) BENCHMARK_DWCONV(f32_dwconv_9p4c__sse) BENCHMARK_DWCONV(f32_dwconv_25p4c__sse) @@ -1308,19 +1318,8 @@ static void f32_dwconv( BENCHMARK_DWCONV(f32_dwconv_7f6m6l16c8s4r__fma3_acc2) BENCHMARK_DWCONV(f32_dwconv_7f6m6l32c8s4r__fma3) BENCHMARK_DWCONV(f32_dwconv_7f6m6l32c8s4r__fma3_acc2) - - BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f) - BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f_acc2) - BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f) - BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f_acc2) - - BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f) - BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f_acc2) - BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f) - BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f_acc2) #endif // XNN_ARCH_X88 || XNN_ARCH_X86_64 - #if XNN_ARCH_WASM static void f32_dwconv_9p1c__wasm(benchmark::State& state, const char* net) { f32_dwconv(state, diff --git a/bench/f32-dwconv2d-chw.cc b/bench/f32-dwconv2d-chw.cc index bbfa543cfc4..598efe15c3d 100644 --- a/bench/f32-dwconv2d-chw.cc +++ b/bench/f32-dwconv2d-chw.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" diff --git a/bench/f32-f16-vcvt.cc b/bench/f32-f16-vcvt.cc index b9c4ca8b903..b5a16001e87 100644 --- a/bench/f32-f16-vcvt.cc +++ b/bench/f32-f16-vcvt.cc @@ -3,16 +3,17 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/vcvt.h" +#include static void f32_f16_vcvt( benchmark::State& state, @@ -28,7 +29,7 @@ static void f32_f16_vcvt( BENCHMARK_CAPTURE(f32_f16_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f32-f16-vcvt/f32-f16-vcvt.h" +#include "f32-f16-vcvt/f32-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f32-gavgpool-cw.cc b/bench/f32-gavgpool-cw.cc index e51a74fba9a..fd19411217b 100644 --- a/bench/f32-gavgpool-cw.cc +++ b/bench/f32-gavgpool-cw.cc @@ -10,7 +10,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f32-gemm-goi-minmax.cc b/bench/f32-gemm-goi-minmax.cc index 20ff18bfce6..acfa1d80780 100644 --- a/bench/f32-gemm-goi-minmax.cc +++ b/bench/f32-gemm-goi-minmax.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/f32-gemm-minmax.cc b/bench/f32-gemm-minmax.cc index 2cdf495a69d..c625c166777 100644 --- a/bench/f32-gemm-minmax.cc +++ b/bench/f32-gemm-minmax.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc index b63abaa3903..ba5d3be4803 100644 --- a/bench/f32-gemm.cc +++ b/bench/f32-gemm.cc @@ -20,8 +20,8 @@ #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -1306,7 +1306,7 @@ static void ruy_st(benchmark::State& state, const char* net) BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, @@ -1350,6 +1350,15 @@ static void ruy_st(benchmark::State& state, const char* net) benchmark::utils::CheckAVX512F); } + BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, @@ -1638,13 +1647,6 @@ static void ruy_st(benchmark::State& state, const char* net) /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1); } 
- BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast) BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast) BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast) diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc index 2cf8214746c..8a67ac74811 100644 --- a/bench/f32-igemm.cc +++ b/bench/f32-igemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/igemm.h" #include "xnnpack/indirection.h" @@ -682,6 +682,56 @@ static void f32_igemm(benchmark::State& state, BENCHMARK_CONV(f32_igemm_8x8s4__neonfma) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/7, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) { @@ -904,48 +954,6 @@ static void f32_igemm(benchmark::State& state, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1, benchmark::utils::CheckFMA3); } - static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, - 
xnn_init_f32_minmax_scalar_params, - /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/7, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } BENCHMARK_CONV(f32_igemm_1x8__sse_load1) BENCHMARK_CONV(f32_igemm_3x8__sse_load1) @@ -980,12 +988,6 @@ static void f32_igemm(benchmark::State& state, BENCHMARK_CONV(f32_igemm_6x16__fma3_broadcast) BENCHMARK_CONV(f32_igemm_5x16__fma3_broadcast_prfm) BENCHMARK_CONV(f32_igemm_6x16__fma3_broadcast_prfm) - BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/bench/f32-im2col-gemm.cc b/bench/f32-im2col-gemm.cc index 864893a60f6..554f7ffbde6 100644 --- a/bench/f32-im2col-gemm.cc +++ b/bench/f32-im2col-gemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f32-qc4w-gemm.cc b/bench/f32-qc4w-gemm.cc index 70d76a784b1..cb302351efd 100644 --- a/bench/f32-qc4w-gemm.cc +++ b/bench/f32-qc4w-gemm.cc @@ -12,8 +12,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -395,6 +395,74 @@ static void GEMMBenchmark(benchmark::State& state, BENCHMARK_GEMM(f32_qc4w_gemm_6x8__neonfma_dup_ld64) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + static void f32_qc4w_gemm_1x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/1, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void 
f32_qc4w_gemm_2x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_2x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/2, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_3x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_3x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/3, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_4x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_4x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/4, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_5x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_5x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/5, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_6x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_6x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/6, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_7x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/7, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_8x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_8x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/8, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + + BENCHMARK_GEMM(f32_qc4w_gemm_1x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_2x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_3x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_4x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_5x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_6x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_7x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_8x32__avx512skx_broadcast) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_qc4w_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, @@ -566,62 +634,7 @@ static void GEMMBenchmark(benchmark::State& state, /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, benchmark::utils::CheckAVX2); } - static void f32_qc4w_gemm_1x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/1, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_2x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_2x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/2, /*nr=*/32, /*kr=*/1, 
/*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_3x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_3x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/3, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_4x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_4x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/4, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_5x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_5x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/5, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_6x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_6x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/6, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_7x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/7, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_8x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_8x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/8, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } + static void f32_qc4w_gemm_1x8__sse41_dup(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__sse41_dup, @@ -658,14 +671,6 @@ static void GEMMBenchmark(benchmark::State& state, benchmark::utils::CheckSSE41); } - BENCHMARK_GEMM(f32_qc4w_gemm_1x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_2x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_3x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_4x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_5x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_6x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_7x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_8x32__avx512skx_broadcast) BENCHMARK_GEMM(f32_qc4w_gemm_1x16__avx2_broadcast) BENCHMARK_GEMM(f32_qc4w_gemm_2x16__avx2_broadcast) BENCHMARK_GEMM(f32_qc4w_gemm_3x16__avx2_broadcast) diff --git a/bench/f32-qc8w-gemm.cc b/bench/f32-qc8w-gemm.cc index 97d34cf9301..76486933043 100644 --- a/bench/f32-qc8w-gemm.cc +++ b/bench/f32-qc8w-gemm.cc @@ -16,8 +16,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -470,7 +470,7 @@ static void GEMMBenchmark(benchmark::State& state, BENCHMARK_GEMM(f32_qc8w_gemm_6x8s4__neonfma) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void f32_qc8w_gemm_1x32__avx512skx_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, 
xnn_f32_qc8w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, @@ -583,6 +583,27 @@ static void GEMMBenchmark(benchmark::State& state, /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, benchmark::utils::CheckAVX512SKX); } + + BENCHMARK_GEMM(f32_qc8w_gemm_1x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_2x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_3x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_4x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_5x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_6x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_7x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_8x32__avx512skx_broadcast) + + BENCHMARK_GEMM(f32_qc8w_gemm_1x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_2x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_3x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_4x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_5x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_6x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_7x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_8x16__avx512skx_broadcast) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_qc8w_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__avx2_broadcast, @@ -941,24 +962,6 @@ static void GEMMBenchmark(benchmark::State& state, benchmark::utils::CheckSSE41); } - BENCHMARK_GEMM(f32_qc8w_gemm_1x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_2x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_3x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_4x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_5x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_6x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_7x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_8x32__avx512skx_broadcast) - - BENCHMARK_GEMM(f32_qc8w_gemm_1x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_2x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_3x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_4x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_5x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_6x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_7x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_8x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_1x8__avx2_broadcast) BENCHMARK_GEMM(f32_qc8w_gemm_4x8__avx2_broadcast) BENCHMARK_GEMM(f32_qc8w_gemm_5x8__avx2_broadcast) diff --git a/bench/f32-qs8-vcvt.cc b/bench/f32-qs8-vcvt.cc index 4e05c0a1782..e622edb2a09 100644 --- a/bench/f32-qs8-vcvt.cc +++ b/bench/f32-qs8-vcvt.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/vcvt-benchmark.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -21,9 +21,7 @@ static void f32_qs8_vcvt( xnn_f32_qs8_cvt_params params; init_params(¶ms, 25.0f /* scale */, - 1 /* output zero point */, - std::numeric_limits::min() + 1 /* output min */, - std::numeric_limits::max() - 1 /* output max */); + 1 /* output zero point */); cvt_benchmark(state, arch_flags, cvt, ¶ms); } @@ -33,7 +31,7 @@ static void f32_qs8_vcvt( BENCHMARK_CAPTURE(f32_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f32-qs8-vcvt/f32-qs8-vcvt.h" +#include "f32-qs8-vcvt/f32-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f32-qu8-vcvt.cc b/bench/f32-qu8-vcvt.cc index 977995befd4..7110d9f1443 100644 --- a/bench/f32-qu8-vcvt.cc +++ b/bench/f32-qu8-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -22,9 +22,7 @@ static void f32_qu8_vcvt( xnn_f32_qu8_cvt_params params; init_params(¶ms, 25.0f /* scale */, - 127 /* output zero point */, - std::numeric_limits::min() + 1 /* output min */, - std::numeric_limits::max() - 1 /* output max */); + 127 /* output zero point */); cvt_benchmark(state, arch_flags, cvt, ¶ms); } @@ -34,7 +32,7 @@ static void f32_qu8_vcvt( BENCHMARK_CAPTURE(f32_qu8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f32-qu8-vcvt/f32-qu8-vcvt.h" +#include "f32-qu8-vcvt/f32-qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f32-raddexpminusmax.cc b/bench/f32-raddexpminusmax.cc index 3622332f4eb..76dd491a1f6 100644 --- a/bench/f32-raddexpminusmax.cc +++ b/bench/f32-raddexpminusmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -79,7 +79,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_raddexpminusmax, avx512f_p5_scalef_u64, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u64, @@ -144,7 +144,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u192_acc6, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_raddexpminusmax, avx2_p5_u32, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx2_p5_u32, diff --git a/bench/f32-raddextexp.cc b/bench/f32-raddextexp.cc index c422f1a8c3b..c7c9bb8a609 100644 --- a/bench/f32-raddextexp.cc +++ b/bench/f32-raddextexp.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -73,7 +73,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && 
(XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_raddextexp, avx512f_p5_scalef_u128, xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u128, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); @@ -113,7 +113,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { BENCHMARK_CAPTURE(f32_raddextexp, avx512f_p5_scalef_u192_acc6, xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u192_acc6, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_raddextexp, avx2_p5_u64, xnn_f32_raddextexp_ukernel__avx2_p5_u64, benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime(); diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc index fd64bc3fdf4..06270debda1 100644 --- a/bench/f32-raddstoreexpminusmax.cc +++ b/bench/f32-raddstoreexpminusmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -249,7 +249,7 @@ static void f32_raddstoreexpminusmax( #endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u16, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16, @@ -307,7 +307,9 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u8, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, diff --git a/bench/f32-rdsum.cc b/bench/f32-rdsum.cc index 188c0411ad2..4bc890ff208 100644 --- a/bench/f32-rdsum.cc +++ b/bench/f32-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/f32-rdsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f32-rmax.cc b/bench/f32-rmax.cc index a5f58e0eb18..a4175917931 100644 --- a/bench/f32-rmax.cc +++ b/bench/f32-rmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -61,7 +61,7 @@ static void f32_rmax( benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_rmax, avx512f_u16, xnn_f32_rmax_ukernel__avx512f_u16, /*init_params=*/nullptr, @@ -92,7 +92,9 @@ static void f32_rmax( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rmax, avx_u8, xnn_f32_rmax_ukernel__avx_u8, /*init_params=*/nullptr, diff --git a/bench/f32-rmin.cc b/bench/f32-rmin.cc index c666f5cf30d..acba077e41a 100644 --- a/bench/f32-rmin.cc +++ b/bench/f32-rmin.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" 
#include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -61,7 +61,7 @@ static void f32_rmin( benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_rmin, avx512f_u16, xnn_f32_rmin_ukernel__avx512f_u16, /*init_params=*/nullptr, @@ -92,7 +92,9 @@ static void f32_rmin( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rmin, avx_u8, xnn_f32_rmin_ukernel__avx_u8, /*init_params=*/nullptr, diff --git a/bench/f32-rminmax.cc b/bench/f32-rminmax.cc index 4fd4dfb10ee..0ab42c83dbb 100644 --- a/bench/f32-rminmax.cc +++ b/bench/f32-rminmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -61,7 +61,7 @@ static void f32_rminmax( benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_rminmax, avx512f_u16, xnn_f32_rminmax_ukernel__avx512f_u16, /*init_params=*/nullptr, @@ -92,7 +92,9 @@ static void f32_rminmax( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rminmax, avx_u8, xnn_f32_rminmax_ukernel__avx_u8, /*init_params=*/nullptr, diff --git a/bench/f32-rsum.cc b/bench/f32-rsum.cc index 6b654c28b73..b4fcb5a70de 100644 --- a/bench/f32-rsum.cc +++ b/bench/f32-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/f32-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f32-softmax.cc b/bench/f32-softmax.cc index 54dd1bbb2d2..f92ec8890ed 100644 --- a/bench/f32-softmax.cc +++ b/bench/f32-softmax.cc @@ -10,7 +10,7 @@ #ifdef BENCHMARK_INTEL_DNNL #include #endif // BENCHMARK_INTEL_DNNL -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" @@ -419,6 +419,26 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { BENCHMARK(DNNLSoftArgMax)->Apply(CharacteristicArguments)->UseManualTime(); #endif +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef, + xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u144_acc3, + xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u16, + benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); + BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef, + xnn_f32_rmax_ukernel__avx512f_u64_acc4, + (xnn_init_f32_default_params_fn) nullptr, + xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc4, + xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_u16, + benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); + BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef, + xnn_f32_rmax_ukernel__avx512f_u64_acc4, + (xnn_init_f32_default_params_fn) nullptr, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, + nullptr, + 
xnn_f32_vmulc_ukernel__avx512f_u32, + benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_p5, xnn_f32_raddextexp_ukernel__avx2_p5_u96, @@ -437,24 +457,6 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { nullptr, xnn_f32_vmulc_ukernel__avx_u16, benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime(); - - BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef, - xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u144_acc3, - xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u16, - benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); - BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef, - xnn_f32_rmax_ukernel__avx512f_u64_acc4, - (xnn_init_f32_default_params_fn) nullptr, - xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc4, - xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_u16, - benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); - BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef, - xnn_f32_rmax_ukernel__avx512f_u64_acc4, - (xnn_init_f32_default_params_fn) nullptr, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, - nullptr, - xnn_f32_vmulc_ukernel__avx512f_u32, - benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV diff --git a/bench/f32-spmm.cc b/bench/f32-spmm.cc index 40da2d0dcf3..f802e9d3ad2 100644 --- a/bench/f32-spmm.cc +++ b/bench/f32-spmm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-spmm-test.py #include -#include "bench/spmm-benchmark.h" -#include "bench/utils.h" +#include "spmm-benchmark.h" +#include "utils.h" #include "xnnpack/gemm.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" diff --git a/bench/f32-vcmul.cc b/bench/f32-vcmul.cc index 7d680a4f4ea..37983b27e40 100644 --- a/bench/f32-vcmul.cc +++ b/bench/f32-vcmul.cc @@ -10,7 +10,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -71,7 +71,7 @@ static void f32_vcmul(benchmark::State& state, uint64_t arch_flags, benchmark::utils::BinaryElementwiseParameters, \ std::complex>) \ ->UseRealTime(); -#include "src/f32-vbinary/f32-vcmul.h" +#include "f32-vbinary/f32-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/f32-vscaleexpminusmax.cc b/bench/f32-vscaleexpminusmax.cc index cb3c68d1900..928d37d997b 100644 --- a/bench/f32-vscaleexpminusmax.cc +++ b/bench/f32-vscaleexpminusmax.cc @@ -6,7 +6,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -80,7 +80,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_u16, xnn_f32_rmax_ukernel__avx512f_u64_acc4, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc2, @@ -141,7 +141,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc2, xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_u192, 
benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_u8, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx2_p5_u80_acc2, diff --git a/bench/f32-vscaleextexp.cc b/bench/f32-vscaleextexp.cc index ca51d3a9682..620c82a2249 100644 --- a/bench/f32-vscaleextexp.cc +++ b/bench/f32-vscaleextexp.cc @@ -6,7 +6,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -78,7 +78,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_u16, xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u128_acc2, xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u16, @@ -127,7 +127,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u128_acc2, xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u192, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_u8, xnn_f32_raddextexp_ukernel__avx2_p5_u80_acc2, xnn_f32_vscaleextexp_ukernel__avx2_p5_u8, diff --git a/bench/floor.cc b/bench/floor.cc index b7e7f76a41e..06752a0b85c 100644 --- a/bench/floor.cc +++ b/bench/floor.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/fully-connected.cc b/bench/fully-connected.cc index c1b4f0bc23d..9e6c67dd1ac 100644 --- a/bench/fully-connected.cc +++ b/bench/fully-connected.cc @@ -18,7 +18,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_fully_connected_f32(benchmark::State& state, const char* net) { diff --git a/bench/gemm-benchmark.cc b/bench/gemm-benchmark.cc index dee22586271..ab9f30562c8 100644 --- a/bench/gemm-benchmark.cc +++ b/bench/gemm-benchmark.cc @@ -12,7 +12,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/config-types.h" diff --git a/bench/gemm-benchmark.h b/bench/gemm-benchmark.h index 9924be2f3df..ea83c5d7d8d 100644 --- a/bench/gemm-benchmark.h +++ b/bench/gemm-benchmark.h @@ -16,8 +16,8 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.h" #endif // XNN_ENABLE_KLEIDIAI -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include void GEMMBenchmark(benchmark::State& state, xnn_qs8_gemm_minmax_ukernel_fn gemm, diff --git a/bench/global-average-pooling.cc b/bench/global-average-pooling.cc index 03949a986ab..a9adbba20ff 100644 --- a/bench/global-average-pooling.cc +++ b/bench/global-average-pooling.cc @@ -20,7 +20,7 @@ #include "xnnpack/math.h" #include -#include "bench/utils.h" +#include "utils.h" static void global_average_pooling_qu8(benchmark::State& state) { const size_t batch_size = state.range(0); diff --git a/bench/hardswish.cc b/bench/hardswish.cc index 35a7f100d3e..a666346f473 100644 --- a/bench/hardswish.cc +++ b/bench/hardswish.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/leaky-relu.cc b/bench/leaky-relu.cc index 70ac545e647..4c419286848 100644 --- a/bench/leaky-relu.cc +++ b/bench/leaky-relu.cc @@ -9,7 +9,7 @@ #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE diff --git a/bench/max-pooling.cc b/bench/max-pooling.cc index c281d5dc1a1..9e3ee9117c4 100644 --- a/bench/max-pooling.cc +++ b/bench/max-pooling.cc @@ -15,7 +15,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/buffer.h" #include diff --git a/bench/models/benchmark.cc b/bench/models/benchmark.cc index bed5507e6fe..f2e7b4436af 100644 --- a/bench/models/benchmark.cc +++ b/bench/models/benchmark.cc @@ -15,7 +15,7 @@ #include #include "models.h" -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/allocator.h" #include "xnnpack/subgraph.h" diff --git a/bench/models/qs8-mobilenet-v2.cc b/bench/models/qs8-mobilenet-v2.cc index e2aea9b6965..ae2aac57240 100644 --- a/bench/models/qs8-mobilenet-v2.cc +++ b/bench/models/qs8-mobilenet-v2.cc @@ -972,7 +972,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w67_data; uint32_t w67 = XNN_INVALID_VALUE_ID; std::array w67_dims = {{32, 3, 3, 3}}; - std::array w67_scale; + static std::array w67_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w67_scale.begin(), w67_scale.end(), std::ref(scalerng)); @@ -991,7 +991,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w68_data; uint32_t w68 = XNN_INVALID_VALUE_ID; std::array w68_dims = {{32}}; - std::array w68_scale; + static std::array w68_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w68_scale.begin(), w68_scale.end(), std::ref(scalerng)); @@ -1010,7 +1010,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w69_data; uint32_t w69 = XNN_INVALID_VALUE_ID; std::array w69_dims = {{1, 3, 3, 32}}; - std::array w69_scale; + static std::array w69_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w69_scale.begin(), w69_scale.end(), std::ref(scalerng)); @@ -1029,7 +1029,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w70_data; uint32_t w70 = XNN_INVALID_VALUE_ID; std::array w70_dims = {{32}}; - std::array w70_scale; + static std::array w70_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w70_scale.begin(), w70_scale.end(), std::ref(scalerng)); @@ -1048,7 +1048,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w71_data; uint32_t w71 = XNN_INVALID_VALUE_ID; std::array w71_dims = {{16, 1, 1, 32}}; - std::array w71_scale; + static std::array w71_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w71_scale.begin(), w71_scale.end(), std::ref(scalerng)); @@ -1067,7 +1067,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w72_data; uint32_t w72 = XNN_INVALID_VALUE_ID; std::array w72_dims = {{16}}; - std::array w72_scale; + static std::array w72_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w72_scale.begin(), w72_scale.end(), std::ref(scalerng)); @@ -1086,7 +1086,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w73_data; uint32_t w73 = XNN_INVALID_VALUE_ID; std::array w73_dims = {{96, 1, 1, 16}}; - std::array w73_scale; + static std::array w73_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w73_scale.begin(), w73_scale.end(), std::ref(scalerng)); @@ -1105,7 +1105,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w74_data; uint32_t w74 = XNN_INVALID_VALUE_ID; std::array w74_dims = {{96}}; - std::array w74_scale; + static std::array w74_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w74_scale.begin(), w74_scale.end(), std::ref(scalerng)); @@ -1124,7 +1124,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w75_data; uint32_t w75 = XNN_INVALID_VALUE_ID; std::array w75_dims = {{1, 3, 3, 96}}; - std::array w75_scale; + static std::array w75_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w75_scale.begin(), w75_scale.end(), std::ref(scalerng)); @@ -1143,7 +1143,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w76_data; uint32_t w76 = XNN_INVALID_VALUE_ID; std::array w76_dims = {{96}}; - std::array w76_scale; + static std::array w76_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w76_scale.begin(), w76_scale.end(), std::ref(scalerng)); @@ -1162,7 +1162,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w77_data; uint32_t w77 = XNN_INVALID_VALUE_ID; std::array w77_dims = {{24, 1, 1, 96}}; - std::array w77_scale; + static std::array w77_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w77_scale.begin(), w77_scale.end(), std::ref(scalerng)); @@ -1181,7 +1181,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w78_data; uint32_t w78 = XNN_INVALID_VALUE_ID; std::array w78_dims = {{24}}; - std::array w78_scale; + static std::array w78_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w78_scale.begin(), w78_scale.end(), std::ref(scalerng)); @@ -1200,7 +1200,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w79_data; uint32_t w79 = XNN_INVALID_VALUE_ID; std::array w79_dims = {{144, 1, 1, 24}}; - std::array w79_scale; + static std::array w79_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w79_scale.begin(), w79_scale.end(), std::ref(scalerng)); @@ -1219,7 +1219,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w80_data; uint32_t w80 = XNN_INVALID_VALUE_ID; std::array w80_dims = {{144}}; - std::array w80_scale; + static std::array w80_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w80_scale.begin(), w80_scale.end(), std::ref(scalerng)); @@ -1238,7 +1238,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w81_data; uint32_t w81 = XNN_INVALID_VALUE_ID; std::array w81_dims = {{1, 3, 3, 144}}; - std::array w81_scale; + static std::array w81_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); 
std::generate(w81_scale.begin(), w81_scale.end(), std::ref(scalerng)); @@ -1257,7 +1257,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w82_data; uint32_t w82 = XNN_INVALID_VALUE_ID; std::array w82_dims = {{144}}; - std::array w82_scale; + static std::array w82_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w82_scale.begin(), w82_scale.end(), std::ref(scalerng)); @@ -1276,7 +1276,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w83_data; uint32_t w83 = XNN_INVALID_VALUE_ID; std::array w83_dims = {{24, 1, 1, 144}}; - std::array w83_scale; + static std::array w83_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w83_scale.begin(), w83_scale.end(), std::ref(scalerng)); @@ -1295,7 +1295,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w84_data; uint32_t w84 = XNN_INVALID_VALUE_ID; std::array w84_dims = {{24}}; - std::array w84_scale; + static std::array w84_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w84_scale.begin(), w84_scale.end(), std::ref(scalerng)); @@ -1314,7 +1314,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w85_data; uint32_t w85 = XNN_INVALID_VALUE_ID; std::array w85_dims = {{144, 1, 1, 24}}; - std::array w85_scale; + static std::array w85_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w85_scale.begin(), w85_scale.end(), std::ref(scalerng)); @@ -1333,7 +1333,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w86_data; uint32_t w86 = XNN_INVALID_VALUE_ID; std::array w86_dims = {{144}}; - std::array w86_scale; + static std::array w86_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w86_scale.begin(), w86_scale.end(), std::ref(scalerng)); @@ -1352,7 +1352,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w87_data; uint32_t w87 = XNN_INVALID_VALUE_ID; std::array w87_dims = {{1, 3, 3, 144}}; - std::array w87_scale; + static std::array w87_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w87_scale.begin(), w87_scale.end(), std::ref(scalerng)); @@ -1371,7 +1371,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w88_data; uint32_t w88 = XNN_INVALID_VALUE_ID; std::array w88_dims = {{144}}; - std::array w88_scale; + static std::array w88_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w88_scale.begin(), w88_scale.end(), std::ref(scalerng)); @@ -1390,7 +1390,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w89_data; uint32_t w89 = XNN_INVALID_VALUE_ID; std::array w89_dims = {{32, 1, 1, 144}}; - std::array w89_scale; + static std::array w89_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w89_scale.begin(), w89_scale.end(), std::ref(scalerng)); @@ -1409,7 +1409,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w90_data; uint32_t w90 = XNN_INVALID_VALUE_ID; std::array w90_dims = {{32}}; - std::array w90_scale; + static std::array w90_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w90_scale.begin(), w90_scale.end(), std::ref(scalerng)); @@ -1428,7 +1428,7 @@ 
xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w91_data; uint32_t w91 = XNN_INVALID_VALUE_ID; std::array w91_dims = {{192, 1, 1, 32}}; - std::array w91_scale; + static std::array w91_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w91_scale.begin(), w91_scale.end(), std::ref(scalerng)); @@ -1447,7 +1447,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w92_data; uint32_t w92 = XNN_INVALID_VALUE_ID; std::array w92_dims = {{192}}; - std::array w92_scale; + static std::array w92_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w92_scale.begin(), w92_scale.end(), std::ref(scalerng)); @@ -1466,7 +1466,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w93_data; uint32_t w93 = XNN_INVALID_VALUE_ID; std::array w93_dims = {{1, 3, 3, 192}}; - std::array w93_scale; + static std::array w93_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w93_scale.begin(), w93_scale.end(), std::ref(scalerng)); @@ -1485,7 +1485,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w94_data; uint32_t w94 = XNN_INVALID_VALUE_ID; std::array w94_dims = {{192}}; - std::array w94_scale; + static std::array w94_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w94_scale.begin(), w94_scale.end(), std::ref(scalerng)); @@ -1504,7 +1504,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w95_data; uint32_t w95 = XNN_INVALID_VALUE_ID; std::array w95_dims = {{32, 1, 1, 192}}; - std::array w95_scale; + static std::array w95_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w95_scale.begin(), w95_scale.end(), std::ref(scalerng)); @@ -1523,7 +1523,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w96_data; uint32_t w96 = XNN_INVALID_VALUE_ID; std::array w96_dims = {{32}}; - std::array w96_scale; + static std::array w96_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w96_scale.begin(), w96_scale.end(), std::ref(scalerng)); @@ -1542,7 +1542,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w97_data; uint32_t w97 = XNN_INVALID_VALUE_ID; std::array w97_dims = {{192, 1, 1, 32}}; - std::array w97_scale; + static std::array w97_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w97_scale.begin(), w97_scale.end(), std::ref(scalerng)); @@ -1561,7 +1561,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w98_data; uint32_t w98 = XNN_INVALID_VALUE_ID; std::array w98_dims = {{192}}; - std::array w98_scale; + static std::array w98_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w98_scale.begin(), w98_scale.end(), std::ref(scalerng)); @@ -1580,7 +1580,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w99_data; uint32_t w99 = XNN_INVALID_VALUE_ID; std::array w99_dims = {{1, 3, 3, 192}}; - std::array w99_scale; + static std::array w99_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w99_scale.begin(), w99_scale.end(), std::ref(scalerng)); @@ -1599,7 +1599,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w100_data; uint32_t w100 = 
XNN_INVALID_VALUE_ID; std::array w100_dims = {{192}}; - std::array w100_scale; + static std::array w100_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w100_scale.begin(), w100_scale.end(), std::ref(scalerng)); @@ -1618,7 +1618,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w101_data; uint32_t w101 = XNN_INVALID_VALUE_ID; std::array w101_dims = {{32, 1, 1, 192}}; - std::array w101_scale; + static std::array w101_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w101_scale.begin(), w101_scale.end(), std::ref(scalerng)); @@ -1637,7 +1637,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w102_data; uint32_t w102 = XNN_INVALID_VALUE_ID; std::array w102_dims = {{32}}; - std::array w102_scale; + static std::array w102_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w102_scale.begin(), w102_scale.end(), std::ref(scalerng)); @@ -1656,7 +1656,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w103_data; uint32_t w103 = XNN_INVALID_VALUE_ID; std::array w103_dims = {{192, 1, 1, 32}}; - std::array w103_scale; + static std::array w103_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w103_scale.begin(), w103_scale.end(), std::ref(scalerng)); @@ -1675,7 +1675,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w104_data; uint32_t w104 = XNN_INVALID_VALUE_ID; std::array w104_dims = {{192}}; - std::array w104_scale; + static std::array w104_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w104_scale.begin(), w104_scale.end(), std::ref(scalerng)); @@ -1694,7 +1694,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w105_data; uint32_t w105 = XNN_INVALID_VALUE_ID; std::array w105_dims = {{1, 3, 3, 192}}; - std::array w105_scale; + static std::array w105_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w105_scale.begin(), w105_scale.end(), std::ref(scalerng)); @@ -1713,7 +1713,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w106_data; uint32_t w106 = XNN_INVALID_VALUE_ID; std::array w106_dims = {{192}}; - std::array w106_scale; + static std::array w106_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w106_scale.begin(), w106_scale.end(), std::ref(scalerng)); @@ -1732,7 +1732,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w107_data; uint32_t w107 = XNN_INVALID_VALUE_ID; std::array w107_dims = {{64, 1, 1, 192}}; - std::array w107_scale; + static std::array w107_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w107_scale.begin(), w107_scale.end(), std::ref(scalerng)); @@ -1751,7 +1751,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w108_data; uint32_t w108 = XNN_INVALID_VALUE_ID; std::array w108_dims = {{64}}; - std::array w108_scale; + static std::array w108_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w108_scale.begin(), w108_scale.end(), std::ref(scalerng)); @@ -1770,7 +1770,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w109_data; uint32_t w109 = XNN_INVALID_VALUE_ID; std::array w109_dims 
= {{384, 1, 1, 64}}; - std::array w109_scale; + static std::array w109_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w109_scale.begin(), w109_scale.end(), std::ref(scalerng)); @@ -1789,7 +1789,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w110_data; uint32_t w110 = XNN_INVALID_VALUE_ID; std::array w110_dims = {{384}}; - std::array w110_scale; + static std::array w110_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w110_scale.begin(), w110_scale.end(), std::ref(scalerng)); @@ -1808,7 +1808,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w111_data; uint32_t w111 = XNN_INVALID_VALUE_ID; std::array w111_dims = {{1, 3, 3, 384}}; - std::array w111_scale; + static std::array w111_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w111_scale.begin(), w111_scale.end(), std::ref(scalerng)); @@ -1827,7 +1827,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w112_data; uint32_t w112 = XNN_INVALID_VALUE_ID; std::array w112_dims = {{384}}; - std::array w112_scale; + static std::array w112_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w112_scale.begin(), w112_scale.end(), std::ref(scalerng)); @@ -1846,7 +1846,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w113_data; uint32_t w113 = XNN_INVALID_VALUE_ID; std::array w113_dims = {{64, 1, 1, 384}}; - std::array w113_scale; + static std::array w113_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w113_scale.begin(), w113_scale.end(), std::ref(scalerng)); @@ -1865,7 +1865,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w114_data; uint32_t w114 = XNN_INVALID_VALUE_ID; std::array w114_dims = {{64}}; - std::array w114_scale; + static std::array w114_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w114_scale.begin(), w114_scale.end(), std::ref(scalerng)); @@ -1884,7 +1884,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w115_data; uint32_t w115 = XNN_INVALID_VALUE_ID; std::array w115_dims = {{384, 1, 1, 64}}; - std::array w115_scale; + static std::array w115_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w115_scale.begin(), w115_scale.end(), std::ref(scalerng)); @@ -1903,7 +1903,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w116_data; uint32_t w116 = XNN_INVALID_VALUE_ID; std::array w116_dims = {{384}}; - std::array w116_scale; + static std::array w116_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w116_scale.begin(), w116_scale.end(), std::ref(scalerng)); @@ -1922,7 +1922,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w117_data; uint32_t w117 = XNN_INVALID_VALUE_ID; std::array w117_dims = {{1, 3, 3, 384}}; - std::array w117_scale; + static std::array w117_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w117_scale.begin(), w117_scale.end(), std::ref(scalerng)); @@ -1941,7 +1941,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w118_data; uint32_t w118 = XNN_INVALID_VALUE_ID; std::array w118_dims = {{384}}; - std::array 
w118_scale; + static std::array w118_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w118_scale.begin(), w118_scale.end(), std::ref(scalerng)); @@ -1960,7 +1960,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w119_data; uint32_t w119 = XNN_INVALID_VALUE_ID; std::array w119_dims = {{64, 1, 1, 384}}; - std::array w119_scale; + static std::array w119_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w119_scale.begin(), w119_scale.end(), std::ref(scalerng)); @@ -1979,7 +1979,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w120_data; uint32_t w120 = XNN_INVALID_VALUE_ID; std::array w120_dims = {{64}}; - std::array w120_scale; + static std::array w120_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w120_scale.begin(), w120_scale.end(), std::ref(scalerng)); @@ -1998,7 +1998,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w121_data; uint32_t w121 = XNN_INVALID_VALUE_ID; std::array w121_dims = {{384, 1, 1, 64}}; - std::array w121_scale; + static std::array w121_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w121_scale.begin(), w121_scale.end(), std::ref(scalerng)); @@ -2017,7 +2017,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w122_data; uint32_t w122 = XNN_INVALID_VALUE_ID; std::array w122_dims = {{384}}; - std::array w122_scale; + static std::array w122_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w122_scale.begin(), w122_scale.end(), std::ref(scalerng)); @@ -2036,7 +2036,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w123_data; uint32_t w123 = XNN_INVALID_VALUE_ID; std::array w123_dims = {{1, 3, 3, 384}}; - std::array w123_scale; + static std::array w123_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w123_scale.begin(), w123_scale.end(), std::ref(scalerng)); @@ -2055,7 +2055,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w124_data; uint32_t w124 = XNN_INVALID_VALUE_ID; std::array w124_dims = {{384}}; - std::array w124_scale; + static std::array w124_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w124_scale.begin(), w124_scale.end(), std::ref(scalerng)); @@ -2074,7 +2074,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w125_data; uint32_t w125 = XNN_INVALID_VALUE_ID; std::array w125_dims = {{64, 1, 1, 384}}; - std::array w125_scale; + static std::array w125_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w125_scale.begin(), w125_scale.end(), std::ref(scalerng)); @@ -2093,7 +2093,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w126_data; uint32_t w126 = XNN_INVALID_VALUE_ID; std::array w126_dims = {{64}}; - std::array w126_scale; + static std::array w126_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w126_scale.begin(), w126_scale.end(), std::ref(scalerng)); @@ -2112,7 +2112,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w127_data; uint32_t w127 = XNN_INVALID_VALUE_ID; std::array w127_dims = {{384, 1, 1, 64}}; - std::array w127_scale; + static std::array 
w127_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w127_scale.begin(), w127_scale.end(), std::ref(scalerng)); @@ -2131,7 +2131,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w128_data; uint32_t w128 = XNN_INVALID_VALUE_ID; std::array w128_dims = {{384}}; - std::array w128_scale; + static std::array w128_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w128_scale.begin(), w128_scale.end(), std::ref(scalerng)); @@ -2150,7 +2150,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w129_data; uint32_t w129 = XNN_INVALID_VALUE_ID; std::array w129_dims = {{1, 3, 3, 384}}; - std::array w129_scale; + static std::array w129_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w129_scale.begin(), w129_scale.end(), std::ref(scalerng)); @@ -2169,7 +2169,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w130_data; uint32_t w130 = XNN_INVALID_VALUE_ID; std::array w130_dims = {{384}}; - std::array w130_scale; + static std::array w130_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w130_scale.begin(), w130_scale.end(), std::ref(scalerng)); @@ -2188,7 +2188,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w131_data; uint32_t w131 = XNN_INVALID_VALUE_ID; std::array w131_dims = {{96, 1, 1, 384}}; - std::array w131_scale; + static std::array w131_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w131_scale.begin(), w131_scale.end(), std::ref(scalerng)); @@ -2207,7 +2207,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w132_data; uint32_t w132 = XNN_INVALID_VALUE_ID; std::array w132_dims = {{96}}; - std::array w132_scale; + static std::array w132_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w132_scale.begin(), w132_scale.end(), std::ref(scalerng)); @@ -2226,7 +2226,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w133_data; uint32_t w133 = XNN_INVALID_VALUE_ID; std::array w133_dims = {{576, 1, 1, 96}}; - std::array w133_scale; + static std::array w133_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w133_scale.begin(), w133_scale.end(), std::ref(scalerng)); @@ -2245,7 +2245,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w134_data; uint32_t w134 = XNN_INVALID_VALUE_ID; std::array w134_dims = {{576}}; - std::array w134_scale; + static std::array w134_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w134_scale.begin(), w134_scale.end(), std::ref(scalerng)); @@ -2264,7 +2264,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w135_data; uint32_t w135 = XNN_INVALID_VALUE_ID; std::array w135_dims = {{1, 3, 3, 576}}; - std::array w135_scale; + static std::array w135_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w135_scale.begin(), w135_scale.end(), std::ref(scalerng)); @@ -2283,7 +2283,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w136_data; uint32_t w136 = XNN_INVALID_VALUE_ID; std::array w136_dims = {{576}}; - std::array w136_scale; + static std::array w136_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w136_scale.begin(), w136_scale.end(), std::ref(scalerng)); @@ -2302,7 +2302,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w137_data; uint32_t w137 = XNN_INVALID_VALUE_ID; std::array w137_dims = {{96, 1, 1, 576}}; - std::array w137_scale; + static std::array w137_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w137_scale.begin(), w137_scale.end(), std::ref(scalerng)); @@ -2321,7 +2321,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w138_data; uint32_t w138 = XNN_INVALID_VALUE_ID; std::array w138_dims = {{96}}; - std::array w138_scale; + static std::array w138_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w138_scale.begin(), w138_scale.end(), std::ref(scalerng)); @@ -2340,7 +2340,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w139_data; uint32_t w139 = XNN_INVALID_VALUE_ID; std::array w139_dims = {{576, 1, 1, 96}}; - std::array w139_scale; + static std::array w139_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w139_scale.begin(), w139_scale.end(), std::ref(scalerng)); @@ -2359,7 +2359,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w140_data; uint32_t w140 = XNN_INVALID_VALUE_ID; std::array w140_dims = {{576}}; - std::array w140_scale; + static std::array w140_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w140_scale.begin(), w140_scale.end(), std::ref(scalerng)); @@ -2378,7 +2378,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w141_data; uint32_t w141 = XNN_INVALID_VALUE_ID; std::array w141_dims = {{1, 3, 3, 576}}; - std::array w141_scale; + static std::array w141_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w141_scale.begin(), w141_scale.end(), std::ref(scalerng)); @@ -2397,7 +2397,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w142_data; uint32_t w142 = XNN_INVALID_VALUE_ID; std::array w142_dims = {{576}}; - std::array w142_scale; + static std::array w142_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w142_scale.begin(), w142_scale.end(), std::ref(scalerng)); @@ -2416,7 +2416,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w143_data; uint32_t w143 = XNN_INVALID_VALUE_ID; std::array w143_dims = {{96, 1, 1, 576}}; - std::array w143_scale; + static std::array w143_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w143_scale.begin(), w143_scale.end(), std::ref(scalerng)); @@ -2435,7 +2435,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w144_data; uint32_t w144 = XNN_INVALID_VALUE_ID; std::array w144_dims = {{96}}; - std::array w144_scale; + static std::array w144_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w144_scale.begin(), w144_scale.end(), std::ref(scalerng)); @@ -2454,7 +2454,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w145_data; uint32_t w145 = XNN_INVALID_VALUE_ID; std::array w145_dims = {{576, 1, 1, 96}}; - std::array w145_scale; + static std::array w145_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w145_scale.begin(), w145_scale.end(), std::ref(scalerng)); @@ -2473,7 +2473,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w146_data; uint32_t w146 = XNN_INVALID_VALUE_ID; std::array w146_dims = {{576}}; - std::array w146_scale; + static std::array w146_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w146_scale.begin(), w146_scale.end(), std::ref(scalerng)); @@ -2492,7 +2492,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w147_data; uint32_t w147 = XNN_INVALID_VALUE_ID; std::array w147_dims = {{1, 3, 3, 576}}; - std::array w147_scale; + static std::array w147_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w147_scale.begin(), w147_scale.end(), std::ref(scalerng)); @@ -2511,7 +2511,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w148_data; uint32_t w148 = XNN_INVALID_VALUE_ID; std::array w148_dims = {{576}}; - std::array w148_scale; + static std::array w148_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w148_scale.begin(), w148_scale.end(), std::ref(scalerng)); @@ -2530,7 +2530,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w149_data; uint32_t w149 = XNN_INVALID_VALUE_ID; std::array w149_dims = {{160, 1, 1, 576}}; - std::array w149_scale; + static std::array w149_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w149_scale.begin(), w149_scale.end(), std::ref(scalerng)); @@ -2549,7 +2549,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w150_data; uint32_t w150 = XNN_INVALID_VALUE_ID; std::array w150_dims = {{160}}; - std::array w150_scale; + static std::array w150_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w150_scale.begin(), w150_scale.end(), std::ref(scalerng)); @@ -2568,7 +2568,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w151_data; uint32_t w151 = XNN_INVALID_VALUE_ID; std::array w151_dims = {{960, 1, 1, 160}}; - std::array w151_scale; + static std::array w151_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w151_scale.begin(), w151_scale.end(), std::ref(scalerng)); @@ -2587,7 +2587,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w152_data; uint32_t w152 = XNN_INVALID_VALUE_ID; std::array w152_dims = {{960}}; - std::array w152_scale; + static std::array w152_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w152_scale.begin(), w152_scale.end(), std::ref(scalerng)); @@ -2606,7 +2606,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w153_data; uint32_t w153 = XNN_INVALID_VALUE_ID; std::array w153_dims = {{1, 3, 3, 960}}; - std::array w153_scale; + static std::array w153_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w153_scale.begin(), w153_scale.end(), std::ref(scalerng)); @@ -2625,7 +2625,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w154_data; uint32_t w154 = XNN_INVALID_VALUE_ID; std::array w154_dims = {{960}}; - std::array w154_scale; + static std::array w154_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w154_scale.begin(), w154_scale.end(), std::ref(scalerng)); @@ -2644,7 +2644,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w155_data; uint32_t w155 = XNN_INVALID_VALUE_ID; std::array w155_dims = {{160, 1, 1, 960}}; - std::array w155_scale; + static std::array w155_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w155_scale.begin(), w155_scale.end(), std::ref(scalerng)); @@ -2663,7 +2663,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w156_data; uint32_t w156 = XNN_INVALID_VALUE_ID; std::array w156_dims = {{160}}; - std::array w156_scale; + static std::array w156_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w156_scale.begin(), w156_scale.end(), std::ref(scalerng)); @@ -2682,7 +2682,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w157_data; uint32_t w157 = XNN_INVALID_VALUE_ID; std::array w157_dims = {{960, 1, 1, 160}}; - std::array w157_scale; + static std::array w157_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w157_scale.begin(), w157_scale.end(), std::ref(scalerng)); @@ -2701,7 +2701,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w158_data; uint32_t w158 = XNN_INVALID_VALUE_ID; std::array w158_dims = {{960}}; - std::array w158_scale; + static std::array w158_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w158_scale.begin(), w158_scale.end(), std::ref(scalerng)); @@ -2720,7 +2720,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w159_data; uint32_t w159 = XNN_INVALID_VALUE_ID; std::array w159_dims = {{1, 3, 3, 960}}; - std::array w159_scale; + static std::array w159_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w159_scale.begin(), w159_scale.end(), std::ref(scalerng)); @@ -2739,7 +2739,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w160_data; uint32_t w160 = XNN_INVALID_VALUE_ID; std::array w160_dims = {{960}}; - std::array w160_scale; + static std::array w160_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w160_scale.begin(), w160_scale.end(), std::ref(scalerng)); @@ -2758,7 +2758,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w161_data; uint32_t w161 = XNN_INVALID_VALUE_ID; std::array w161_dims = {{160, 1, 1, 960}}; - std::array w161_scale; + static std::array w161_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w161_scale.begin(), w161_scale.end(), std::ref(scalerng)); @@ -2777,7 +2777,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w162_data; uint32_t w162 = XNN_INVALID_VALUE_ID; std::array w162_dims = {{160}}; - std::array w162_scale; + static std::array w162_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w162_scale.begin(), w162_scale.end(), std::ref(scalerng)); @@ -2796,7 +2796,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w163_data; uint32_t w163 = XNN_INVALID_VALUE_ID; std::array w163_dims = {{960, 1, 1, 160}}; - std::array w163_scale; + static std::array w163_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w163_scale.begin(), w163_scale.end(), std::ref(scalerng)); @@ -2815,7 +2815,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w164_data; uint32_t w164 = XNN_INVALID_VALUE_ID; std::array w164_dims = {{960}}; - std::array w164_scale; + static std::array w164_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w164_scale.begin(), w164_scale.end(), std::ref(scalerng)); @@ -2834,7 +2834,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w165_data; uint32_t w165 = XNN_INVALID_VALUE_ID; std::array w165_dims = {{1, 3, 3, 960}}; - std::array w165_scale; + static std::array w165_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w165_scale.begin(), w165_scale.end(), std::ref(scalerng)); @@ -2853,7 +2853,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w166_data; uint32_t w166 = XNN_INVALID_VALUE_ID; std::array w166_dims = {{960}}; - std::array w166_scale; + static std::array w166_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w166_scale.begin(), w166_scale.end(), std::ref(scalerng)); @@ -2872,7 +2872,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w167_data; uint32_t w167 = XNN_INVALID_VALUE_ID; std::array w167_dims = {{320, 1, 1, 960}}; - std::array w167_scale; + static std::array w167_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w167_scale.begin(), w167_scale.end(), std::ref(scalerng)); @@ -2891,7 +2891,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w168_data; uint32_t w168 = XNN_INVALID_VALUE_ID; std::array w168_dims = {{320}}; - std::array w168_scale; + static std::array w168_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w168_scale.begin(), w168_scale.end(), std::ref(scalerng)); @@ -2910,7 +2910,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w169_data; uint32_t w169 = XNN_INVALID_VALUE_ID; std::array w169_dims = {{1280, 1, 1, 320}}; - std::array w169_scale; + static std::array w169_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w169_scale.begin(), w169_scale.end(), std::ref(scalerng)); @@ -2929,7 +2929,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w170_data; uint32_t w170 = XNN_INVALID_VALUE_ID; std::array w170_dims = {{1280}}; - std::array w170_scale; + static std::array w170_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w170_scale.begin(), w170_scale.end(), std::ref(scalerng)); diff --git a/bench/negate.cc b/bench/negate.cc index a53d5c0c66b..4ca421d4887 100644 --- a/bench/negate.cc +++ b/bench/negate.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/packq-benchmark.cc b/bench/packq-benchmark.cc index 168fb6c1af8..a7b09412d41 100644 --- a/bench/packq-benchmark.cc +++ b/bench/packq-benchmark.cc @@ -9,14 +9,12 @@ #include #include #include -#include -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/pack.h" #include "xnnpack/packq.h" -#include "xnnpack/buffer.h" #include void x8_packq(benchmark::State& state, xnn_x8_packq_f32qp8_ukernel_fn packq, diff --git a/bench/packq-benchmark.h b/bench/packq-benchmark.h index 77f5b80d37d..7502098714a 100644 --- a/bench/packq-benchmark.h +++ b/bench/packq-benchmark.h @@ -8,7 +8,7 @@ #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" #include "xnnpack/pack.h" diff --git a/bench/packw-benchmark.h b/bench/packw-benchmark.h index 8204e51ccfc..d82f2ace33a 100644 --- a/bench/packw-benchmark.h +++ b/bench/packw-benchmark.h @@ -8,8 +8,8 @@ #include #include -#include "bench/bgemm.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/pack.h" #include "xnnpack/packw.h" diff --git a/bench/prelu.cc b/bench/prelu.cc index bd17cac386c..8da165cb695 100644 --- a/bench/prelu.cc +++ b/bench/prelu.cc @@ -7,13 +7,13 @@ #include #include #include +#include #include #include #include +#include "utils.h" #include "xnnpack.h" - -#include "bench/utils.h" #include "xnnpack/buffer.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE @@ -43,6 +43,9 @@ void xnnpack_prelu_f32(benchmark::State& state, const char* net) { std::generate(slope.begin(), slope.end(), std::ref(f32wrng)); xnnpack::Buffer output(batch_size * height * width * channels); + const size_t input_shape[4] = {batch_size, height, width, channels}; + const size_t slope_shape[1] = {channels}; + xnn_status status = xnn_initialize(nullptr /* allocator */); if (status != xnn_status_success) { state.SkipWithError("failed to initialize XNNPACK"); @@ -50,27 +53,24 @@ void xnnpack_prelu_f32(benchmark::State& state, const char* net) { } xnn_operator_t prelu_op = nullptr; - status = xnn_create_prelu_nc_f32( - channels, /*slope_channels=*/channels, /*input_stride=*/channels , /*output_stride=*/channels, - slope.data(), - 0 /* flags */, nullptr, nullptr, &prelu_op); + status = xnn_create_binary_elementwise_nd(xnn_binary_prelu, xnn_datatype_fp32, + nullptr, nullptr, nullptr, + /*flags=*/0, &prelu_op); if (status != xnn_status_success) { state.SkipWithError("failed to create FP32 PReLU operator"); return; } - status = xnn_reshape_prelu_nc_f32( - prelu_op, - batch_size * height * width, - /*threadpool=*/nullptr); + status = xnn_reshape_binary_elementwise_nd(prelu_op, 4, &input_shape[0], 1, + &slope_shape[0], + /*threadpool=*/nullptr); if (status != xnn_status_success) { state.SkipWithError("failed to reshape FP32 PReLU operator"); return; } - status = xnn_setup_prelu_nc_f32( - prelu_op, - input.data(), output.data()); + status = xnn_setup_binary_elementwise_nd(prelu_op, input.data(), slope.data(), + output.data()); if (status != xnn_status_success) { state.SkipWithError("failed to setup FP32 PReLU operator"); return; diff --git a/bench/qd8-f16-qb4w-gemm.cc b/bench/qd8-f16-qb4w-gemm.cc index 
406c7a9b8fb..ee0ece7a993 100644 --- a/bench/qd8-f16-qb4w-gemm.cc +++ b/bench/qd8-f16-qb4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f16-qc4w-gemm.cc b/bench/qd8-f16-qc4w-gemm.cc index 72b644ec1f3..916d129ee52 100644 --- a/bench/qd8-f16-qc4w-gemm.cc +++ b/bench/qd8-f16-qc4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f16-qc8w-gemm.cc b/bench/qd8-f16-qc8w-gemm.cc index 6de088c0fd8..c46f1602263 100644 --- a/bench/qd8-f16-qc8w-gemm.cc +++ b/bench/qd8-f16-qc8w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f32-qb4w-gemm.cc b/bench/qd8-f32-qb4w-gemm.cc index 4794bfefb34..175d9b816c3 100644 --- a/bench/qd8-f32-qb4w-gemm.cc +++ b/bench/qd8-f32-qb4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f32-qc4w-gemm.cc b/bench/qd8-f32-qc4w-gemm.cc index 4e9931b6924..9066e099c1c 100644 --- a/bench/qd8-f32-qc4w-gemm.cc +++ b/bench/qd8-f32-qc4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f32-qc8w-gemm.cc b/bench/qd8-f32-qc8w-gemm.cc index 838ad87ab76..906d7057611 100644 --- a/bench/qd8-f32-qc8w-gemm.cc +++ b/bench/qd8-f32-qc8w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qp8-f32-qb4w-gemm.cc b/bench/qp8-f32-qb4w-gemm.cc index f769132e5c5..71c64196e4e 100644 --- a/bench/qp8-f32-qb4w-gemm.cc +++ b/bench/qp8-f32-qb4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qp8-f32-qc4w-gemm.cc b/bench/qp8-f32-qc4w-gemm.cc index 4a25dfd0708..cd5af5412eb 100644 --- a/bench/qp8-f32-qc4w-gemm.cc +++ b/bench/qp8-f32-qc4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qs16-qs8-vcvt.cc b/bench/qs16-qs8-vcvt.cc index e17893e50a6..4ea6126a68d 100644 --- a/bench/qs16-qs8-vcvt.cc +++ 
b/bench/qs16-qs8-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -30,7 +30,7 @@ static void qs16_qs8_vcvt( BENCHMARK_CAPTURE(qs16_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h" +#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qs8-dwconv.cc b/bench/qs8-dwconv.cc index b5ced99b2df..3d862732f7d 100644 --- a/bench/qs8-dwconv.cc +++ b/bench/qs8-dwconv.cc @@ -4,32 +4,31 @@ // LICENSE file in the root directory of this source tree. #include -#include -#include +#include +#include #include #include #include -#include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/indirection.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microkernel-utils.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/pack.h" -#include "xnnpack/buffer.h" #include -static void DWConvBenchmark(benchmark::State& state, - xnn_qs8_dwconv_minmax_unipass_ukernel_fn dwconv, - xnn_init_qs8_conv_minmax_params_fn init_params, - uint32_t channel_tile, uint32_t primary_tile, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ +static void DWConvBenchmark( + benchmark::State& state, xnn_qs8_dwconv_minmax_unipass_ukernel_fn dwconv, + xnn_init_qs8_conv_minmax_params_fn init_params, uint32_t channel_tile, + uint32_t primary_tile, + benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } @@ -52,66 +51,83 @@ static void DWConvBenchmark(benchmark::State& state, std::random_device random_device; auto rng = std::mt19937(random_device()); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - auto i8rng = std::bind( - std::uniform_int_distribution(-std::numeric_limits::max(), std::numeric_limits::max()), std::ref(rng)); + auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), + std::ref(rng)); + auto i8rng = std::bind(std::uniform_int_distribution( + -std::numeric_limits::max(), + std::numeric_limits::max()), + std::ref(rng)); const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1; const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1; const size_t padding_left = padding_width / 2; const size_t padding_top = padding_height / 2; - const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1; - const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1; + const size_t output_height = + (input_height + padding_height - effective_kernel_height) / subsampling + + 1; + const size_t output_width = + (input_width + padding_width - effective_kernel_width) / subsampling + 1; const size_t output_size = output_height * output_width; - const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width; - const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height; + const size_t step_width = + dilation == 1 ? 
std::min(subsampling, kernel_width) : kernel_width; + const size_t step_height = + kernel_size + (output_width - 1) * step_width * kernel_height; - const size_t c_stride = benchmark::utils::RoundUp(channels, channel_tile); + const size_t c_stride = + benchmark::utils::RoundUp(channels, channel_tile); - xnnpack::Buffer a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(int8_t)); + xnnpack::Buffer a(channels * input_height * input_width + + XNN_EXTRA_BYTES / sizeof(int8_t)); std::generate(a.begin(), a.end(), std::ref(i8rng)); xnnpack::Buffer k(channels * kernel_height * kernel_width); std::generate(k.begin(), k.end(), std::ref(i8rng)); xnnpack::Buffer b(channels); std::generate(b.begin(), b.end(), std::ref(i32rng)); + // Zero buffer needs to be initialized with zeros. xnnpack::Buffer z(channels + XNN_EXTRA_BYTES / sizeof(int8_t)); + std::fill(z.begin(), z.end(), 0); const size_t k_elements = kernel_size * c_stride; const size_t b_elements = c_stride; - const size_t w_size = k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t); - // Can read (primary_tile - kernel_size) elements after end of indirection buffer. - const size_t i_elements = (primary_tile - kernel_size) + output_height * step_height; + const size_t w_size = + k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t); + // Can read (primary_tile - kernel_size) elements after end of indirection + // buffer. + const size_t i_elements = + (primary_tile - kernel_size) + output_height * step_height; const size_t c_elements = output_size * channels; - const size_t num_buffers = 1 + - benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), - (c_elements * sizeof(int8_t) + w_size) + sizeof(void*) * i_elements); + const size_t num_buffers = 1 + benchmark::utils::DivideRoundUp( + benchmark::utils::GetMaxCacheSize(), + (c_elements * sizeof(int8_t) + w_size) + + sizeof(void*) * i_elements); + // Explicitly initialize the weights buffer since `num_buffers` may be larger + // than the number of buffers that are actually initialized/needed. xnnpack::Buffer w(w_size * num_buffers); + std::fill(w.begin(), w.end(), 0); + + // Pack the weights buffer. 
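// The packing call below rearranges the GHW-layout kernel and the int32 biases into
// XNNPACK's blocked per-channel-tile weight layout (roughly: for each group of
// `channel_tile` channels, the biases come first, then the taps for each of the
// `primary_tile` kernel positions). The input zero point from `packing_params`
// (zero in this benchmark) is folded into the packed biases at pack time.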
struct xnn_qs8_packing_params packing_params; packing_params.input_zero_point = 0; - xnn_pack_qs8_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, channels, - channel_tile, channel_tile, /*channel_round=*/1, - k.data(), b.data(), /*scale=*/nullptr, w.data(), - /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, &packing_params); + xnn_pack_qs8_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, + channels, channel_tile, channel_tile, + /*channel_round=*/1, k.data(), b.data(), + /*scale=*/nullptr, w.data(), + /*per_tile_extra_bytes=*/0, + /*per_subtile_extra_bytes=*/0, &packing_params); for (size_t n = 1; n < num_buffers; n++) { std::copy(w.cbegin(), w.cbegin() + w_size, w.begin() + n * w_size); } xnnpack::Buffer i(i_elements * num_buffers); xnn_indirection_init_dwconv2d( - /*output_y_start=*/0, /*output_y_end=*/output_height, - reinterpret_cast(i.data()), - a.data(), - channels << XNN_LOG2_SIZEOF_INT8_T, - z.data(), - input_height, input_width, - output_height, output_width, - kernel_height, kernel_width, - subsampling, subsampling, - dilation, dilation, - padding_top, padding_left, - step_height, step_width, primary_tile); + /*output_y_start=*/0, /*output_y_end=*/output_height, + reinterpret_cast(i.data()), a.data(), + channels << XNN_LOG2_SIZEOF_INT8_T, z.data(), input_height, input_width, + output_height, output_width, kernel_height, kernel_width, subsampling, + subsampling, dilation, dilation, padding_top, padding_left, step_height, + step_width, primary_tile); for (size_t n = 1; n < num_buffers; n++) { std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements); } @@ -119,8 +135,9 @@ static void DWConvBenchmark(benchmark::State& state, xnnpack::Buffer c(c_elements * num_buffers); xnn_qs8_conv_minmax_params params; - init_params(¶ms, - 0.5f /* scale */, 0 /* output zero point */, std::numeric_limits::min(), std::numeric_limits::max()); + init_params(¶ms, 0.5f /* scale */, 0 /* output zero point */, + std::numeric_limits::min(), + std::numeric_limits::max()); size_t buffer_index = 0; for (auto _ : state) { @@ -131,11 +148,11 @@ static void DWConvBenchmark(benchmark::State& state, for (size_t y = 0; y < output_height; y++) { dwconv(channels, output_width, - i.data() + buffer_index * i_elements + step_height * y, - w.data() + buffer_index * w_size, - c.data() + buffer_index * c_elements + y * output_width * channels, - kernel_height * step_width * sizeof(void*), 0, - 0, z.data(), ¶ms); + i.data() + buffer_index * i_elements + step_height * y, + w.data() + buffer_index * w_size, + c.data() + buffer_index * c_elements + y * output_width * channels, + kernel_height * step_width * sizeof(void*), 0, 0, z.data(), + ¶ms); } } @@ -144,13 +161,17 @@ static void DWConvBenchmark(benchmark::State& state, state.counters["cpufreq"] = cpu_frequency; } - state.counters["OPS"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, - benchmark::Counter::kIsRate); + state.counters["OPS"] = + benchmark::Counter(static_cast(state.iterations()) * 2 * + output_size * channels * kernel_size, + benchmark::Counter::kIsRate); state.counters["bytes"] = benchmark::Counter( - uint64_t(state.iterations()) * channels * ((output_size + input_height * input_width + kernel_size) * sizeof(int8_t) + sizeof(int32_t)), - benchmark::Counter::kIsRate); + static_cast(state.iterations()) * channels * + ((output_size + input_height * input_width + kernel_size) * + sizeof(int8_t) + + sizeof(int32_t)), + benchmark::Counter::kIsRate); } static void 
DWConvBenchmark(benchmark::State& state, @@ -679,8 +700,7 @@ static void DWConvBenchmark(benchmark::State& state, #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void qs8_dwconv_9p16c__avx512skx_mul32(benchmark::State& state, const char* net) { DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, @@ -693,6 +713,12 @@ static void DWConvBenchmark(benchmark::State& state, xnn_init_qs8_conv_minmax_fp32_scalar_params, 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX); } + + BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx512skx_mul32); + BENCHMARK_DWCONV(qs8_dwconv_9p32c__avx512skx_mul32); +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void qs8_dwconv_9p16c__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) { DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx, @@ -1239,9 +1265,6 @@ static void DWConvBenchmark(benchmark::State& state, benchmark::utils::CheckAVX2); } - BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx512skx_mul32); - BENCHMARK_DWCONV(qs8_dwconv_9p32c__avx512skx_mul32); - BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx2_mul16_vpmovsx); BENCHMARK_DWCONV(qs8_dwconv_9p32c__avx2_mul16_vpmovsx); BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx2_mul16_vpunpck); diff --git a/bench/qs8-f16-vcvt.cc b/bench/qs8-f16-vcvt.cc index 8feb7e15af9..bcf73e25966 100644 --- a/bench/qs8-f16-vcvt.cc +++ b/bench/qs8-f16-vcvt.cc @@ -3,16 +3,17 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/vcvt.h" +#include static void qs8_f16_vcvt( benchmark::State& state, @@ -33,7 +34,7 @@ static void qs8_f16_vcvt( BENCHMARK_CAPTURE(qs8_f16_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs8-f16-vcvt/qs8-f16-vcvt.h" +#include "qs8-f16-vcvt/qs8-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qs8-f32-vcvt.cc b/bench/qs8-f32-vcvt.cc index 8bac67bc1f1..5dc917aa950 100644 --- a/bench/qs8-f32-vcvt.cc +++ b/bench/qs8-f32-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -32,7 +32,7 @@ static void qs8_f32_vcvt( BENCHMARK_CAPTURE(qs8_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs8-f32-vcvt/qs8-f32-vcvt.h" +#include "qs8-f32-vcvt/qs8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc index 1590c078f59..50e436e06a3 100644 --- a/bench/qs8-gemm.cc +++ b/bench/qs8-gemm.cc @@ -14,8 +14,8 @@ #include #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY diff --git a/bench/qs8-packw.cc b/bench/qs8-packw.cc index 6b54534823a..dcd8971c795 100644 --- a/bench/qs8-packw.cc +++ b/bench/qs8-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void qs8_packw(benchmark::State& state, const char* net, #define XNN_QS8_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) \ BENCHMARK_CAPTURE_BGEMM(qs8_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/qs8-packw/qs8-packw.h" +#include "qs8-packw/qs8-packw.h" #undef XNN_QS8_UKERNEL diff --git a/bench/qs8-qc8w-gemm-fp32.cc b/bench/qs8-qc8w-gemm-fp32.cc index 905f3fc491d..795f8393620 100644 --- a/bench/qs8-qc8w-gemm-fp32.cc +++ b/bench/qs8-qc8w-gemm-fp32.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qs8-rdsum.cc b/bench/qs8-rdsum.cc index f227b87b189..0a136758c59 100644 --- a/bench/qs8-rdsum.cc +++ b/bench/qs8-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/qs8-rdsum-minmax-fp32.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qs8-requantization.cc b/bench/qs8-requantization.cc index 3b1d63e6927..cff6e6934aa 100644 --- a/bench/qs8-requantization.cc +++ b/bench/qs8-requantization.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -72,12 +72,6 @@ static void qs8_requantization( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndna__neon, - xnn_qs8_requantize_rndna__neon, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndnu__neon_mull, xnn_qs8_requantize_rndnu__neon_mull, benchmark::utils::CheckNEON) @@ -116,21 +110,6 @@ static void qs8_requantization( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndna__sse2, - xnn_qs8_requantize_rndna__sse2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - 
BENCHMARK_CAPTURE(qs8_requantization, rndna__ssse3, - xnn_qs8_requantize_rndna__ssse3, - benchmark::utils::CheckSSSE3) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndna__sse41, - xnn_qs8_requantize_rndna__sse41, - benchmark::utils::CheckSSE41) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndnu__sse41_sra, xnn_qs8_requantize_rndnu__sse41_sra, benchmark::utils::CheckSSE41) @@ -169,19 +148,6 @@ BENCHMARK_CAPTURE(qs8_requantization, gemmlowp__scalar, ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(qs8_requantization, rndna__scalar_signed64, - xnn_qs8_requantize_rndna__scalar_signed64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qs8_requantization, rndna__scalar_unsigned32, - xnn_qs8_requantize_rndna__scalar_unsigned32) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qs8_requantization, rndna__scalar_unsigned64, - xnn_qs8_requantize_rndna__scalar_unsigned64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndnu__scalar, xnn_qs8_requantize_rndnu__scalar) ->Apply(benchmark::utils::UnaryElementwiseParameters) diff --git a/bench/qs8-rsum.cc b/bench/qs8-rsum.cc index 5b4f1a319ca..2651d3625ef 100644 --- a/bench/qs8-rsum.cc +++ b/bench/qs8-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/qs8-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qs8-vcvt.cc b/bench/qs8-vcvt.cc index fc4009e787f..765c08b107b 100644 --- a/bench/qs8-vcvt.cc +++ b/bench/qs8-vcvt.cc @@ -5,8 +5,8 @@ #include #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -31,7 +31,7 @@ static void qs8_vcvt( BENCHMARK_CAPTURE(qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs8-vcvt/qs8-vcvt.h" +#include "qs8-vcvt/qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qu8-f32-vcvt.cc b/bench/qu8-f32-vcvt.cc index 08c9571b28b..f90bf47d203 100644 --- a/bench/qu8-f32-vcvt.cc +++ b/bench/qu8-f32-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -32,7 +32,7 @@ static void qu8_f32_vcvt( BENCHMARK_CAPTURE(qu8_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qu8-f32-vcvt/qu8-f32-vcvt.h" +#include "qu8-f32-vcvt/qu8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qu8-gemm-fp32.cc b/bench/qu8-gemm-fp32.cc index e7fc2ba3364..67f1ce5f5c2 100644 --- a/bench/qu8-gemm-fp32.cc +++ b/bench/qu8-gemm-fp32.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qu8-gemm-rndnu.cc b/bench/qu8-gemm-rndnu.cc index ac374a7a12e..c812703d5e8 100644 --- a/bench/qu8-gemm-rndnu.cc +++ b/bench/qu8-gemm-rndnu.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qu8-gemm.cc b/bench/qu8-gemm.cc index 89514b12aa0..2ecea2ff721 100644 --- a/bench/qu8-gemm.cc +++ b/bench/qu8-gemm.cc @@ -24,8 +24,8 @@ #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -537,8 +537,7 @@ static void ruy_st(benchmark::State& state, const char* net) BENCHMARK_GEMM(qu8_gemm_2x2c4__armsimd32) #endif // XNN_ARCH_ARM - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, @@ -546,6 +545,11 @@ static void ruy_st(benchmark::State& state, const char* net) 1, 16, 8, 1, benchmark::utils::CheckAVX512SKX); } + + BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2, @@ -849,8 +853,6 @@ static void ruy_st(benchmark::State& state, const char* net) /*mr=*/3, /*nr=*/4, /*kr=*/8, /*sr=*/1); } - BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx) - BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2) BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2) BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2) diff --git a/bench/qu8-rdsum.cc b/bench/qu8-rdsum.cc index b61c39de3ed..747117bcc11 100644 --- a/bench/qu8-rdsum.cc +++ b/bench/qu8-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/qu8-rdsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qu8-requantization.cc b/bench/qu8-requantization.cc index d01936b587e..301872bf193 100644 --- a/bench/qu8-requantization.cc +++ b/bench/qu8-requantization.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include 
"xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -71,12 +71,6 @@ static void qu8_requantization( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - - BENCHMARK_CAPTURE(qu8_requantization, rndna__neon, - xnn_qu8_requantize_rndna__neon, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -99,21 +93,6 @@ static void qu8_requantization( benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - - BENCHMARK_CAPTURE(qu8_requantization, rndna__sse2, - xnn_qu8_requantize_rndna__sse2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qu8_requantization, rndna__ssse3, - xnn_qu8_requantize_rndna__ssse3, - benchmark::utils::CheckSSSE3) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qu8_requantization, rndna__sse41, - xnn_qu8_requantize_rndna__sse41, - benchmark::utils::CheckSSE41) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -142,19 +121,6 @@ BENCHMARK_CAPTURE(qu8_requantization, gemmlowp__scalar, ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(qu8_requantization, rndna__scalar_signed64, - xnn_qu8_requantize_rndna__scalar_signed64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qu8_requantization, rndna__scalar_unsigned32, - xnn_qu8_requantize_rndna__scalar_unsigned32) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qu8_requantization, rndna__scalar_unsigned64, - xnn_qu8_requantize_rndna__scalar_unsigned64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - #ifndef XNNPACK_BENCHMARK_NO_MAIN BENCHMARK_MAIN(); #endif diff --git a/bench/qu8-rsum.cc b/bench/qu8-rsum.cc index f5c7ff8b2fb..54982589776 100644 --- a/bench/qu8-rsum.cc +++ b/bench/qu8-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/qu8-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qu8-vcvt.cc b/bench/qu8-vcvt.cc index 26a5d911075..f43be113cd2 100644 --- a/bench/qu8-vcvt.cc +++ b/bench/qu8-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -30,7 +30,7 @@ static void qu8_vcvt( BENCHMARK_CAPTURE(qu8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qu8-vcvt/qu8-vcvt.h" +#include "qu8-vcvt/qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/reciprocal-square-root.cc b/bench/reciprocal-square-root.cc index 858a193afa4..2ce8ebe1003 100644 --- a/bench/reciprocal-square-root.cc +++ b/bench/reciprocal-square-root.cc @@ -6,7 +6,7 @@ #include "xnnpack.h" #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/rsum-benchmark.h b/bench/rsum-benchmark.h index 31327107d92..1aa40fc2d10 100644 --- a/bench/rsum-benchmark.h +++ b/bench/rsum-benchmark.h @@ -12,13 +12,14 @@ #include #include -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/reduce.h" -#include "xnnpack/buffer.h" #include namespace { diff --git a/bench/s32-f32-vcvt.cc b/bench/s32-f32-vcvt.cc index a5f8748247c..a90c238e95f 100644 --- a/bench/s32-f32-vcvt.cc +++ b/bench/s32-f32-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -30,7 +30,7 @@ static void s32_f32_vcvt( BENCHMARK_CAPTURE(s32_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/s32-f32-vcvt/s32-f32-vcvt.h" +#include "s32-f32-vcvt/s32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/scaled-dot-product-attention.cc b/bench/scaled-dot-product-attention.cc index ec89d58d71c..17043f7f715 100644 --- a/bench/scaled-dot-product-attention.cc +++ b/bench/scaled-dot-product-attention.cc @@ -15,7 +15,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_multihead_scaled_batch_matrix_multiply_cap_tanh_f32(benchmark::State& state, const char* net) { diff --git a/bench/sigmoid.cc b/bench/sigmoid.cc index 6344a8a0363..1c34fcdb52e 100644 --- a/bench/sigmoid.cc +++ b/bench/sigmoid.cc @@ -3,13 +3,13 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffer_builder.h" diff --git a/bench/softmax.cc b/bench/softmax.cc index 5672efe6ab4..3dc57217889 100644 --- a/bench/softmax.cc +++ b/bench/softmax.cc @@ -14,7 +14,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/math.h" #include "xnnpack/buffer.h" diff --git a/bench/spmm-benchmark.h b/bench/spmm-benchmark.h index b0d02b57e9e..300a747d082 100644 --- a/bench/spmm-benchmark.h +++ b/bench/spmm-benchmark.h @@ -5,8 +5,8 @@ #pragma once -#include "bench/spmm.h" -#include "bench/utils.h" +#include "spmm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" diff --git a/bench/square-root.cc b/bench/square-root.cc index 5411a4c7feb..e7d56c97a06 100644 --- a/bench/square-root.cc +++ b/bench/square-root.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/square.cc b/bench/square.cc index c794322017a..17d1c4fbc90 100644 --- a/bench/square.cc +++ b/bench/square.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/tanh.cc b/bench/tanh.cc index ad30bbfbb10..02b8f8113b0 100644 --- a/bench/tanh.cc +++ b/bench/tanh.cc @@ -3,13 +3,13 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffer_builder.h" diff --git a/bench/truncation.cc b/bench/truncation.cc index 2358fc3e445..4b5b3d780d9 100644 --- a/bench/truncation.cc +++ b/bench/truncation.cc @@ -6,7 +6,7 @@ #include "xnnpack.h" #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/u32-f32-vcvt.cc b/bench/u32-f32-vcvt.cc index 44d70e02723..4f570cf1cad 100644 --- a/bench/u32-f32-vcvt.cc +++ b/bench/u32-f32-vcvt.cc @@ -9,8 +9,8 @@ #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -35,7 +35,7 @@ static void u32_f32_vcvt( BENCHMARK_CAPTURE(u32_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/u32-f32-vcvt/u32-f32-vcvt.h" +#include "u32-f32-vcvt/u32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/unary_operator.h b/bench/unary_operator.h index f8b6eb7a59c..2ad458acdbe 100644 --- a/bench/unary_operator.h +++ b/bench/unary_operator.h @@ -17,7 +17,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/math.h" #include "xnnpack/buffer.h" #include diff --git a/bench/utils.cc b/bench/utils.cc index d612d964cd1..5239d10e980 100644 --- a/bench/utils.cc +++ b/bench/utils.cc @@ -28,7 +28,7 @@ #include "xnnpack/allocator.h" #include "xnnpack/hardware-config.h" -#include "bench/utils.h" +#include "utils.h" static void* wipe_buffer = nullptr; static size_t wipe_buffer_size = 0; diff --git a/bench/vbinary.cc b/bench/vbinary.cc index 526439f412a..7d24e90a663 100644 --- a/bench/vbinary.cc +++ b/bench/vbinary.cc @@ -13,7 +13,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -156,59 +156,59 @@ static void vbinary(benchmark::State& state, uint64_t arch_flags, ->Apply( \ benchmark::utils::BinaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-vbinary/f16-vadd.h" -#include "src/f16-vbinary/f16-vaddc.h" -#include "src/f16-vbinary/f16-vdiv.h" -#include "src/f16-vbinary/f16-vdivc.h" -#include "src/f16-vbinary/f16-vmax.h" -#include "src/f16-vbinary/f16-vmaxc.h" -#include "src/f16-vbinary/f16-vmin.h" -#include "src/f16-vbinary/f16-vminc.h" -#include "src/f16-vbinary/f16-vmul.h" -#include "src/f16-vbinary/f16-vmulc.h" -#include "src/f16-vbinary/f16-vprelu.h" -#include "src/f16-vbinary/f16-vpreluc.h" -#include "src/f16-vbinary/f16-vrdivc.h" -#include "src/f16-vbinary/f16-vrpreluc.h" -#include "src/f16-vbinary/f16-vrsubc.h" -#include "src/f16-vbinary/f16-vsqrdiff.h" -#include "src/f16-vbinary/f16-vsqrdiffc.h" -#include "src/f16-vbinary/f16-vsub.h" -#include "src/f16-vbinary/f16-vsubc.h" -#include "src/f32-vbinary/f32-vadd.h" -#include "src/f32-vbinary/f32-vaddc.h" -#include "src/f32-vbinary/f32-vcopysign.h" -#include "src/f32-vbinary/f32-vcopysignc.h" -#include "src/f32-vbinary/f32-vdiv.h" -#include "src/f32-vbinary/f32-vdivc.h" -#include "src/f32-vbinary/f32-vmax.h" -#include "src/f32-vbinary/f32-vmaxc.h" -#include "src/f32-vbinary/f32-vmin.h" -#include 
"src/f32-vbinary/f32-vminc.h" -#include "src/f32-vbinary/f32-vmul.h" -#include "src/f32-vbinary/f32-vmulc.h" -#include "src/f32-vbinary/f32-vprelu.h" -#include "src/f32-vbinary/f32-vpreluc.h" -#include "src/f32-vbinary/f32-vrcopysignc.h" -#include "src/f32-vbinary/f32-vrdivc.h" -#include "src/f32-vbinary/f32-vrpreluc.h" -#include "src/f32-vbinary/f32-vrsubc.h" -#include "src/f32-vbinary/f32-vsqrdiff.h" -#include "src/f32-vbinary/f32-vsqrdiffc.h" -#include "src/f32-vbinary/f32-vsub.h" -#include "src/f32-vbinary/f32-vsubc.h" -#include "src/qs8-vadd/qs8-vadd-minmax.h" -#include "src/qs8-vaddc/qs8-vaddc-minmax.h" -#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" -#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" -#include "src/qu8-vadd/qu8-vadd-minmax.h" -#include "src/qu8-vaddc/qu8-vaddc-minmax.h" -#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" -#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "f16-vbinary/f16-vadd.h" +#include "f16-vbinary/f16-vaddc.h" +#include "f16-vbinary/f16-vdiv.h" +#include "f16-vbinary/f16-vdivc.h" +#include "f16-vbinary/f16-vmax.h" +#include "f16-vbinary/f16-vmaxc.h" +#include "f16-vbinary/f16-vmin.h" +#include "f16-vbinary/f16-vminc.h" +#include "f16-vbinary/f16-vmul.h" +#include "f16-vbinary/f16-vmulc.h" +#include "f16-vbinary/f16-vprelu.h" +#include "f16-vbinary/f16-vpreluc.h" +#include "f16-vbinary/f16-vrdivc.h" +#include "f16-vbinary/f16-vrpreluc.h" +#include "f16-vbinary/f16-vrsubc.h" +#include "f16-vbinary/f16-vsqrdiff.h" +#include "f16-vbinary/f16-vsqrdiffc.h" +#include "f16-vbinary/f16-vsub.h" +#include "f16-vbinary/f16-vsubc.h" +#include "f32-vbinary/f32-vadd.h" +#include "f32-vbinary/f32-vaddc.h" +#include "f32-vbinary/f32-vcopysign.h" +#include "f32-vbinary/f32-vcopysignc.h" +#include "f32-vbinary/f32-vdiv.h" +#include "f32-vbinary/f32-vdivc.h" +#include "f32-vbinary/f32-vmax.h" +#include "f32-vbinary/f32-vmaxc.h" +#include "f32-vbinary/f32-vmin.h" +#include "f32-vbinary/f32-vminc.h" +#include "f32-vbinary/f32-vmul.h" +#include "f32-vbinary/f32-vmulc.h" +#include "f32-vbinary/f32-vprelu.h" +#include "f32-vbinary/f32-vpreluc.h" +#include "f32-vbinary/f32-vrcopysignc.h" +#include "f32-vbinary/f32-vrdivc.h" +#include "f32-vbinary/f32-vrpreluc.h" +#include "f32-vbinary/f32-vrsubc.h" +#include "f32-vbinary/f32-vsqrdiff.h" +#include "f32-vbinary/f32-vsqrdiffc.h" +#include "f32-vbinary/f32-vsub.h" +#include "f32-vbinary/f32-vsubc.h" +#include "qs8-vadd/qs8-vadd-minmax.h" +#include "qs8-vaddc/qs8-vaddc-minmax.h" +#include "qs8-vmul/qs8-vmul-minmax-fp32.h" +#include "qs8-vmul/qs8-vmul-minmax-rndnu.h" +#include "qs8-vmulc/qs8-vmulc-minmax-fp32.h" +#include "qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "qu8-vadd/qu8-vadd-minmax.h" +#include "qu8-vaddc/qu8-vaddc-minmax.h" +#include "qu8-vmul/qu8-vmul-minmax-fp32.h" +#include "qu8-vmul/qu8-vmul-minmax-rndnu.h" +#include "qu8-vmulc/qu8-vmulc-minmax-fp32.h" +#include "qu8-vmulc/qu8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/vcvt-benchmark.h b/bench/vcvt-benchmark.h index db65e7d2fb6..a97a235fc66 100644 --- a/bench/vcvt-benchmark.h +++ b/bench/vcvt-benchmark.h @@ -8,7 +8,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/buffer.h" #include diff --git a/bench/vunary.cc b/bench/vunary.cc index 
a41b83222f3..30ef008f441 100644 --- a/bench/vunary.cc +++ b/bench/vunary.cc @@ -14,16 +14,16 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" -#include "xnnpack/vhswish.h" -#include "xnnpack/vlrelu.h" -#include "xnnpack/buffer.h" +#include "xnnpack/vunary.h" #include template @@ -197,45 +197,45 @@ void vunary(benchmark::State& state, uint64_t arch_flags, ->Apply( \ benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-vabs/f16-vabs.h" -#include "src/f16-vclamp/f16-vclamp.h" -#include "src/f16-velu/f16-velu.h" -#include "src/f16-vhswish/f16-vhswish.h" -#include "src/f16-vlrelu/f16-vlrelu.h" -#include "src/f16-vneg/f16-vneg.h" -#include "src/f16-vrnd/f16-vrndd.h" -#include "src/f16-vrnd/f16-vrndne.h" -#include "src/f16-vrnd/f16-vrndu.h" -#include "src/f16-vrnd/f16-vrndz.h" -#include "src/f16-vrsqrt/f16-vrsqrt.h" -#include "src/f16-vsigmoid/f16-vsigmoid.h" -#include "src/f16-vsqr/f16-vsqr.h" -#include "src/f16-vsqrt/f16-vsqrt.h" -#include "src/f16-vtanh/f16-vtanh.h" -#include "src/f32-vabs/f32-vabs.h" -#include "src/f32-vclamp/f32-vclamp.h" -#include "src/f32-velu/f32-velu.h" -#include "src/f32-vgelu/f32-vgelu.h" -#include "src/f32-vhswish/f32-vhswish.h" -#include "src/f32-vlog/f32-vlog.h" -#include "src/f32-vlrelu/f32-vlrelu.h" -#include "src/f32-vneg/f32-vneg.h" -#include "src/f32-vrelu/f32-vrelu.h" -#include "src/f32-vrnd/f32-vrndd.h" -#include "src/f32-vrnd/f32-vrndne.h" -#include "src/f32-vrnd/f32-vrndu.h" -#include "src/f32-vrnd/f32-vrndz.h" -#include "src/f32-vrsqrt/f32-vrsqrt.h" -#include "src/f32-vsigmoid/f32-vsigmoid.h" -#include "src/f32-vsqr/f32-vsqr.h" -#include "src/f32-vsqrt/f32-vsqrt.h" -#include "src/f32-vtanh/f32-vtanh.h" -#include "src/qs8-vhswish/qs8-vhswish.h" -#include "src/qs8-vlrelu/qs8-vlrelu.h" -#include "src/qu8-vhswish/qu8-vhswish.h" -#include "src/qu8-vlrelu/qu8-vlrelu.h" -#include "src/s8-vclamp/s8-vclamp.h" -#include "src/u8-vclamp/u8-vclamp.h" +#include "f16-vabs/f16-vabs.h" +#include "f16-vclamp/f16-vclamp.h" +#include "f16-velu/f16-velu.h" +#include "f16-vhswish/f16-vhswish.h" +#include "f16-vlrelu/f16-vlrelu.h" +#include "f16-vneg/f16-vneg.h" +#include "f16-vrnd/f16-vrndd.h" +#include "f16-vrnd/f16-vrndne.h" +#include "f16-vrnd/f16-vrndu.h" +#include "f16-vrnd/f16-vrndz.h" +#include "f16-vrsqrt/f16-vrsqrt.h" +#include "f16-vsigmoid/f16-vsigmoid.h" +#include "f16-vsqr/f16-vsqr.h" +#include "f16-vsqrt/f16-vsqrt.h" +#include "f16-vtanh/f16-vtanh.h" +#include "f32-vabs/f32-vabs.h" +#include "f32-vclamp/f32-vclamp.h" +#include "f32-velu/f32-velu.h" +#include "f32-vgelu/f32-vgelu.h" +#include "f32-vhswish/f32-vhswish.h" +#include "f32-vlog/f32-vlog.h" +#include "f32-vlrelu/f32-vlrelu.h" +#include "f32-vneg/f32-vneg.h" +#include "f32-vrelu/f32-vrelu.h" +#include "f32-vrnd/f32-vrndd.h" +#include "f32-vrnd/f32-vrndne.h" +#include "f32-vrnd/f32-vrndu.h" +#include "f32-vrnd/f32-vrndz.h" +#include "f32-vrsqrt/f32-vrsqrt.h" +#include "f32-vsigmoid/f32-vsigmoid.h" +#include "f32-vsqr/f32-vsqr.h" +#include "f32-vsqrt/f32-vsqrt.h" +#include "f32-vtanh/f32-vtanh.h" +#include "qs8-vhswish/qs8-vhswish.h" +#include "qs8-vlrelu/qs8-vlrelu.h" +#include "qu8-vhswish/qu8-vhswish.h" +#include "qu8-vlrelu/qu8-vlrelu.h" +#include "s8-vclamp/s8-vclamp.h" +#include "u8-vclamp/u8-vclamp.h" #undef 
XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/x16-packw.cc b/bench/x16-packw.cc index 74a11000bc4..a2b699bc3fc 100644 --- a/bench/x16-packw.cc +++ b/bench/x16-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void x16_packw(benchmark::State& state, const char* net, #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ BENCHMARK_CAPTURE_BGEMM(x16_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/x16-packw/x16-packw.h" +#include "x16-packw/x16-packw.h" #undef XNN_UKERNEL diff --git a/bench/x32-packw.cc b/bench/x32-packw.cc index 1d2e9b52551..a8663f9e4cd 100644 --- a/bench/x32-packw.cc +++ b/bench/x32-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void x32_packw(benchmark::State& state, const char* net, #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ BENCHMARK_CAPTURE_BGEMM(x32_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/x32-packw/x32-packw.h" +#include "x32-packw/x32-packw.h" #undef XNN_UKERNEL diff --git a/bench/x8-lut.cc b/bench/x8-lut.cc index 29d1263e648..5cd9bafadc1 100644 --- a/bench/x8-lut.cc +++ b/bench/x8-lut.cc @@ -10,7 +10,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/lut.h" @@ -74,7 +74,7 @@ static void x8_lut( ->UseRealTime(); #endif // XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(x8_lut, avx512vbmi_vpermx2b_u64, xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u64, benchmark::utils::CheckAVX512VBMI) @@ -95,7 +95,9 @@ static void x8_lut( benchmark::utils::CheckAVX512VBMI) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(x8_lut, avx512skx_vpshufb_u64, xnn_x8_lut_ukernel__avx512skx_vpshufb_u64, benchmark::utils::CheckAVX512SKX) @@ -116,7 +118,9 @@ static void x8_lut( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(x8_lut, avx2_u32, xnn_x8_lut_ukernel__avx2_u32, benchmark::utils::CheckAVX2) diff --git a/bench/x8-packq.cc b/bench/x8-packq.cc index aa5839fae92..b69755239b3 100644 --- a/bench/x8-packq.cc +++ b/bench/x8-packq.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packq-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packq-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/packq.h" @@ -31,7 +31,7 @@ BENCHMARK_CAPTURE_BGEMM(x8_packq, ukernel##_mr4_kr1_, ukernel, arch_flags, /*mr= BENCHMARK_CAPTURE_BGEMM(x8_packq, ukernel##_mr4_kr2_, ukernel, arch_flags, /*mr=*/4, /*kr=*/2); \ BENCHMARK_CAPTURE_BGEMM(x8_packq, ukernel##_mr4_kr4_, ukernel, arch_flags, /*mr=*/4, 
/*kr=*/4); -#include "src/x8-packq/x8-packq.h" +#include "x8-packq/x8-packq.h" #undef XNN_UKERNEL diff --git a/bench/x8-packw.cc b/bench/x8-packw.cc index cbaffdd79b5..439fcf3f1c0 100644 --- a/bench/x8-packw.cc +++ b/bench/x8-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void x8_packw(benchmark::State& state, const char* net, #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ BENCHMARK_CAPTURE_BGEMM(x8_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/x8-packw/x8-packw.h" +#include "x8-packw/x8-packw.h" #undef XNN_UKERNEL #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/xN-transposec.cc b/bench/xN-transposec.cc index 981485561e5..a642dc0b38f 100644 --- a/bench/xN-transposec.cc +++ b/bench/xN-transposec.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -65,11 +65,11 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) element_size) \ ->Apply(BenchmarkKernelSize) \ ->UseRealTime(); -#include "src/x8-transposec/x8-transposec.h" -#include "src/x16-transposec/x16-transposec.h" -#include "src/x24-transposec/x24-transposec.h" -#include "src/x32-transposec/x32-transposec.h" -#include "src/x64-transposec/x64-transposec.h" +#include "x8-transposec/x8-transposec.h" +#include "x16-transposec/x16-transposec.h" +#include "x24-transposec/x24-transposec.h" +#include "x32-transposec/x32-transposec.h" +#include "x64-transposec/x64-transposec.h" #undef XNN_TRANSPOSE_UKERNEL #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/xx-transposev.cc b/bench/xx-transposev.cc index 9621a771cc0..6f5407abb1e 100644 --- a/bench/xx-transposev.cc +++ b/bench/xx-transposev.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" diff --git a/build_params.bzl b/build_params.bzl index d4f27a09099..d3deb037f5e 100644 --- a/build_params.bzl +++ b/build_params.bzl @@ -119,6 +119,10 @@ def xnnpack_configurable_defines(): ":avx512skx_enabled", ["XNN_ENABLE_AVX512SKX=1"], ["XNN_ENABLE_AVX512SKX=0"], + ) + xnnpack_select_if( + ":avx512vbmi_enabled", + ["XNN_ENABLE_AVX512VBMI=1"], + ["XNN_ENABLE_AVX512VBMI=0"], ) + xnnpack_select_if( ":avx512vnni_enabled", ["XNN_ENABLE_AVX512VNNI=1"], @@ -645,7 +649,7 @@ XNNPACK_PARAMS_FOR_ARCH = { msys_copts = ["-fno-asynchronous-unwind-tables"], ), "avx512vbmi": _create_params( - cond = "//build_config:x86", + cond = "//:avx512vbmi_enabled", gcc_x86_copts = [ "-mf16c", "-mfma", diff --git a/build_srcs.bzl b/build_srcs.bzl index 6d9746dcf6b..8ee26ea8578 100644 --- a/build_srcs.bzl +++ b/build_srcs.bzl @@ -24,7 +24,6 @@ OPERATOR_SRCS = [ "src/operators/global-average-pooling-nwc.c", "src/operators/lut-elementwise-nc.c", "src/operators/max-pooling-nhwc.c", - "src/operators/prelu-nc.c", "src/operators/reduce-nd.c", "src/operators/resize-bilinear-nchw.c", "src/operators/resize-bilinear-nhwc.c", @@ -71,7 +70,6 @@ SUBGRAPH_SRCS = [ "src/subgraph/log.c", "src/subgraph/max-pooling-2d.c", "src/subgraph/negate.c", - "src/subgraph/prelu.c", "src/subgraph/reciprocal-square-root.c", "src/subgraph/reshape-helpers.c", "src/subgraph/rope.c", @@ -120,7 +118,6 @@ 
XNNPACK_SRCS = [ "src/configs/lut32norm-config.c", "src/configs/maxpool-config.c", "src/configs/pavgpool-config.c", - "src/configs/prelu-config.c", "src/configs/raddstoreexpminusmax-config.c", "src/configs/reduce-config.c", "src/configs/rmax-config.c", diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index b4e04e06dca..7efba2d3b2a 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -74,9 +74,11 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c + src/s8-vclamp/s8-vclamp-avx2-u128.c src/s32-f32-vcvt/gen/s32-f32-vcvt-avx2.c src/s32-vmul/gen/s32-vmul-avx2.c src/s32-vmul/gen/s32-vmulc-avx2.c + src/u8-vclamp/u8-vclamp-avx2-u128.c src/u32-f32-vcvt/gen/u32-f32-vcvt-avx2.c src/x8-lut/gen/x8-lut-avx2-u128.c src/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c diff --git a/cmake/gen/avx512f_microkernels.cmake b/cmake/gen/avx512f_microkernels.cmake index c871a1bce06..32879d7660c 100644 --- a/cmake/gen/avx512f_microkernels.cmake +++ b/cmake/gen/avx512f_microkernels.cmake @@ -19,7 +19,6 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS src/f32-gemm/gen/f32-gemm-7x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c - src/f32-prelu/gen/f32-prelu-avx512f-2x16.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c64.c src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c @@ -35,7 +34,10 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-avx512f-u32.c src/f32-vbinary/gen/f32-vmul-avx512f-u32.c src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c + src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c + src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c + src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c @@ -98,7 +100,6 @@ SET(NON_PROD_AVX512F_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-5x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-6x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-8x16-minmax-avx512f-broadcast.c - src/f32-prelu/gen/f32-prelu-avx512f-2x32.c src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc2.c src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc4.c src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64.c @@ -167,12 +168,9 @@ SET(NON_PROD_AVX512F_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmul-avx512f-u16.c src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c - src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c - src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c - src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c diff --git a/cmake/gen/avx512fp16_microkernels.cmake b/cmake/gen/avx512fp16_microkernels.cmake index addfb1edaf1..7b4246ce191 100644 --- a/cmake/gen/avx512fp16_microkernels.cmake +++ b/cmake/gen/avx512fp16_microkernels.cmake @@ -26,7 +26,10 @@ SET(PROD_AVX512FP16_MICROKERNEL_SRCS 
src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c @@ -83,12 +86,9 @@ SET(NON_PROD_AVX512FP16_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c diff --git a/cmake/gen/avx512skx_microkernels.cmake b/cmake/gen/avx512skx_microkernels.cmake index f589f669149..ce5910e3a3c 100644 --- a/cmake/gen/avx512skx_microkernels.cmake +++ b/cmake/gen/avx512skx_microkernels.cmake @@ -51,6 +51,8 @@ SET(PROD_AVX512SKX_MICROKERNEL_SRCS src/qu8-igemm/gen/qu8-igemm-7x16c8-minmax-fp32-avx512skx-prfm.c src/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-u16.c src/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-u16.c + src/s8-vclamp/s8-vclamp-avx512skx-u256.c + src/u8-vclamp/u8-vclamp-avx512skx-u256.c src/x8-lut/gen/x8-lut-avx512skx-vpshufb-u64.c) SET(NON_PROD_AVX512SKX_MICROKERNEL_SRCS diff --git a/cmake/gen/avx_microkernels.cmake b/cmake/gen/avx_microkernels.cmake index eb28687e6c5..900094b180a 100644 --- a/cmake/gen/avx_microkernels.cmake +++ b/cmake/gen/avx_microkernels.cmake @@ -21,7 +21,6 @@ SET(PROD_AVX_MICROKERNEL_SRCS src/f32-gemm/gen/f32-gemm-5x16-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-1x16-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-5x16-minmax-avx-broadcast.c - src/f32-prelu/gen/f32-prelu-avx-2x16.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x16-minmax-avx-broadcast.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x16-minmax-avx-broadcast.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x16-minmax-avx-broadcast.c @@ -42,7 +41,10 @@ SET(PROD_AVX_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-avx-u16.c src/f32-vbinary/gen/f32-vmul-avx-u16.c src/f32-vbinary/gen/f32-vmulc-avx-u16.c + src/f32-vbinary/gen/f32-vprelu-avx-u16.c + src/f32-vbinary/gen/f32-vpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrdivc-avx-u16.c + src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrsubc-avx-u16.c src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c @@ -165,7 +167,7 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-6x16-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-7x8-minmax-avx-broadcast.c - src/f32-prelu/gen/f32-prelu-avx-2x8.c + src/f32-prelu/gen/f32-prelu-avx-2x16.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x16-minmax-avx-broadcast.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x16-minmax-avx-broadcast.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x16-minmax-avx-broadcast.c @@ -214,12 +216,9 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmul-avx-u8.c src/f32-vbinary/gen/f32-vmulc-avx-u8.c src/f32-vbinary/gen/f32-vprelu-avx-u8.c 
- src/f32-vbinary/gen/f32-vprelu-avx-u16.c src/f32-vbinary/gen/f32-vpreluc-avx-u8.c - src/f32-vbinary/gen/f32-vpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrdivc-avx-u8.c src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c - src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrsubc-avx-u8.c src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c diff --git a/cmake/gen/f16c_microkernels.cmake b/cmake/gen/f16c_microkernels.cmake index e478a8abc1d..1d48cc491de 100644 --- a/cmake/gen/f16c_microkernels.cmake +++ b/cmake/gen/f16c_microkernels.cmake @@ -18,7 +18,6 @@ SET(PROD_F16C_MICROKERNEL_SRCS src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c - src/f16-prelu/gen/f16-prelu-f16c-2x16.c src/f16-rminmax/f16-rmax-f16c-u32.c src/f16-vbinary/gen/f16-vadd-f16c-u16.c src/f16-vbinary/gen/f16-vaddc-f16c-u16.c @@ -30,7 +29,10 @@ SET(PROD_F16C_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vminc-f16c-u16.c src/f16-vbinary/gen/f16-vmul-f16c-u16.c src/f16-vbinary/gen/f16-vmulc-f16c-u16.c + src/f16-vbinary/gen/f16-vprelu-f16c-u16.c + src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c + src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c @@ -64,7 +66,6 @@ SET(NON_PROD_F16C_MICROKERNEL_SRCS src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c - src/f16-prelu/gen/f16-prelu-f16c-2x8.c src/f16-vbinary/gen/f16-vadd-f16c-u8.c src/f16-vbinary/gen/f16-vaddc-f16c-u8.c src/f16-vbinary/gen/f16-vdiv-f16c-u16.c @@ -76,12 +77,9 @@ SET(NON_PROD_F16C_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vmul-f16c-u8.c src/f16-vbinary/gen/f16-vmulc-f16c-u8.c src/f16-vbinary/gen/f16-vprelu-f16c-u8.c - src/f16-vbinary/gen/f16-vprelu-f16c-u16.c src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c - src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c - src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index 775480a2655..ff1f9efe773 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -41,7 +41,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/f32-maxpool/f32-maxpool-9p8x-minmax-neon-c4.c src/f32-pavgpool/f32-pavgpool-9p8x-minmax-neon-c4.c src/f32-pavgpool/f32-pavgpool-9x-minmax-neon-c4.c - src/f32-prelu/gen/f32-prelu-neon-2x8.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-lane-ld64.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-lane-ld64.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-neon-lane-ld64.c @@ -61,6 +60,9 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-neon-u8.c src/f32-vbinary/gen/f32-vmul-neon-u8.c src/f32-vbinary/gen/f32-vmulc-neon-u8.c + src/f32-vbinary/gen/f32-vprelu-neon-u8.c + src/f32-vbinary/gen/f32-vpreluc-neon-u8.c + src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c src/f32-vbinary/gen/f32-vrsubc-neon-u8.c src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c @@ -335,14 +337,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/f32-ppmm/gen/f32-ppmm-4x16-minmax-neon.c 
src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon-prfm.c src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon.c - src/f32-prelu/gen/f32-prelu-neon-1x4.c - src/f32-prelu/gen/f32-prelu-neon-1x8.c - src/f32-prelu/gen/f32-prelu-neon-1x16.c - src/f32-prelu/gen/f32-prelu-neon-2x4.c - src/f32-prelu/gen/f32-prelu-neon-2x16.c - src/f32-prelu/gen/f32-prelu-neon-4x4.c - src/f32-prelu/gen/f32-prelu-neon-4x8.c - src/f32-prelu/gen/f32-prelu-neon-4x16.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-dup-ld64.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-dup-ld64.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-neon-lane-ld64.c @@ -409,11 +403,8 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmul-neon-u4.c src/f32-vbinary/gen/f32-vmulc-neon-u4.c src/f32-vbinary/gen/f32-vprelu-neon-u4.c - src/f32-vbinary/gen/f32-vprelu-neon-u8.c src/f32-vbinary/gen/f32-vpreluc-neon-u4.c - src/f32-vbinary/gen/f32-vpreluc-neon-u8.c src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c - src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c src/f32-vbinary/gen/f32-vrsubc-neon-u4.c src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c @@ -721,7 +712,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c src/qs8-requantization/qs8-requantization-fp32-neon.c src/qs8-requantization/qs8-requantization-gemmlowp-neon.c - src/qs8-requantization/qs8-requantization-rndna-neon.c src/qs8-requantization/qs8-requantization-rndnu-neon-mull.c src/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c src/qs8-rsum/gen/qs8-rsum-neon-u16.c @@ -837,7 +827,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c src/qu8-requantization/qu8-requantization-fp32-neon.c src/qu8-requantization/qu8-requantization-gemmlowp-neon.c - src/qu8-requantization/qu8-requantization-rndna-neon.c src/qu8-rsum/gen/qu8-rsum-neon-u16.c src/qu8-rsum/gen/qu8-rsum-neon-u64-acc2.c src/qu8-rsum/gen/qu8-rsum-neon-u64-acc4.c diff --git a/cmake/gen/neonfp16arith_microkernels.cmake b/cmake/gen/neonfp16arith_microkernels.cmake index 4cc50373bae..aa8ca503c40 100644 --- a/cmake/gen/neonfp16arith_microkernels.cmake +++ b/cmake/gen/neonfp16arith_microkernels.cmake @@ -40,7 +40,6 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-maxpool/f16-maxpool-9p8x-minmax-neonfp16arith-c8.c src/f16-pavgpool/f16-pavgpool-9p8x-minmax-neonfp16arith-c8.c src/f16-pavgpool/f16-pavgpool-9x-minmax-neonfp16arith-c8.c - src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c @@ -54,6 +53,9 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c @@ -194,7 +196,6 @@ SET(NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-igemm/gen/f16-igemm-4x16-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/f16-igemm-8x8-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/f16-igemm-8x16-minmax-neonfp16arith-ld64.c - src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c 
src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c @@ -271,11 +272,8 @@ SET(NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 75b8e94b6c6..5d13ebb9fcd 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -62,6 +62,8 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c src/x32-transposec/gen/x32-transposec-4x4-rvv.c src/x32-transposec/gen/x32-transposec-8x8-rvv.c @@ -190,6 +192,12 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u8.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index b4e3b340933..d7f3e8aafe0 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -70,7 +70,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/f32-maxpool/f32-maxpool-9p8x-minmax-scalar-c1.c src/f32-pavgpool/f32-pavgpool-9p8x-minmax-scalar-c1.c src/f32-pavgpool/f32-pavgpool-9x-minmax-scalar-c1.c - src/f32-prelu/gen/f32-prelu-scalar-2x4.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-scalar.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-scalar.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-scalar.c @@ -99,7 +98,10 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-scalar-u8.c src/f32-vbinary/gen/f32-vmul-scalar-u8.c src/f32-vbinary/gen/f32-vmulc-scalar-u8.c + src/f32-vbinary/gen/f32-vprelu-scalar-u8.c + src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c + src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c @@ -383,7 +385,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-ppmm/gen/f32-ppmm-3x3-minmax-scalar.c src/f32-ppmm/gen/f32-ppmm-4x2-minmax-scalar.c src/f32-ppmm/gen/f32-ppmm-4x4-minmax-scalar.c - src/f32-prelu/gen/f32-prelu-scalar-2x1.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-scalar.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-scalar.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-scalar.c @@ -484,18 +485,15 @@ 
SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vprelu-scalar-u1.c src/f32-vbinary/gen/f32-vprelu-scalar-u2.c src/f32-vbinary/gen/f32-vprelu-scalar-u4.c - src/f32-vbinary/gen/f32-vprelu-scalar-u8.c src/f32-vbinary/gen/f32-vpreluc-scalar-u1.c src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c - src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c - src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c @@ -724,9 +722,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qs8-requantization/qs8-requantization-fp32-scalar-fmagic.c src/qs8-requantization/qs8-requantization-fp32-scalar-lrintf.c src/qs8-requantization/qs8-requantization-gemmlowp-scalar.c - src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c - src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c - src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c src/qs8-requantization/qs8-requantization-rndnu-scalar.c src/qs8-rsum/gen/qs8-rsum-scalar-u1.c src/qs8-rsum/gen/qs8-rsum-scalar-u2.c @@ -863,9 +858,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qu8-requantization/qu8-requantization-fp32-scalar-fmagic.c src/qu8-requantization/qu8-requantization-fp32-scalar-lrintf.c src/qu8-requantization/qu8-requantization-gemmlowp-scalar.c - src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c - src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c - src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c src/qu8-rsum/gen/qu8-rsum-scalar-u1.c src/qu8-rsum/gen/qu8-rsum-scalar-u2.c src/qu8-vadd/gen/qu8-vadd-minmax-scalar-u2.c diff --git a/cmake/gen/sse2_microkernels.cmake b/cmake/gen/sse2_microkernels.cmake index 0868611bc1f..64623e83e6a 100644 --- a/cmake/gen/sse2_microkernels.cmake +++ b/cmake/gen/sse2_microkernels.cmake @@ -17,10 +17,12 @@ SET(PROD_SSE2_MICROKERNEL_SRCS src/f32-argmaxpool/f32-argmaxpool-9p8x-sse2-c4.c src/f32-argmaxpool/f32-argmaxpool-9x-sse2-c4.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u16.c - src/f32-prelu/gen/f32-prelu-sse2-2x8.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c + src/f32-vbinary/gen/f32-vprelu-sse2-u8.c + src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c + src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c src/f32-vcopysign/gen/f32-vcopysign-sse2.c src/f32-vcopysign/gen/f32-vcopysignc-sse2.c src/f32-vcopysign/gen/f32-vrcopysignc-sse2.c @@ -118,7 +120,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u8.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u24.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u32.c - src/f32-prelu/gen/f32-prelu-sse2-2x4.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c @@ -129,11 +130,8 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c src/f32-vbinary/gen/f32-vprelu-sse2-u4.c - src/f32-vbinary/gen/f32-vprelu-sse2-u8.c 
src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c - src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c src/f32-vbinary/gen/f32-vrpreluc-sse2-u4.c - src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u4.c src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u8.c src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u16.c @@ -272,7 +270,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c src/qs8-requantization/qs8-requantization-fp32-sse2.c src/qs8-requantization/qs8-requantization-gemmlowp-sse2.c - src/qs8-requantization/qs8-requantization-rndna-sse2.c src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u16.c src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u24.c src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u32.c @@ -344,7 +341,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c src/qu8-requantization/qu8-requantization-fp32-sse2.c src/qu8-requantization/qu8-requantization-gemmlowp-sse2.c - src/qu8-requantization/qu8-requantization-rndna-sse2.c src/qu8-rsum/gen/qu8-rsum-sse2-u16.c src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c diff --git a/cmake/gen/sse41_microkernels.cmake b/cmake/gen/sse41_microkernels.cmake index 0ea18c62396..6bf14b87fbb 100644 --- a/cmake/gen/sse41_microkernels.cmake +++ b/cmake/gen/sse41_microkernels.cmake @@ -12,7 +12,6 @@ SET(PROD_SSE41_MICROKERNEL_SRCS src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u8.c - src/f32-prelu/gen/f32-prelu-sse41-2x8.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-sse41-dup.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-sse41-dup.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-sse41-dup.c @@ -83,7 +82,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u16.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u24.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u32.c - src/f32-prelu/gen/f32-prelu-sse41-2x4.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x8-minmax-sse41-dup.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-sse41-dup.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-sse41-dup.c @@ -280,7 +278,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c src/qs8-requantization/qs8-requantization-fp32-sse41.c src/qs8-requantization/qs8-requantization-gemmlowp-sse41.c - src/qs8-requantization/qs8-requantization-rndna-sse41.c src/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c src/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u16.c @@ -374,7 +371,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c src/qu8-requantization/qu8-requantization-gemmlowp-sse41.c - src/qu8-requantization/qu8-requantization-rndna-sse41.c src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-u16.c src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u8.c src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u16.c diff --git a/cmake/gen/sse_microkernels.cmake b/cmake/gen/sse_microkernels.cmake index befa0bb3246..a682a18f490 100644 --- a/cmake/gen/sse_microkernels.cmake +++ b/cmake/gen/sse_microkernels.cmake @@ -180,8 +180,6 @@ SET(NON_PROD_SSE_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8-minmax-sse-load1.c src/f32-igemm/gen/f32-igemm-6x8s4-minmax-sse.c src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c - src/f32-prelu/gen/f32-prelu-sse-2x4.c - 
src/f32-prelu/gen/f32-prelu-sse-2x8.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c src/f32-rminmax/gen/f32-rmax-sse-u4.c diff --git a/cmake/gen/ssse3_microkernels.cmake b/cmake/gen/ssse3_microkernels.cmake index 7cb398b68eb..6309ebf281e 100644 --- a/cmake/gen/ssse3_microkernels.cmake +++ b/cmake/gen/ssse3_microkernels.cmake @@ -39,7 +39,6 @@ SET(NON_PROD_SSSE3_MICROKERNEL_SRCS src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c src/qs8-requantization/qs8-requantization-gemmlowp-ssse3.c - src/qs8-requantization/qs8-requantization-rndna-ssse3.c src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c @@ -52,7 +51,6 @@ SET(NON_PROD_SSSE3_MICROKERNEL_SRCS src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c src/qu8-requantization/qu8-requantization-gemmlowp-ssse3.c - src/qu8-requantization/qu8-requantization-rndna-ssse3.c src/qu8-vcvt/gen/qu8-vcvt-ssse3-u16.c src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c diff --git a/cmake/gen/wasm_microkernels.cmake b/cmake/gen/wasm_microkernels.cmake index de89cd83440..f60129cee2c 100644 --- a/cmake/gen/wasm_microkernels.cmake +++ b/cmake/gen/wasm_microkernels.cmake @@ -32,7 +32,6 @@ SET(PROD_WASM_MICROKERNEL_SRCS src/f32-maxpool/f32-maxpool-9p8x-minmax-wasm-c1.c src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasm-c1.c src/f32-pavgpool/f32-pavgpool-9x-minmax-wasm-c1.c - src/f32-prelu/gen/f32-prelu-wasm-2x4.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-wasm.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-wasm.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-wasm.c @@ -50,7 +49,10 @@ SET(PROD_WASM_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-wasm-u8.c src/f32-vbinary/gen/f32-vmul-wasm-u8.c src/f32-vbinary/gen/f32-vmulc-wasm-u8.c + src/f32-vbinary/gen/f32-vprelu-wasm-u8.c + src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c + src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c src/f32-vbinary/gen/f32-vsub-wasm-u8.c src/f32-vbinary/gen/f32-vsubc-wasm-u8.c @@ -110,7 +112,6 @@ SET(NON_PROD_WASM_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-2x4-minmax-wasm.c src/f32-igemm/gen/f32-igemm-2x4-relu-wasm.c src/f32-igemm/gen/f32-igemm-4x2-relu-wasm.c - src/f32-prelu/gen/f32-prelu-wasm-2x1.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-wasm.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-wasm.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-wasm.c @@ -172,18 +173,15 @@ SET(NON_PROD_WASM_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vprelu-wasm-u1.c src/f32-vbinary/gen/f32-vprelu-wasm-u2.c src/f32-vbinary/gen/f32-vprelu-wasm-u4.c - src/f32-vbinary/gen/f32-vprelu-wasm-u8.c src/f32-vbinary/gen/f32-vpreluc-wasm-u1.c src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c - src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c - src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c diff --git a/cmake/gen/wasmrelaxedsimd_microkernels.cmake 
b/cmake/gen/wasmrelaxedsimd_microkernels.cmake index 09596cbf072..509ffa887b5 100644 --- a/cmake/gen/wasmrelaxedsimd_microkernels.cmake +++ b/cmake/gen/wasmrelaxedsimd_microkernels.cmake @@ -51,8 +51,6 @@ SET(PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c src/f32-igemm/gen/f32-igemm-6x8-relu-wasmrelaxedsimd-fma-splat.c src/f32-igemm/gen/f32-igemm-6x8-wasmrelaxedsimd-fma-splat.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmrelaxedsimd-fma-loadsplat.c @@ -284,22 +282,6 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmrelaxedsimd.c src/f32-igemm/gen/f32-igemm-6x8s4-relu-wasmrelaxedsimd-fma.c src/f32-igemm/gen/f32-igemm-6x8s4-wasmrelaxedsimd-fma.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8s4-minmax-wasmrelaxedsimd-fma.c @@ -513,6 +495,7 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmusdot.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot-u2.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c + src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c @@ -601,6 +584,7 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c + src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u16.c src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u32-acc2.c src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u64-acc2.c diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake index 103c2e67711..ba54c415f58 100644 --- a/cmake/gen/wasmsimd_microkernels.cmake +++ b/cmake/gen/wasmsimd_microkernels.cmake @@ -100,8 +100,6 @@ 
SET(PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasmsimd-x86-c4.c src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-arm-c4.c src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-x86-c4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-splat.c @@ -135,7 +133,10 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c + src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c + src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c + src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c @@ -568,22 +569,6 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8s4-wasmsimd.c src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-arm-splat.c src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-x86-splat.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-loadsplat.c @@ -757,15 +742,12 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index fc68f74797d..f0bb53976ff 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -70,9 +70,11 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c", "src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c", "src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c", + "src/s8-vclamp/s8-vclamp-avx2-u128.c", 
"src/s32-f32-vcvt/gen/s32-f32-vcvt-avx2.c", "src/s32-vmul/gen/s32-vmul-avx2.c", "src/s32-vmul/gen/s32-vmulc-avx2.c", + "src/u8-vclamp/u8-vclamp-avx2-u128.c", "src/u32-f32-vcvt/gen/u32-f32-vcvt-avx2.c", "src/x8-lut/gen/x8-lut-avx2-u128.c", "src/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c", diff --git a/gen/avx512f_microkernels.bzl b/gen/avx512f_microkernels.bzl index 38bb3ba21c5..f12f928fb23 100644 --- a/gen/avx512f_microkernels.bzl +++ b/gen/avx512f_microkernels.bzl @@ -15,7 +15,6 @@ PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-gemm/gen/f32-gemm-7x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx512f-2x16.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c64.c", "src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c", @@ -31,7 +30,10 @@ PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vmul-avx512f-u32.c", "src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c", @@ -95,7 +97,6 @@ NON_PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-5x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-6x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-8x16-minmax-avx512f-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx512f-2x32.c", "src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc2.c", "src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc4.c", "src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64.c", @@ -164,12 +165,9 @@ NON_PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmul-avx512f-u16.c", "src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c", "src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c", diff --git a/gen/avx512fp16_microkernels.bzl b/gen/avx512fp16_microkernels.bzl index aeb7b028a3f..f1c8d65af13 100644 --- a/gen/avx512fp16_microkernels.bzl +++ b/gen/avx512fp16_microkernels.bzl @@ -22,7 +22,10 @@ PROD_AVX512FP16_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c", @@ -80,12 +83,9 @@ NON_PROD_AVX512FP16_MICROKERNEL_SRCS = [ 
"src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c", diff --git a/gen/avx512skx_microkernels.bzl b/gen/avx512skx_microkernels.bzl index fe29a4c806c..b1a653ae98a 100644 --- a/gen/avx512skx_microkernels.bzl +++ b/gen/avx512skx_microkernels.bzl @@ -47,6 +47,8 @@ PROD_AVX512SKX_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/qu8-igemm-7x16c8-minmax-fp32-avx512skx-prfm.c", "src/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-u16.c", "src/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-u16.c", + "src/s8-vclamp/s8-vclamp-avx512skx-u256.c", + "src/u8-vclamp/u8-vclamp-avx512skx-u256.c", "src/x8-lut/gen/x8-lut-avx512skx-vpshufb-u64.c", ] diff --git a/gen/avx_microkernels.bzl b/gen/avx_microkernels.bzl index 2794ad08fa7..04d649297cf 100644 --- a/gen/avx_microkernels.bzl +++ b/gen/avx_microkernels.bzl @@ -17,7 +17,6 @@ PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-gemm/gen/f32-gemm-5x16-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-1x16-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-5x16-minmax-avx-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx-2x16.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x16-minmax-avx-broadcast.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x16-minmax-avx-broadcast.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x16-minmax-avx-broadcast.c", @@ -38,7 +37,10 @@ PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-avx-u16.c", "src/f32-vbinary/gen/f32-vmul-avx-u16.c", "src/f32-vbinary/gen/f32-vmulc-avx-u16.c", + "src/f32-vbinary/gen/f32-vprelu-avx-u16.c", + "src/f32-vbinary/gen/f32-vpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrdivc-avx-u16.c", + "src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrsubc-avx-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c", @@ -162,7 +164,7 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-6x16-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-7x8-minmax-avx-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx-2x8.c", + "src/f32-prelu/gen/f32-prelu-avx-2x16.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x16-minmax-avx-broadcast.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x16-minmax-avx-broadcast.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x16-minmax-avx-broadcast.c", @@ -211,12 +213,9 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmul-avx-u8.c", "src/f32-vbinary/gen/f32-vmulc-avx-u8.c", "src/f32-vbinary/gen/f32-vprelu-avx-u8.c", - "src/f32-vbinary/gen/f32-vprelu-avx-u16.c", "src/f32-vbinary/gen/f32-vpreluc-avx-u8.c", - "src/f32-vbinary/gen/f32-vpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrdivc-avx-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c", - "src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrsubc-avx-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c", diff --git a/gen/f16c_microkernels.bzl b/gen/f16c_microkernels.bzl index 383bdde3cd5..5e2a145eb6c 100644 --- 
a/gen/f16c_microkernels.bzl +++ b/gen/f16c_microkernels.bzl @@ -14,7 +14,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c", "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c", "src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c", - "src/f16-prelu/gen/f16-prelu-f16c-2x16.c", "src/f16-rminmax/f16-rmax-f16c-u32.c", "src/f16-vbinary/gen/f16-vadd-f16c-u16.c", "src/f16-vbinary/gen/f16-vaddc-f16c-u16.c", @@ -26,7 +25,10 @@ PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vminc-f16c-u16.c", "src/f16-vbinary/gen/f16-vmul-f16c-u16.c", "src/f16-vbinary/gen/f16-vmulc-f16c-u16.c", + "src/f16-vbinary/gen/f16-vprelu-f16c-u16.c", + "src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c", + "src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c", "src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c", "src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c", @@ -61,7 +63,6 @@ NON_PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c", "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c", "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c", - "src/f16-prelu/gen/f16-prelu-f16c-2x8.c", "src/f16-vbinary/gen/f16-vadd-f16c-u8.c", "src/f16-vbinary/gen/f16-vaddc-f16c-u8.c", "src/f16-vbinary/gen/f16-vdiv-f16c-u16.c", @@ -73,12 +74,9 @@ NON_PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vmul-f16c-u8.c", "src/f16-vbinary/gen/f16-vmulc-f16c-u8.c", "src/f16-vbinary/gen/f16-vprelu-f16c-u8.c", - "src/f16-vbinary/gen/f16-vprelu-f16c-u16.c", "src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c", - "src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c", - "src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c", "src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c", "src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c", diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index ef111a9a5f7..66370894399 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -37,7 +37,6 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-maxpool/f32-maxpool-9p8x-minmax-neon-c4.c", "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-neon-c4.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-neon-c4.c", - "src/f32-prelu/gen/f32-prelu-neon-2x8.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-lane-ld64.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-lane-ld64.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-neon-lane-ld64.c", @@ -57,6 +56,9 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-neon-u8.c", "src/f32-vbinary/gen/f32-vmul-neon-u8.c", "src/f32-vbinary/gen/f32-vmulc-neon-u8.c", + "src/f32-vbinary/gen/f32-vprelu-neon-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-neon-u8.c", + "src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c", "src/f32-vbinary/gen/f32-vrsubc-neon-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c", @@ -332,14 +334,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-ppmm/gen/f32-ppmm-4x16-minmax-neon.c", "src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon-prfm.c", "src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon.c", - "src/f32-prelu/gen/f32-prelu-neon-1x4.c", - "src/f32-prelu/gen/f32-prelu-neon-1x8.c", - "src/f32-prelu/gen/f32-prelu-neon-1x16.c", - "src/f32-prelu/gen/f32-prelu-neon-2x4.c", - "src/f32-prelu/gen/f32-prelu-neon-2x16.c", - "src/f32-prelu/gen/f32-prelu-neon-4x4.c", - "src/f32-prelu/gen/f32-prelu-neon-4x8.c", - 
"src/f32-prelu/gen/f32-prelu-neon-4x16.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-dup-ld64.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-dup-ld64.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-neon-lane-ld64.c", @@ -406,11 +400,8 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmul-neon-u4.c", "src/f32-vbinary/gen/f32-vmulc-neon-u4.c", "src/f32-vbinary/gen/f32-vprelu-neon-u4.c", - "src/f32-vbinary/gen/f32-vprelu-neon-u8.c", "src/f32-vbinary/gen/f32-vpreluc-neon-u4.c", - "src/f32-vbinary/gen/f32-vpreluc-neon-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c", "src/f32-vbinary/gen/f32-vrsubc-neon-u4.c", "src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c", "src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c", @@ -718,7 +709,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c", "src/qs8-requantization/qs8-requantization-fp32-neon.c", "src/qs8-requantization/qs8-requantization-gemmlowp-neon.c", - "src/qs8-requantization/qs8-requantization-rndna-neon.c", "src/qs8-requantization/qs8-requantization-rndnu-neon-mull.c", "src/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c", "src/qs8-rsum/gen/qs8-rsum-neon-u16.c", @@ -834,7 +824,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c", "src/qu8-requantization/qu8-requantization-fp32-neon.c", "src/qu8-requantization/qu8-requantization-gemmlowp-neon.c", - "src/qu8-requantization/qu8-requantization-rndna-neon.c", "src/qu8-rsum/gen/qu8-rsum-neon-u16.c", "src/qu8-rsum/gen/qu8-rsum-neon-u64-acc2.c", "src/qu8-rsum/gen/qu8-rsum-neon-u64-acc4.c", diff --git a/gen/neonfp16arith_microkernels.bzl b/gen/neonfp16arith_microkernels.bzl index 7e0c07c3ca3..018ca23cdfd 100644 --- a/gen/neonfp16arith_microkernels.bzl +++ b/gen/neonfp16arith_microkernels.bzl @@ -36,7 +36,6 @@ PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-maxpool/f16-maxpool-9p8x-minmax-neonfp16arith-c8.c", "src/f16-pavgpool/f16-pavgpool-9p8x-minmax-neonfp16arith-c8.c", "src/f16-pavgpool/f16-pavgpool-9x-minmax-neonfp16arith-c8.c", - "src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c", "src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c", @@ -50,6 +49,9 @@ PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c", @@ -191,7 +193,6 @@ NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-igemm/gen/f16-igemm-4x16-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/f16-igemm-8x8-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/f16-igemm-8x16-minmax-neonfp16arith-ld64.c", - "src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c", @@ -268,11 +269,8 @@ NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c", 
"src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c", diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 949c9f387eb..8790b58a8e4 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -58,6 +58,8 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c", "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c", "src/x32-transposec/gen/x32-transposec-4x4-rvv.c", "src/x32-transposec/gen/x32-transposec-8x8-rvv.c", @@ -187,6 +189,12 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u8.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index a7c2bba6c7c..b89b83c3211 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -66,7 +66,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-maxpool/f32-maxpool-9p8x-minmax-scalar-c1.c", "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-scalar-c1.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-scalar-c1.c", - "src/f32-prelu/gen/f32-prelu-scalar-2x4.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-scalar.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-scalar.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-scalar.c", @@ -95,7 +94,10 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-scalar-u8.c", "src/f32-vbinary/gen/f32-vmul-scalar-u8.c", "src/f32-vbinary/gen/f32-vmulc-scalar-u8.c", + "src/f32-vbinary/gen/f32-vprelu-scalar-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c", + "src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c", @@ -380,7 +382,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-ppmm/gen/f32-ppmm-3x3-minmax-scalar.c", "src/f32-ppmm/gen/f32-ppmm-4x2-minmax-scalar.c", "src/f32-ppmm/gen/f32-ppmm-4x4-minmax-scalar.c", - "src/f32-prelu/gen/f32-prelu-scalar-2x1.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-scalar.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-scalar.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-scalar.c", @@ -481,18 +482,15 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vprelu-scalar-u1.c", "src/f32-vbinary/gen/f32-vprelu-scalar-u2.c", "src/f32-vbinary/gen/f32-vprelu-scalar-u4.c", - 
"src/f32-vbinary/gen/f32-vprelu-scalar-u8.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u1.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c", @@ -721,9 +719,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-requantization/qs8-requantization-fp32-scalar-fmagic.c", "src/qs8-requantization/qs8-requantization-fp32-scalar-lrintf.c", "src/qs8-requantization/qs8-requantization-gemmlowp-scalar.c", - "src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c", - "src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c", - "src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c", "src/qs8-requantization/qs8-requantization-rndnu-scalar.c", "src/qs8-rsum/gen/qs8-rsum-scalar-u1.c", "src/qs8-rsum/gen/qs8-rsum-scalar-u2.c", @@ -860,9 +855,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-requantization/qu8-requantization-fp32-scalar-fmagic.c", "src/qu8-requantization/qu8-requantization-fp32-scalar-lrintf.c", "src/qu8-requantization/qu8-requantization-gemmlowp-scalar.c", - "src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c", - "src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c", - "src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c", "src/qu8-rsum/gen/qu8-rsum-scalar-u1.c", "src/qu8-rsum/gen/qu8-rsum-scalar-u2.c", "src/qu8-vadd/gen/qu8-vadd-minmax-scalar-u2.c", diff --git a/gen/sse2_microkernels.bzl b/gen/sse2_microkernels.bzl index d53103095ce..86cbc15edeb 100644 --- a/gen/sse2_microkernels.bzl +++ b/gen/sse2_microkernels.bzl @@ -13,10 +13,12 @@ PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-argmaxpool/f32-argmaxpool-9p8x-sse2-c4.c", "src/f32-argmaxpool/f32-argmaxpool-9x-sse2-c4.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u16.c", - "src/f32-prelu/gen/f32-prelu-sse2-2x8.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c", + "src/f32-vbinary/gen/f32-vprelu-sse2-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c", + "src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c", "src/f32-vcopysign/gen/f32-vcopysign-sse2.c", "src/f32-vcopysign/gen/f32-vcopysignc-sse2.c", "src/f32-vcopysign/gen/f32-vrcopysignc-sse2.c", @@ -115,7 +117,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u8.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u24.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u32.c", - "src/f32-prelu/gen/f32-prelu-sse2-2x4.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c", @@ -126,11 +127,8 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c", "src/f32-vbinary/gen/f32-vprelu-sse2-u4.c", - "src/f32-vbinary/gen/f32-vprelu-sse2-u8.c", "src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c", - 
"src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-sse2-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c", "src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u4.c", "src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u8.c", "src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u16.c", @@ -269,7 +267,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c", "src/qs8-requantization/qs8-requantization-fp32-sse2.c", "src/qs8-requantization/qs8-requantization-gemmlowp-sse2.c", - "src/qs8-requantization/qs8-requantization-rndna-sse2.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u16.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u24.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u32.c", @@ -341,7 +338,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c", "src/qu8-requantization/qu8-requantization-fp32-sse2.c", "src/qu8-requantization/qu8-requantization-gemmlowp-sse2.c", - "src/qu8-requantization/qu8-requantization-rndna-sse2.c", "src/qu8-rsum/gen/qu8-rsum-sse2-u16.c", "src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c", "src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c", diff --git a/gen/sse41_microkernels.bzl b/gen/sse41_microkernels.bzl index 50a2cbe1c7d..8ee156de2e2 100644 --- a/gen/sse41_microkernels.bzl +++ b/gen/sse41_microkernels.bzl @@ -8,7 +8,6 @@ Auto-generated file. Do not edit! PROD_SSE41_MICROKERNEL_SRCS = [ "src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u8.c", - "src/f32-prelu/gen/f32-prelu-sse41-2x8.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-sse41-dup.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-sse41-dup.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-sse41-dup.c", @@ -80,7 +79,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u16.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u24.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u32.c", - "src/f32-prelu/gen/f32-prelu-sse41-2x4.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x8-minmax-sse41-dup.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-sse41-dup.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-sse41-dup.c", @@ -277,7 +275,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c", "src/qs8-requantization/qs8-requantization-fp32-sse41.c", "src/qs8-requantization/qs8-requantization-gemmlowp-sse41.c", - "src/qs8-requantization/qs8-requantization-rndna-sse41.c", "src/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c", "src/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u16.c", @@ -371,7 +368,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c", "src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c", "src/qu8-requantization/qu8-requantization-gemmlowp-sse41.c", - "src/qu8-requantization/qu8-requantization-rndna-sse41.c", "src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-u16.c", "src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u8.c", "src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u16.c", diff --git a/gen/sse_microkernels.bzl b/gen/sse_microkernels.bzl index 9364c3c4623..e912651df93 100644 --- a/gen/sse_microkernels.bzl +++ b/gen/sse_microkernels.bzl @@ -177,8 +177,6 @@ NON_PROD_SSE_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8-minmax-sse-load1.c", "src/f32-igemm/gen/f32-igemm-6x8s4-minmax-sse.c", 
"src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c", - "src/f32-prelu/gen/f32-prelu-sse-2x4.c", - "src/f32-prelu/gen/f32-prelu-sse-2x8.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c", "src/f32-rminmax/gen/f32-rmax-sse-u4.c", diff --git a/gen/ssse3_microkernels.bzl b/gen/ssse3_microkernels.bzl index 3175ba4c9ea..6e756912d54 100644 --- a/gen/ssse3_microkernels.bzl +++ b/gen/ssse3_microkernels.bzl @@ -36,7 +36,6 @@ NON_PROD_SSSE3_MICROKERNEL_SRCS = [ "src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c", "src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c", "src/qs8-requantization/qs8-requantization-gemmlowp-ssse3.c", - "src/qs8-requantization/qs8-requantization-rndna-ssse3.c", "src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c", "src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c", "src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c", @@ -49,7 +48,6 @@ NON_PROD_SSSE3_MICROKERNEL_SRCS = [ "src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c", "src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c", "src/qu8-requantization/qu8-requantization-gemmlowp-ssse3.c", - "src/qu8-requantization/qu8-requantization-rndna-ssse3.c", "src/qu8-vcvt/gen/qu8-vcvt-ssse3-u16.c", "src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c", "src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c", diff --git a/gen/wasm_microkernels.bzl b/gen/wasm_microkernels.bzl index 79f5d620c30..a5f1b10aeae 100644 --- a/gen/wasm_microkernels.bzl +++ b/gen/wasm_microkernels.bzl @@ -28,7 +28,6 @@ PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-maxpool/f32-maxpool-9p8x-minmax-wasm-c1.c", "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasm-c1.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-wasm-c1.c", - "src/f32-prelu/gen/f32-prelu-wasm-2x4.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-wasm.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-wasm.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-wasm.c", @@ -46,7 +45,10 @@ PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-wasm-u8.c", "src/f32-vbinary/gen/f32-vmul-wasm-u8.c", "src/f32-vbinary/gen/f32-vmulc-wasm-u8.c", + "src/f32-vbinary/gen/f32-vprelu-wasm-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c", + "src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c", "src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c", "src/f32-vbinary/gen/f32-vsub-wasm-u8.c", "src/f32-vbinary/gen/f32-vsubc-wasm-u8.c", @@ -107,7 +109,6 @@ NON_PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-2x4-minmax-wasm.c", "src/f32-igemm/gen/f32-igemm-2x4-relu-wasm.c", "src/f32-igemm/gen/f32-igemm-4x2-relu-wasm.c", - "src/f32-prelu/gen/f32-prelu-wasm-2x1.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-wasm.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-wasm.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-wasm.c", @@ -169,18 +170,15 @@ NON_PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vprelu-wasm-u1.c", "src/f32-vbinary/gen/f32-vprelu-wasm-u2.c", "src/f32-vbinary/gen/f32-vprelu-wasm-u4.c", - "src/f32-vbinary/gen/f32-vprelu-wasm-u8.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u1.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c", - "src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c", 
"src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c", "src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c", "src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c", diff --git a/gen/wasmrelaxedsimd_microkernels.bzl b/gen/wasmrelaxedsimd_microkernels.bzl index b2705786dcb..e4a4a042d83 100644 --- a/gen/wasmrelaxedsimd_microkernels.bzl +++ b/gen/wasmrelaxedsimd_microkernels.bzl @@ -47,8 +47,6 @@ PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c", "src/f32-igemm/gen/f32-igemm-6x8-relu-wasmrelaxedsimd-fma-splat.c", "src/f32-igemm/gen/f32-igemm-6x8-wasmrelaxedsimd-fma-splat.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmrelaxedsimd-fma-loadsplat.c", @@ -281,22 +279,6 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmrelaxedsimd.c", "src/f32-igemm/gen/f32-igemm-6x8s4-relu-wasmrelaxedsimd-fma.c", "src/f32-igemm/gen/f32-igemm-6x8s4-wasmrelaxedsimd-fma.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8s4-minmax-wasmrelaxedsimd-fma.c", @@ -510,6 +492,7 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmusdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot-u2.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c", + "src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c", @@ -598,6 +581,7 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c", + "src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c", "src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u16.c", "src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u32-acc2.c", 
"src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u64-acc2.c", diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index b84cc5ec87f..1146bcac755 100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -96,8 +96,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasmsimd-x86-c4.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-arm-c4.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-x86-c4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-splat.c", @@ -131,7 +129,10 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c", + "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c", + "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c", + "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c", @@ -565,22 +566,6 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8s4-wasmsimd.c", "src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-arm-splat.c", "src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-x86-splat.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-loadsplat.c", @@ -754,15 +739,12 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c", diff --git 
a/include/xnnpack.h b/include/xnnpack.h index f23338ac8c0..801cbcea64c 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -997,6 +997,7 @@ enum xnn_binary_operator { xnn_binary_minimum, xnn_binary_copysign, xnn_binary_squared_difference, + xnn_binary_prelu, }; struct xnn_binary_params { @@ -1650,7 +1651,7 @@ enum xnn_status xnn_define_static_resize_bilinear_2d( /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph /// with [N, H, W, channels] dimensions. /// @param flags - binary features of the PReLU Node. No supported flags are currently defined. -enum xnn_status xnn_define_prelu( +XNN_DEPRECATED enum xnn_status xnn_define_prelu( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t slope_id, @@ -1660,7 +1661,7 @@ enum xnn_status xnn_define_prelu( /// Define a RoPE (Rotary Positional Embeddings) Node and add it to a Subgraph. /// /// @param subgraph - a Subgraph object that will own the created Node. -/// @param max_tokens - maximum possible number of tokens (maximum sequence length) of the input/output tensors. +/// @param max_tokens - deprecated. /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph /// with [batch, tokens, heads, channels] dimensions. /// @param weights_id - Value ID for the weights tensor. The weights tensor must be a 2D tensor defined in the @@ -2948,8 +2949,6 @@ enum xnn_status xnn_run_convert_nc_f32_f16( enum xnn_status xnn_create_convert_nc_f32_qs8( float output_scale, int8_t output_zero_point, - int8_t output_min, - int8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out); @@ -2981,8 +2980,6 @@ enum xnn_status xnn_run_convert_nc_f32_qs8( enum xnn_status xnn_create_convert_nc_f32_qu8( float output_scale, uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out); @@ -5094,48 +5091,6 @@ enum xnn_status xnn_run_negate_nc_f32( uint32_t flags, pthreadpool_t threadpool); -enum xnn_status xnn_create_prelu_nc_f16( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const void* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out); - -enum xnn_status xnn_reshape_prelu_nc_f16( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_prelu_nc_f16( - xnn_operator_t prelu_op, - const void* input, - void* output); - -enum xnn_status xnn_create_prelu_nc_f32( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const float* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out); - -enum xnn_status xnn_reshape_prelu_nc_f32( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_prelu_nc_f32( - xnn_operator_t prelu_op, - const float* input, - float* output); - enum xnn_status xnn_create_resize_bilinear2d_nchw_f32( size_t output_height, size_t output_width, @@ -5275,7 +5230,6 @@ enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8( uint8_t* output); enum xnn_status xnn_create_rope_nthc_f16( - size_t max_tokens, uint32_t flags, xnn_operator_t* rope_op_out); @@ -5294,7 +5248,6 @@ enum xnn_status xnn_setup_rope_nthc_f16( void* output); enum xnn_status xnn_create_rope_nthc_f32( - size_t max_tokens, uint32_t flags, xnn_operator_t* 
rope_op_out); diff --git a/scripts/generate-f16-prelu.sh b/scripts/generate-f16-prelu.sh deleted file mode 100755 index b7213bb7836..00000000000 --- a/scripts/generate-f16-prelu.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# Copyright 2020 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -################################### ARM NEON ################################## -tools/xngen src/f16-prelu/neonfp16arith.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c & -tools/xngen src/f16-prelu/neonfp16arith.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c & - -################################### x86 F16C ################################## -tools/xngen src/f16-prelu/f16c.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-f16c-2x8.c & -tools/xngen src/f16-prelu/f16c.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-f16c-2x16.c & - -wait diff --git a/scripts/generate-f32-prelu.sh b/scripts/generate-f32-prelu.sh deleted file mode 100755 index f36b7ec405c..00000000000 --- a/scripts/generate-f32-prelu.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/sh -# Copyright 2019 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -#################################### Scalar ################################### -tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-scalar-2x1.c & -tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-scalar-2x4.c & - -##################################### WAsm #################################### -tools/xngen src/f32-prelu/wasm.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-wasm-2x1.c & -tools/xngen src/f32-prelu/wasm.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-wasm-2x4.c & - -################################### ARM NEON ################################## -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -o src/f32-prelu/gen/f32-prelu-neon-1x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -o src/f32-prelu/gen/f32-prelu-neon-1x8.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -o src/f32-prelu/gen/f32-prelu-neon-1x16.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x8.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x16.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -o src/f32-prelu/gen/f32-prelu-neon-4x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -o src/f32-prelu/gen/f32-prelu-neon-4x8.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -o src/f32-prelu/gen/f32-prelu-neon-4x16.c & - -################################## WAsm SIMD ################################## -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c & 
-tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c & - -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c & - -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D 
ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c & - -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c & - -################################### ARM NEON ################################## -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x8.c & - -############################# x86 SSE/SSE2/SSE4.1 ############################# -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D SSE=1 -o src/f32-prelu/gen/f32-prelu-sse-2x4.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D SSE=1 -o src/f32-prelu/gen/f32-prelu-sse-2x8.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D SSE=2 -o src/f32-prelu/gen/f32-prelu-sse2-2x4.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D SSE=2 -o src/f32-prelu/gen/f32-prelu-sse2-2x8.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D SSE=4 -o src/f32-prelu/gen/f32-prelu-sse41-2x4.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D SSE=4 -o src/f32-prelu/gen/f32-prelu-sse41-2x8.c & - -################################### x86 AVX ################################### -tools/xngen src/f32-prelu/avx.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx-2x8.c & -tools/xngen src/f32-prelu/avx.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx-2x16.c & - -################################## x86 AVX512 ################################# -tools/xngen src/f32-prelu/avx512f.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx512f-2x16.c & -tools/xngen src/f32-prelu/avx512f.c.in -D CHANNEL_TILE=32 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx512f-2x32.c & - 
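Migration note for the PReLU removals above and the xnn_define_prelu deprecation in include/xnnpack.h: PReLU is now expressed through the binary-elementwise path using the new xnn_binary_prelu operator. The following is a minimal, hypothetical sketch of the replacement call; it assumes the generic xnn_define_binary entry point declared elsewhere in include/xnnpack.h, that input_id, slope_id, and output_id are value IDs already defined in the subgraph, and that NULL binary params are acceptable for f32.

#include "xnnpack.h"

// Hypothetical helper showing the replacement for the deprecated xnn_define_prelu.
static enum xnn_status define_prelu_node(xnn_subgraph_t subgraph,
                                         uint32_t input_id,
                                         uint32_t slope_id,
                                         uint32_t output_id) {
  // Old (deprecated): xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0);
  // New: PReLU as a broadcasting binary op, with the slope tensor as the second input.
  return xnn_define_binary(subgraph, xnn_binary_prelu, /*params=*/NULL,
                           input_id, slope_id, output_id, /*flags=*/0);
}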
-wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index ef6cb064603..75fc425003e 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -249,8 +249,8 @@ tools/generate-dwconv2d-chw-test.py --spec test/f32-dwconv2d-chw.yaml --output t ### Tests for VHSwish micro-kernels tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vhswish --output test/f16-vhswish.cc & tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vhswish --output test/f32-vhswish.cc & -tools/generate-vhswish-test.py --spec test/qs8-vhswish.yaml --output test/qs8-vhswish.cc & -tools/generate-vhswish-test.py --spec test/qu8-vhswish.yaml --output test/qu8-vhswish.cc & +tools/generate-vunary-test.py --tester VHSwishMicrokernelTester --ukernel qs8-vhswish --output test/qs8-vhswish.cc & +tools/generate-vunary-test.py --tester VHSwishMicrokernelTester --ukernel qu8-vhswish --output test/qu8-vhswish.cc & ### Tests for IBilinear micro-kernels tools/generate-ibilinear-test.py --spec test/f16-ibilinear.yaml --output test/f16-ibilinear.cc & @@ -262,10 +262,6 @@ tools/generate-ibilinear-test.py --spec test/u8-ibilinear.yaml --output test/u8- tools/generate-ibilinear-chw-test.py --spec test/f16-ibilinear-chw.yaml --output test/f16-ibilinear-chw.cc & tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc & -### Tests for PRelu micro-kernels -tools/generate-prelu-test.py --spec test/f16-prelu.yaml --output test/f16-prelu.cc & -tools/generate-prelu-test.py --spec test/f32-prelu.yaml --output test/f32-prelu.cc & - ### Tests for RAddExpMinusMax micro-kernels tools/generate-raddexpminusmax-test.py --spec test/f32-raddexpminusmax.yaml --output test/f32-raddexpminusmax.cc & diff --git a/scripts/generate-x8-packw.sh b/scripts/generate-x8-packw.sh index 58bb3d2f991..d16e1214b47 100755 --- a/scripts/generate-x8-packw.sh +++ b/scripts/generate-x8-packw.sh @@ -53,4 +53,9 @@ tools/xngen src/x8-packw/kr-avxvnni.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP= tools/xngen src/x8-packw/kr-avxvnni.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP=128 -D AVX=10 -D PREFETCH=0 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c & tools/xngen src/x8-packw/kr-avxvnni.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP=128 -D AVX=10 -D PREFETCH=1 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c & +### WAsm Relaxed SIMD +### C8 packing +tools/xngen src/x8-packw/kr-wasmdot.c.in -D NR=8 -D KR=8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c & +tools/xngen src/x8-packw/kr-wasmdot.c.in -D NR=8 -D KR=8 -D TYPE=int8_t -D IZP=128 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c & + wait diff --git a/scripts/generate-x8-vclamp.sh b/scripts/generate-x8-vclamp.sh new file mode 100755 index 00000000000..952691773bc --- /dev/null +++ b/scripts/generate-x8-vclamp.sh @@ -0,0 +1,18 @@ +#!/bin/sh +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +################################ RISC-V Vector ################################ +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=1 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=2 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=4 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=8 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c & + +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=1 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=2 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=4 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=8 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c & + +wait diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 3dd38743a99..73888eb564a 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -18,6 +18,7 @@ static struct xnn_binary_elementwise_config f16_vdiv_config = {0}; static struct xnn_binary_elementwise_config f16_vmax_config = {0}; static struct xnn_binary_elementwise_config f16_vmin_config = {0}; static struct xnn_binary_elementwise_config f16_vmul_config = {0}; +static struct xnn_binary_elementwise_config f16_vprelu_config = {0}; static struct xnn_binary_elementwise_config f16_vsub_config = {0}; static struct xnn_binary_elementwise_config f16_vsqrdiff_config = {0}; @@ -27,6 +28,7 @@ static struct xnn_binary_elementwise_config f32_vdiv_config = {0}; static struct xnn_binary_elementwise_config f32_vmax_config = {0}; static struct xnn_binary_elementwise_config f32_vmin_config = {0}; static struct xnn_binary_elementwise_config f32_vmul_config = {0}; +static struct xnn_binary_elementwise_config f32_vprelu_config = {0}; static struct xnn_binary_elementwise_config f32_vsub_config = {0}; static struct xnn_binary_elementwise_config f32_vsqrdiff_config = {0}; @@ -44,6 +46,7 @@ XNN_INIT_ONCE_GUARD(f16_vdiv); XNN_INIT_ONCE_GUARD(f16_vmax); XNN_INIT_ONCE_GUARD(f16_vmin); XNN_INIT_ONCE_GUARD(f16_vmul); +XNN_INIT_ONCE_GUARD(f16_vprelu); XNN_INIT_ONCE_GUARD(f16_vsub); XNN_INIT_ONCE_GUARD(f16_vsqrdiff); XNN_INIT_ONCE_GUARD(f32_vadd); @@ -52,6 +55,7 @@ XNN_INIT_ONCE_GUARD(f32_vdiv); XNN_INIT_ONCE_GUARD(f32_vmax); XNN_INIT_ONCE_GUARD(f32_vmin); XNN_INIT_ONCE_GUARD(f32_vmul); +XNN_INIT_ONCE_GUARD(f32_vprelu); XNN_INIT_ONCE_GUARD(f32_vsub); XNN_INIT_ONCE_GUARD(f32_vsqrdiff); XNN_INIT_ONCE_GUARD(s32_vmul); @@ -256,6 +260,45 @@ static void init_f16_vmul_config(void) { #endif } +static void init_f16_vprelu_config(void) { + #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_arm_neon_fp16_arith) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__neonfp16arith_u16; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.element_tile = 16; + } + #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + 
assert(hardware_config != NULL); + if (hardware_config->use_arm_neon_fp16_arith) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__neonfp16arith_u16; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.element_tile = 16; + } + #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + #if XNN_ENABLE_AVX512FP16 + if (hardware_config->use_x86_avx512fp16) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__avx512fp16_u64; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__avx512fp16_u64; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__avx512fp16_u64; + f16_vprelu_config.element_tile = 64; + } else + #endif + if (hardware_config->use_x86_f16c) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__f16c_u16; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__f16c_u16; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__f16c_u16; + f16_vprelu_config.element_tile = 16; + } + #endif +} + static void init_f16_vsub_config(void) { #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -796,6 +839,66 @@ static void init_f32_vmul_config(void) { #endif } +static void init_f32_vprelu_config(void) { + #if XNN_ARCH_ARM + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_arm_neon){ + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__neon_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__neon_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__neon_u8; + f32_vprelu_config.element_tile = 8; + } else if (!XNN_PLATFORM_MOBILE) { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__scalar_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__scalar_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__scalar_u8; + f32_vprelu_config.element_tile = 8; + } + #elif XNN_ARCH_ARM64 + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__neon_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__neon_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__neon_u8; + f32_vprelu_config.element_tile = 8; + #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + #if XNN_ENABLE_AVX512F + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__avx512f_u32; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__avx512f_u32; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__avx512f_u32; + f32_vprelu_config.element_tile = 32; + } else + #endif + if 
(hardware_config->use_x86_avx) { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__avx_u16; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__avx_u16; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__avx_u16; + f32_vprelu_config.element_tile = 16; + } else { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__sse2_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__sse2_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__sse2_u8; + f32_vprelu_config.element_tile = 8; + } + #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__wasmsimd_u16; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__wasmsimd_u16; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__wasmsimd_u16; + f32_vprelu_config.element_tile = 16; + #elif XNN_ARCH_WASM + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__wasm_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__wasm_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__wasm_u8; + f32_vprelu_config.element_tile = 8; + #else + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__scalar_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__scalar_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__scalar_u8; + f32_vprelu_config.element_tile = 8; + #endif +} + static void init_f32_vsub_config(void) { #if XNN_ARCH_ARM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1250,6 +1353,15 @@ const struct xnn_binary_elementwise_config* xnn_init_f16_vmul_config() { return &f16_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_f16_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { + return NULL; + } + XNN_INIT_ONCE(f16_vprelu); + return &f16_vprelu_config; +} + const struct xnn_binary_elementwise_config* xnn_init_f16_vsub_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { @@ -1331,6 +1443,15 @@ const struct xnn_binary_elementwise_config* xnn_init_f32_vmul_config() { return &f32_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_f32_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(f32_vprelu); + return &f32_vprelu_config; +} + const struct xnn_binary_elementwise_config* xnn_init_f32_vsub_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { diff --git a/src/configs/hardware-config.c b/src/configs/hardware-config.c index ca3f4f6e281..100186c2adf 100644 --- a/src/configs/hardware-config.c +++ b/src/configs/hardware-config.c @@ -126,7 +126,11 @@ static void init_hardware_config(void) { #else hardware_config.use_x86_avx512skx = 0; #endif +#if XNN_ENABLE_AVX512VBMI hardware_config.use_x86_avx512vbmi = hardware_config.use_x86_avx512skx && 
cpuinfo_has_x86_avx512vbmi(); +#else + hardware_config.use_x86_avx512vbmi = 0; +#endif #if XNN_ENABLE_AVX512VNNI hardware_config.use_x86_avx512vnni = hardware_config.use_x86_avx512skx && cpuinfo_has_x86_avx512vnni(); #else diff --git a/src/configs/prelu-config.c b/src/configs/prelu-config.c deleted file mode 100644 index e3508004459..00000000000 --- a/src/configs/prelu-config.c +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/config.h" -#include "xnnpack/init-once.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/prelu.h" - -static struct xnn_prelu_config f16_prelu_config = {0}; -static struct xnn_prelu_config f32_prelu_config = {0}; - -XNN_INIT_ONCE_GUARD(f16_prelu); -XNN_INIT_ONCE_GUARD(f32_prelu); - -static void init_f16_prelu_config(void) { - #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon_fp16_arith) { - f16_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__neonfp16arith_2x16; - f16_prelu_config.row_tile = 2; - f16_prelu_config.channel_tile = 16; - } - #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon_fp16_arith) { - f16_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__neonfp16arith_2x16; - f16_prelu_config.row_tile = 2; - f16_prelu_config.channel_tile = 16; - } - #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_x86_f16c) { - f16_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__f16c_2x16; - f16_prelu_config.row_tile = 2; - f16_prelu_config.channel_tile = 16; - } - #endif -} - -static void init_f32_prelu_config(void) { - #if XNN_ARCH_ARM - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__neon_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } else if (!XNN_PLATFORM_MOBILE) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4; - f32_prelu_config.row_tile = 4; - f32_prelu_config.channel_tile = 4; - } - #elif XNN_ARCH_ARM64 - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__neon_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - #if XNN_ENABLE_AVX512F - if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx512f_2x16; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 16; - } else - #endif - if (hardware_config->use_x86_avx) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx_2x16; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 16; 
- } else if (hardware_config->use_x86_sse4_1) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__sse41_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__sse2_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } - #elif XNN_ARCH_WASMRELAXEDSIMD - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } - #elif XNN_ARCH_WASMSIMD - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } - #elif XNN_ARCH_WASM - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasm_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } - #else - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4; - f32_prelu_config.row_tile = 4; - f32_prelu_config.channel_tile = 4; - #endif -} - -const struct xnn_prelu_config* xnn_init_f16_prelu_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { - return NULL; - } - XNN_INIT_ONCE(f16_prelu); - return &f16_prelu_config; -} - -const struct xnn_prelu_config* xnn_init_f32_prelu_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL) { - return NULL; - } - XNN_INIT_ONCE(f32_prelu); - return &f32_prelu_config; -} diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 15f83474cdc..ff36ce67e75 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -15,7 +15,6 @@ #include "xnnpack/microparams-init.h" #include "xnnpack/packq.h" #include "xnnpack/vcvt.h" -#include "xnnpack/vlrelu.h" #include "xnnpack/vunary.h" static struct xnn_unary_elementwise_config f16_abs_config = {0}; @@ -1980,10 +1979,23 @@ static void init_s8_clamp_config(void) { #elif XNN_ARCH_ARM64 s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__neon_u64; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__rvv_u4v; + s8_clamp_config.init.s8_minmax = 
xnn_init_s8_minmax_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->use_x86_sse4_1) { + #if XNN_ENABLE_AVX512SKX + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { + s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__avx512skx_u256; + s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + } else + #endif + if (hardware_config->use_x86_avx2) { + s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__avx2_u128; + s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + } else if (hardware_config->use_x86_sse4_1) { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__sse41_u64; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; } else { @@ -1993,12 +2005,6 @@ static void init_s8_clamp_config(void) { #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__wasmsimd_u64; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; - #elif XNN_ARCH_WASM - s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; - #elif XNN_ARCH_RISCV - s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; #else s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; @@ -2019,9 +2025,25 @@ static void init_u8_clamp_config(void) { #elif XNN_ARCH_ARM64 u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__neon_u64; u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; - #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__sse2_u64; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__rvv_u4v; u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + #if XNN_ENABLE_AVX512SKX + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__avx512skx_u256; + u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + } else + #endif + if (hardware_config->use_x86_avx2) { + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__avx2_u128; + u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + } else { + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__sse2_u64; + u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__wasmsimd_u64; u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; diff --git a/src/configs/x8-lut-config.c b/src/configs/x8-lut-config.c index 59a39926987..5e1e548246f 100644 --- a/src/configs/x8-lut-config.c +++ b/src/configs/x8-lut-config.c @@ -25,13 +25,14 @@ static void init_x8_lut_config(void) { const struct xnn_hardware_config* hardware_config = 
xnn_init_hardware_config(); assert(hardware_config != NULL); + #if XNN_ENABLE_AVX256VBMI + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vbmi) { + x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128; + } else + #endif #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { - if (hardware_config->use_x86_avx512vbmi) { - x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128; - } else { - x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512skx_vpshufb_u64; - } + x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512skx_vpshufb_u64; } else #endif if (hardware_config->use_x86_avx2) { diff --git a/src/f16-f32-vcvt/f16-f32-vcvt.h b/src/f16-f32-vcvt/f16-f32-vcvt.h index f8e5c42db91..0d685c2e399 100644 --- a/src/f16-f32-vcvt/f16-f32-vcvt.h +++ b/src/f16-f32-vcvt/f16-f32-vcvt.h @@ -56,9 +56,12 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f16_f32_vcvt_ukernel__avx_int3 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f16_f32_vcvt_ukernel__avx_int32_u32, 32, false, xnn_float16, float, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_f32_vcvt_ukernel__f16c_u8, 8, false, xnn_float16, float, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_f32_vcvt_ukernel__f16c_u16, 16, false, xnn_float16, float, void, NULL) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_f32_vcvt_ukernel__avx512skx_u16, 16, false, xnn_float16, float, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_f32_vcvt_ukernel__avx512skx_u32, 32, false, xnn_float16, float, void, NULL) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, 8, false, xnn_float16, float, void, NULL) diff --git a/src/f16-prelu/f16c.c.in b/src/f16-prelu/f16c.c.in deleted file mode 100644 index b187fa60a21..00000000000 --- a/src/f16-prelu/f16c.c.in +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__f16c_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - $for M in range(1, ROW_TILE): - const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride); - uint16_t* o${M} = (uint16_t*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - $if CHANNEL_TILE > 8: - for (; c >= ${CHANNEL_TILE} * sizeof(uint16_t); c -= ${CHANNEL_TILE} * sizeof(uint16_t)) { - const __m256 vw${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vw${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + ${C}))); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const __m256 vi${M}x0${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vi${M}x0${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - __m256 vacc${M}x0${ABC[C:C+8]} = _mm256_mul_ps(vi${M}x0${ABC[C:C+8]}, vw${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - vacc${M}x0${ABC[C:C+8]} = _mm256_blendv_ps(vi${M}x0${ABC[C:C+8]}, vacc${M}x0${ABC[C:C+8]}, vi${M}x0${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - _mm_storeu_si128((__m128i*) o${M}, _mm256_cvtps_ph(vacc${M}x0${ABC[C:C+8]}, _MM_FROUND_TO_NEAREST_INT)); - $for C in range(0, CHANNEL_TILE, 8): - _mm_storeu_si128((__m128i*) (o${M} + ${C}), _mm256_cvtps_ph(vacc${M}x0${ABC[C:C+8]}, _MM_FROUND_TO_NEAREST_INT)); - o${M} += ${CHANNEL_TILE}; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - w += 8; - - $for M in range(ROW_TILE): - const __m256 vi${M}x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); - i${M} += 8; - - $for M in range(ROW_TILE): - __m256 vacc${M}x01234567 = _mm256_mul_ps(vi${M}x01234567, vw01234567); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = _mm256_blendv_ps(vi${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - $for M in range(ROW_TILE): - _mm_storeu_si128((__m128i*) o${M}, _mm256_cvtps_ph(vacc${M}x01234567, _MM_FROUND_TO_NEAREST_INT)); - o${M} += 8; - } - if XNN_UNLIKELY(c != 0) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - - $for M in range(ROW_TILE): - const __m256 vi${M}x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const 
__m128i*) i${M})); - i${M} = (const uint16_t*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - __m256 vacc${M}x01234567 = _mm256_mul_ps(vi${M}x01234567, vw01234567); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = _mm256_blendv_ps(vi${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - $for M in range(ROW_TILE): - __m128i vh${M}x01234567 = _mm256_cvtps_ph(vacc${M}x01234567, _MM_FROUND_TO_NEAREST_INT); - if (c & (4 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - _mm_storel_epi64((__m128i*) o${M}, vh${M}x01234567); - - $for M in range(ROW_TILE): - vh${M}x01234567 = _mm_unpackhi_epi64(vh${M}x01234567, vh${M}x01234567); - - $for M in range(ROW_TILE): - o${M} += 4; - } - if (c & (2 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - _mm_storeu_si32(o${M}, vh${M}x01234567); - - $for M in range(ROW_TILE): - vh${M}x01234567 = _mm_srli_epi64(vh${M}x01234567, 32); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - *o${M} = (uint16_t) _mm_extract_epi16(vh${M}x01234567, 0); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const uint16_t*) ((uintptr_t) i${M} + input_increment); - o${M} = (uint16_t*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-f16c-2x16.c b/src/f16-prelu/gen/f16-prelu-f16c-2x16.c deleted file mode 100644 index f145aabf22f..00000000000 --- a/src/f16-prelu/gen/f16-prelu-f16c-2x16.c +++ /dev/null @@ -1,149 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
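For reference, the f16 PReLU microkernels deleted in this patch all implement the same contract: each output element equals the input where the input is non-negative, and the input times a per-channel weight where it is negative, applied row by row with independent input and output strides. A plain scalar sketch of that contract (float instead of fp16, channels as an element count rather than bytes, illustrative names):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Scalar reference for the per-channel PReLU computed by the deleted
// kernels: out[r][c] = in[r][c] >= 0 ? in[r][c] : in[r][c] * w[c].
// Strides are in bytes, matching the microkernel convention above.
static void prelu_reference(size_t rows, size_t channels,
                            const float* input, size_t input_stride,
                            const float* weights,
                            float* output, size_t output_stride) {
  assert(rows != 0 && channels != 0);
  do {
    for (size_t c = 0; c < channels; c++) {
      const float x = input[c];
      output[c] = x >= 0.0f ? x : x * weights[c];
    }
    input = (const float*) ((uintptr_t) input + input_stride);
    output = (float*) ((uintptr_t) output + output_stride);
  } while (--rows != 0);
}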
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__f16c_2x16( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - const __m256 vw89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8))); - w += 16; - - const __m256 vi0x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - const __m256 vi0x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8))); - i0 += 16; - const __m256 vi1x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - const __m256 vi1x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8))); - i1 += 16; - - __m256 vacc0x001234567 = _mm256_mul_ps(vi0x001234567, vw01234567); - __m256 vacc0x089ABCDEF = _mm256_mul_ps(vi0x089ABCDEF, vw89ABCDEF); - __m256 vacc1x001234567 = _mm256_mul_ps(vi1x001234567, vw01234567); - __m256 vacc1x089ABCDEF = _mm256_mul_ps(vi1x089ABCDEF, vw89ABCDEF); - - vacc0x001234567 = _mm256_blendv_ps(vi0x001234567, vacc0x001234567, vi0x001234567); - vacc0x089ABCDEF = _mm256_blendv_ps(vi0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF); - vacc1x001234567 = _mm256_blendv_ps(vi1x001234567, vacc1x001234567, vi1x001234567); - vacc1x089ABCDEF = _mm256_blendv_ps(vi1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF); - - _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o0 + 0), _mm256_cvtps_ph(vacc0x001234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o0 + 8), _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - o0 += 16; - _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o1 + 0), _mm256_cvtps_ph(vacc1x001234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o1 + 8), _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - o1 += 16; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - w += 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - _mm_storeu_si128((__m128i*) o0, 
_mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT)); - o0 += 8; - _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT)); - o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT); - __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT); - if (c & (4 * sizeof(uint16_t))) { - _mm_storel_epi64((__m128i*) o0, vh0x01234567); - _mm_storel_epi64((__m128i*) o1, vh1x01234567); - - vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567); - vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567); - - o0 += 4; - o1 += 4; - } - if (c & (2 * sizeof(uint16_t))) { - _mm_storeu_si32(o0, vh0x01234567); - _mm_storeu_si32(o1, vh1x01234567); - - vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32); - vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(uint16_t))) { - *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0); - *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-f16c-2x8.c b/src/f16-prelu/gen/f16-prelu-f16c-2x8.c deleted file mode 100644 index 9a062951c33..00000000000 --- a/src/f16-prelu/gen/f16-prelu-f16c-2x8.c +++ /dev/null @@ -1,118 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
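The AVX/F16C kernels above avoid an explicit comparison by using the input itself as the blend mask: _mm256_blendv_ps selects per lane based only on the sign bit of the mask, so negative inputs take the x*w product and all other lanes keep x. A self-contained sketch of that trick on plain AVX floats (compile with AVX enabled; values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // PReLU select as used above: blendv picks from `prod` in lanes where the
  // sign bit of `x` is set (x < 0, including -0.0f) and from `x` otherwise,
  // with no compare instruction.
  const __m256 x = _mm256_setr_ps(-2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, -4.0f);
  const __m256 w = _mm256_set1_ps(0.25f);          // per-channel slope (one value here)
  const __m256 prod = _mm256_mul_ps(x, w);
  const __m256 y = _mm256_blendv_ps(x, prod, x);   // mask = x -> sign-bit select

  float out[8];
  _mm256_storeu_ps(out, y);
  for (int i = 0; i < 8; i++) {
    printf("%g ", out[i]);                         // -0.5 -0.25 -0.125 0 0.5 1 2 -1
  }
  printf("\n");
  return 0;
}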
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__f16c_2x8( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - w += 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT)); - o0 += 8; - _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT)); - o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT); - __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT); - if (c & (4 * sizeof(uint16_t))) { - _mm_storel_epi64((__m128i*) o0, vh0x01234567); - _mm_storel_epi64((__m128i*) o1, vh1x01234567); - - vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567); - vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567); - - o0 += 4; - o1 += 4; - } - if (c & (2 * sizeof(uint16_t))) { - _mm_storeu_si32(o0, vh0x01234567); - _mm_storeu_si32(o1, vh1x01234567); - - vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32); - vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(uint16_t))) { - *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0); - *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + 
input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c b/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c deleted file mode 100644 index 064c541c100..00000000000 --- a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c +++ /dev/null @@ -1,136 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__neonfp16arith_2x16( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - const float16x8_t vw89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - const float16x8_t vi0x001234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x089ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x001234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1x089ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - float16x8_t vacc0x001234567 = vmulq_f16(vi0x001234567, vw01234567); - const uint16x8_t vm0x001234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x001234567), vmovq_n_s16(0)); - float16x8_t vacc0x089ABCDEF = vmulq_f16(vi0x089ABCDEF, vw89ABCDEF); - const uint16x8_t vm0x089ABCDEF = vcltq_s16(vreinterpretq_s16_f16(vi0x089ABCDEF), vmovq_n_s16(0)); - float16x8_t vacc1x001234567 = vmulq_f16(vi1x001234567, vw01234567); - const uint16x8_t vm1x001234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x001234567), vmovq_n_s16(0)); - float16x8_t vacc1x089ABCDEF = vmulq_f16(vi1x089ABCDEF, vw89ABCDEF); - const uint16x8_t vm1x089ABCDEF = vcltq_s16(vreinterpretq_s16_f16(vi1x089ABCDEF), vmovq_n_s16(0)); - - vacc0x001234567 = vbslq_f16(vm0x001234567, vacc0x001234567, vi0x001234567); - vacc0x089ABCDEF = vbslq_f16(vm0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF); - vacc1x001234567 = vbslq_f16(vm1x001234567, vacc1x001234567, vi1x001234567); - vacc1x089ABCDEF = vbslq_f16(vm1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF); - - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x001234567)); o0 += 8; - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x089ABCDEF)); o0 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x001234567)); o1 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x089ABCDEF)); o1 += 8; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; 
- - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 += 8; - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x01234567)); o0 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x01234567)); o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - float16x4_t vacc0x0123 = vget_low_f16(vacc0x01234567); - float16x4_t vacc1x0123 = vget_low_f16(vacc1x01234567); - if (c & (4 * sizeof(uint16_t))) { - vst1_u16(o0, vreinterpret_u16_f16(vacc0x0123)); o0 += 4; - vst1_u16(o1, vreinterpret_u16_f16(vacc1x0123)); o1 += 4; - - vacc0x0123 = vget_high_f16(vacc0x01234567); - vacc1x0123 = vget_high_f16(vacc1x01234567); - } - if (c & (2 * sizeof(uint16_t))) { - vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vacc0x0123), 0); o0 += 2; - vacc0x0123 = vext_f16(vacc0x0123, vacc0x0123, 2); - vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vacc1x0123), 0); o1 += 2; - vacc1x0123 = vext_f16(vacc1x0123, vacc1x0123, 2); - } - if (c & (1 * sizeof(uint16_t))) { - vst1_lane_u16(o0, vreinterpret_u16_f16(vacc0x0123), 0); o0 += 1; - vst1_lane_u16(o1, vreinterpret_u16_f16(vacc1x0123), 0); o1 += 1; - } - } - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c b/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c deleted file mode 100644 index 7169917364b..00000000000 --- a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c +++ /dev/null @@ -1,108 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__neonfp16arith_2x8( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 += 8; - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x01234567)); o0 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x01234567)); o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - float16x4_t vacc0x0123 = vget_low_f16(vacc0x01234567); - float16x4_t vacc1x0123 = vget_low_f16(vacc1x01234567); - if (c & (4 * sizeof(uint16_t))) { - vst1_u16(o0, vreinterpret_u16_f16(vacc0x0123)); o0 += 4; - vst1_u16(o1, vreinterpret_u16_f16(vacc1x0123)); o1 += 4; - - vacc0x0123 = vget_high_f16(vacc0x01234567); - vacc1x0123 = vget_high_f16(vacc1x01234567); - } - if (c & (2 * sizeof(uint16_t))) { - vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vacc0x0123), 0); o0 += 2; - vacc0x0123 = vext_f16(vacc0x0123, vacc0x0123, 2); - vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vacc1x0123), 0); o1 += 2; - vacc1x0123 = vext_f16(vacc1x0123, vacc1x0123, 2); - } - if (c & (1 * sizeof(uint16_t))) { - vst1_lane_u16(o0, vreinterpret_u16_f16(vacc0x0123), 0); o0 += 1; - vst1_lane_u16(o1, vreinterpret_u16_f16(vacc1x0123), 0); o1 += 1; - } - } - i0 = (const uint16_t*) 
((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/neonfp16arith.c.in b/src/f16-prelu/neonfp16arith.c.in deleted file mode 100644 index f65f953a349..00000000000 --- a/src/f16-prelu/neonfp16arith.c.in +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__neonfp16arith_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - $for M in range(1, ROW_TILE): - const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride); - uint16_t* o${M} = (uint16_t*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - $if CHANNEL_TILE > 8: - for (; c >= ${CHANNEL_TILE} * sizeof(uint16_t); c -= ${CHANNEL_TILE} * sizeof(uint16_t)) { - $for C in range(0, CHANNEL_TILE, 8): - const float16x8_t vw${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - const float16x8_t vi${M}x0${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M})); i${M} += 8; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - float16x8_t vacc${M}x0${ABC[C:C+8]} = vmulq_f16(vi${M}x0${ABC[C:C+8]}, vw${ABC[C:C+8]}); - const uint16x8_t vm${M}x0${ABC[C:C+8]} = vcltq_s16(vreinterpretq_s16_f16(vi${M}x0${ABC[C:C+8]}), vmovq_n_s16(0)); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - vacc${M}x0${ABC[C:C+8]} = vbslq_f16(vm${M}x0${ABC[C:C+8]}, vacc${M}x0${ABC[C:C+8]}, vi${M}x0${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - vst1q_u16(o${M}, vreinterpretq_u16_f16(vacc${M}x0${ABC[C:C+8]})); o${M} += 8; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - $for M in range(ROW_TILE): - const float16x8_t vi${M}x01234567 = vreinterpretq_f16_u16(vld1q_u16(i${M})); - i${M} += 8; - - $for M in range(ROW_TILE): - float16x8_t vacc${M}x01234567 = vmulq_f16(vi${M}x01234567, vw01234567); - const uint16x8_t vm${M}x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi${M}x01234567), vmovq_n_s16(0)); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = vbslq_f16(vm${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - 
$for M in range(ROW_TILE): - vst1q_u16(o${M}, vreinterpretq_u16_f16(vacc${M}x01234567)); o${M} += 8; - } - if XNN_UNLIKELY(c != 0) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); - - $for M in range(ROW_TILE): - const float16x8_t vi${M}x01234567 = vreinterpretq_f16_u16(vld1q_u16(i${M})); - i${M} = (const uint16_t*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - float16x8_t vacc${M}x01234567 = vmulq_f16(vi${M}x01234567, vw01234567); - const uint16x8_t vm${M}x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi${M}x01234567), vmovq_n_s16(0)); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = vbslq_f16(vm${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - $for M in range(ROW_TILE): - float16x4_t vacc${M}x0123 = vget_low_f16(vacc${M}x01234567); - if (c & (4 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - vst1_u16(o${M}, vreinterpret_u16_f16(vacc${M}x0123)); o${M} += 4; - - $for M in range(ROW_TILE): - vacc${M}x0123 = vget_high_f16(vacc${M}x01234567); - } - if (c & (2 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - vst1_lane_u32((void*) o${M}, vreinterpret_u32_f16(vacc${M}x0123), 0); o${M} += 2; - vacc${M}x0123 = vext_f16(vacc${M}x0123, vacc${M}x0123, 2); - } - if (c & (1 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - vst1_lane_u16(o${M}, vreinterpret_u16_f16(vacc${M}x0123), 0); o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const uint16_t*) ((uintptr_t) i${M} + input_increment); - o${M} = (uint16_t*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c index fc2e2015708..b039b2c5552 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -48,10 +46,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc8)); - vy0 = vmaxq_s8(vy0, voutput_min); - - vy0 = vminq_s8(vy0, voutput_max); - vst1q_s8(output, vy0); output += 16; } for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { @@ -64,8 +58,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -79,8 +71,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c index 
7e02a33f646..d3eb2ca8e4a 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -53,12 +51,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc8)); int8x8_t vy16 = vqmovn_s16(vacc16); - vy0 = vmaxq_s8(vy0, voutput_min); - vy16 = vmax_s8(vy16, vget_low_s8(voutput_min)); - - vy0 = vminq_s8(vy0, voutput_max); - vy16 = vmin_s8(vy16, vget_low_s8(voutput_max)); - vst1q_s8(output, vy0); output += 16; vst1_s8(output, vy16); output += 8; } @@ -72,8 +64,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -87,8 +77,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c index 9645bdb569c..8e035ffe87b 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -57,12 +55,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc8)); int8x16_t vy16 = vcombine_s8(vqmovn_s16(vacc16), vqmovn_s16(vacc24)); - vy0 = vmaxq_s8(vy0, voutput_min); - vy16 = vmaxq_s8(vy16, voutput_min); - - vy0 = vminq_s8(vy0, voutput_max); - vy16 = vminq_s8(vy16, voutput_max); - vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy16); output += 16; } @@ -76,8 +68,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -91,8 +81,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = 
vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c index 6561ddf428f..ebc4fb5f661 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -75,16 +73,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( int8x16_t vy32 = vcombine_s8(vqmovn_s16(vacc32), vqmovn_s16(vacc40)); int8x16_t vy48 = vcombine_s8(vqmovn_s16(vacc48), vqmovn_s16(vacc56)); - vy0 = vmaxq_s8(vy0, voutput_min); - vy16 = vmaxq_s8(vy16, voutput_min); - vy32 = vmaxq_s8(vy32, voutput_min); - vy48 = vmaxq_s8(vy48, voutput_min); - - vy0 = vminq_s8(vy0, voutput_max); - vy16 = vminq_s8(vy16, voutput_max); - vy32 = vminq_s8(vy32, voutput_max); - vy48 = vminq_s8(vy48, voutput_max); - vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy16); output += 16; vst1q_s8(output, vy32); output += 16; @@ -100,8 +88,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -115,8 +101,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c index be0a15e9a7c..81839e9e305 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u8( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(¶ms->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(¶ms->scalar.output_max); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { float16x8_t vx = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -43,8 +41,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -58,8 +54,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, 
voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c index fb9e4dc17b6..e635693c36a 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u1( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c index e3561a3ccb3..b46d911e5c6 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u2( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c index f088ddf5684..e45cd24818a 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u3( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c index 
c2cb7a3594e..27682f5af68 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u4( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c index c995a3df11a..08f1f168b88 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u1( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c index 00620ce80b3..0f854a094a1 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u2( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git 
a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c index c4f849e687e..d6dae2b9be8 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u3( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c index 86e42c4ca57..2dcafe7e9a0 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u4( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/neonfp16arith.c.in b/src/f16-qs8-vcvt/neonfp16arith.c.in index 7738146d92b..67797f2808c 100644 --- a/src/f16-qs8-vcvt/neonfp16arith.c.in +++ b/src/f16-qs8-vcvt/neonfp16arith.c.in @@ -29,12 +29,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - $if BATCH_TILE > 8: - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); - $else: - const int8x8_t voutput_min = vld1_dup_s8(¶ms->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(¶ms->scalar.output_max); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(uint16_t); batch -= ${BATCH_TILE} * sizeof(uint16_t)) { $for N in range(0, BATCH_TILE, 8): @@ -55,18 +49,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( $else: int8x8_t 
vy${N} = vqmovn_s16(vacc${N}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${N} = vmaxq_s8(vy${N}, voutput_min); - $else: - vy${N} = vmax_s8(vy${N}, vget_low_s8(voutput_min)); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${N} = vminq_s8(vy${N}, voutput_max); - $else: - vy${N} = vmin_s8(vy${N}, vget_low_s8(voutput_max)); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: vst1q_s8(output, vy${N}); output += 16; @@ -83,12 +65,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - $if BATCH_TILE > 8: - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); - $else: - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -102,12 +78,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - $if BATCH_TILE > 8: - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); - $else: - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-vsqrt/f16-vsqrt.h b/src/f16-vsqrt/f16-vsqrt.h index f5001c3fe03..7bb5773a693 100644 --- a/src/f16-vsqrt/f16-vsqrt.h +++ b/src/f16-vsqrt/f16-vsqrt.h @@ -41,9 +41,6 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) @@ -52,6 +49,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u16, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) 
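On the f16-qs8-vcvt changes above: the removed vmax_s8/vmin_s8 pair clamped the narrowed result to output_min/output_max, but vqmovn_s16 already saturates the 16-bit accumulator into [-128, 127], so when the bounds cover the full int8 range (as the scalar kernels now hard-code with -128 and 127) the extra clamp cannot change anything. A scalar model of that argument, under the assumption output_min == -128 and output_max == 127:

#include <assert.h>
#include <stdint.h>

// Scalar model of the NEON sequence: saturating narrow to int8, then clamp.
// Because the narrow already saturates to [-128, 127], a follow-up clamp to
// that same range is a no-op. Assumes output_min = -128, output_max = 127.
static int8_t saturate_to_s8(int16_t v) {
  if (v < INT8_MIN) return INT8_MIN;
  if (v > INT8_MAX) return INT8_MAX;
  return (int8_t) v;
}

static void check_clamp_is_redundant(void) {
  for (int32_t v = INT16_MIN; v <= INT16_MAX; v++) {
    const int8_t narrowed = saturate_to_s8((int16_t) v);
    int8_t clamped = narrowed < -128 ? (int8_t) -128 : narrowed;  // output_min
    clamped = clamped > 127 ? (int8_t) 127 : clamped;             // output_max
    assert(clamped == narrowed);
  }
}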
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-dwconv/f32-dwconv-minmax-multipass.h b/src/f32-dwconv/f32-dwconv-minmax-multipass.h index 659f4aa6f7c..197547ca656 100644 --- a/src/f32-dwconv/f32-dwconv-minmax-multipass.h +++ b/src/f32-dwconv/f32-dwconv-minmax-multipass.h @@ -76,6 +76,9 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8 XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3_acc2, 7, 6, 6, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3, 7, 6, 6, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3_acc2, 7, 6, 6, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, 5, 5, 5, 16, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, 5, 5, 5, 16, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, 5, 5, 5, 32, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) diff --git a/src/f32-dwconv/f32-dwconv-minmax-unipass.h b/src/f32-dwconv/f32-dwconv-minmax-unipass.h index 9a5630b1b1c..56b94f8b195 100644 --- a/src/f32-dwconv/f32-dwconv-minmax-unipass.h +++ b/src/f32-dwconv/f32-dwconv-minmax-unipass.h @@ -111,6 +111,9 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p8c__fma3, XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p8c__fma3_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p16c__fma3, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p16c__fma3_acc2, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f_acc2, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f, 32, false, 32, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) diff --git 
a/src/f32-f16-vcvt/f32-f16-vcvt.h b/src/f32-f16-vcvt/f32-f16-vcvt.h index 8abb72ebbd7..89acb5523cf 100644 --- a/src/f32-f16-vcvt/f32-f16-vcvt.h +++ b/src/f32-f16-vcvt/f32-f16-vcvt.h @@ -40,9 +40,12 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_f16_vcvt_ukernel__avx_u24, XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_f16_vcvt_ukernel__avx_u32, 32, false, float, xnn_float16, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f32_f16_vcvt_ukernel__f16c_u8, 8, false, float, xnn_float16, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f32_f16_vcvt_ukernel__f16c_u16, 16, false, float, xnn_float16, void, NULL) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_f16_vcvt_ukernel__avx512skx_u16, 16, false, float, xnn_float16, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_f16_vcvt_ukernel__avx512skx_u32, 32, false, float, xnn_float16, void, NULL) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f32_f16_vcvt_ukernel__wasmsimd_u8, 8, false, float, xnn_float16, void, NULL) diff --git a/src/f32-prelu/avx.c.in b/src/f32-prelu/avx.c.in deleted file mode 100644 index 752f85f8817..00000000000 --- a/src/f32-prelu/avx.c.in +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const __m256 vw${ABC[0:8]} = _mm256_load_ps(w); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vw${ABC[C:C+8]} = _mm256_load_ps(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const __m256 vi${M}x${ABC[0:8]} = _mm256_loadu_ps(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vi${M}x${ABC[C:C+8]} = _mm256_loadu_ps(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vprod${M}x${ABC[C:C+8]} = 
_mm256_mul_ps(vi${M}x${ABC[C:C+8]}, vw${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vacc${M}x${ABC[C:C+8]} = _mm256_blendv_ps(vi${M}x${ABC[C:C+8]}, vprod${M}x${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - _mm256_storeu_ps(o${M}, vacc${M}x${ABC[0:8]}); - $for C in range(8, CHANNEL_TILE, 8): - _mm256_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+8]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 8: - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m256 vw = _mm256_load_ps(w); - w += 8; - - $for M in range(ROW_TILE): - const __m256 vi${M} = _mm256_loadu_ps(i${M}); - i${M} += 8; - - $for M in range(ROW_TILE): - const __m256 vprod${M} = _mm256_mul_ps(vi${M}, vw); - - $for M in range(ROW_TILE): - const __m256 vacc${M} = _mm256_blendv_ps(vi${M}, vprod${M}, vi${M}); - - $for M in range(ROW_TILE): - _mm256_storeu_ps(o${M}, vacc${M}); - o${M} += 8; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 7 * sizeof(float)); - __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - c)); - - const __m256 vw = _mm256_maskload_ps(w, vmask); - - $for M in range(ROW_TILE): - const __m256 vi${M} = _mm256_maskload_ps(i${M}, vmask); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - const __m256 vprod${M} = _mm256_mul_ps(vi${M}, vw); - - $for M in range(ROW_TILE): - __m256 vacc${M} = _mm256_blendv_ps(vi${M}, vprod${M}, vi${M}); - - $for M in range(ROW_TILE): - __m128 vacc${M}_lo = _mm256_castps256_ps128(vacc${M}); - if (c & (4 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_storeu_ps(o${M}, vacc${M}_lo); - - $for M in range(ROW_TILE): - vacc${M}_lo = _mm256_extractf128_ps(vacc${M}, 1); - - $for M in range(ROW_TILE): - o${M} += 4; - } - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_storel_pi((__m64*) o${M}, vacc${M}_lo); - - $for M in range(ROW_TILE): - vacc${M}_lo = _mm_movehl_ps(vacc${M}_lo, vacc${M}_lo); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_store_ss(o${M}, vacc${M}_lo); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/avx512f.c.in b/src/f32-prelu/avx512f.c.in deleted file mode 100644 index 859f59cd2ce..00000000000 --- a/src/f32-prelu/avx512f.c.in +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert CHANNEL_TILE % 16 == 0 -$assert CHANNEL_TILE >= 16 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx512f_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - const __m512 vzero = _mm512_setzero_ps(); - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const __m512 vw${ABC[0:16]} = _mm512_load_ps(w); - $for C in range(16, CHANNEL_TILE, 16): - const __m512 vw${ABC[C:C+16]} = _mm512_load_ps(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const __m512 vi${M}x${ABC[0:16]} = _mm512_loadu_ps(i${M}); - $for C in range(16, CHANNEL_TILE, 16): - const __m512 vi${M}x${ABC[C:C+16]} = _mm512_loadu_ps(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 16): - const __mmask16 vsign${M}x${ABC[C:C+16]} = _mm512_cmp_ps_mask(vi${M}x${ABC[C:C+16]}, vzero, _CMP_LT_OQ); - const __m512 vacc${M}x${ABC[C:C+16]} = _mm512_mask_mul_ps(vi${M}x${ABC[C:C+16]}, vsign${M}x${ABC[C:C+16]}, vi${M}x${ABC[C:C+16]}, vw${ABC[C:C+16]}); - - $for M in range(ROW_TILE): - _mm512_storeu_ps(o${M}, vacc${M}x${ABC[0:16]}); - $for C in range(16, CHANNEL_TILE, 16): - _mm512_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+16]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const __m512 vw = _mm512_load_ps(w); - w += 16; - - $for M in range(ROW_TILE): - const __m512 vi${M} = _mm512_loadu_ps(i${M}); - i${M} += 16; - - $for M in range(ROW_TILE): - const __mmask16 vsign${M} = _mm512_cmp_ps_mask(vi${M}, vzero, _CMP_LT_OQ); - const __m512 vacc${M} = _mm512_mask_mul_ps(vi${M}, vsign${M}, vi${M}, vw); - - $for M in range(ROW_TILE): - _mm512_storeu_ps(o${M}, vacc${M}); - o${M} += 16; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 15 * sizeof(float)); - // Prepare mask for valid 32-bit elements (depends on c). 
- const __mmask16 vmask = _cvtu32_mask16((uint32_t) (UINT32_C(1) << (c >> XNN_LOG2_SIZEOF_FLOAT)) - UINT32_C(1)); - - const __m512 vw = _mm512_maskz_loadu_ps(vmask, w); - - $for M in range(ROW_TILE): - const __m512 vi${M} = _mm512_maskz_loadu_ps(vmask, i${M}); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - const __mmask16 vsign${M} = _mm512_cmp_ps_mask(vi${M}, vzero, _CMP_LT_OQ); - const __m512 vacc${M} = _mm512_mask_mul_ps(vi${M}, vsign${M}, vi${M}, vw); - - $for M in range(ROW_TILE): - _mm512_mask_storeu_ps(o${M}, vmask, vacc${M}); - o${M} = (float*) ((uintptr_t) o${M} + c); - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-avx-2x16.c b/src/f32-prelu/gen/f32-prelu-avx-2x16.c index 70cde0fd797..6ef21bd98a8 100644 --- a/src/f32-prelu/gen/f32-prelu-avx-2x16.c +++ b/src/f32-prelu/gen/f32-prelu-avx-2x16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/prelu.h" +#include "xnnpack/vbinary.h" void xnn_f32_prelu_ukernel__avx_2x16( diff --git a/src/f32-prelu/gen/f32-prelu-avx-2x8.c b/src/f32-prelu/gen/f32-prelu-avx-2x8.c deleted file mode 100644 index bdd6a780fad..00000000000 --- a/src/f32-prelu/gen/f32-prelu-avx-2x8.c +++ /dev/null @@ -1,123 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/avx.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m256 vw01234567 = _mm256_load_ps(w); - w += 8; - - const __m256 vi0x01234567 = _mm256_loadu_ps(i0); - i0 += 8; - const __m256 vi1x01234567 = _mm256_loadu_ps(i1); - i1 += 8; - - const __m256 vprod0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - const __m256 vprod1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - const __m256 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vprod0x01234567, vi0x01234567); - const __m256 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vprod1x01234567, vi1x01234567); - - _mm256_storeu_ps(o0, vacc0x01234567); - o0 += 8; - _mm256_storeu_ps(o1, vacc1x01234567); - o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 7 * sizeof(float)); - __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - c)); - - const __m256 vw = _mm256_maskload_ps(w, vmask); - - const __m256 
vi0 = _mm256_maskload_ps(i0, vmask); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m256 vi1 = _mm256_maskload_ps(i1, vmask); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m256 vprod0 = _mm256_mul_ps(vi0, vw); - const __m256 vprod1 = _mm256_mul_ps(vi1, vw); - - __m256 vacc0 = _mm256_blendv_ps(vi0, vprod0, vi0); - __m256 vacc1 = _mm256_blendv_ps(vi1, vprod1, vi1); - - __m128 vacc0_lo = _mm256_castps256_ps128(vacc0); - __m128 vacc1_lo = _mm256_castps256_ps128(vacc1); - if (c & (4 * sizeof(float))) { - _mm_storeu_ps(o0, vacc0_lo); - _mm_storeu_ps(o1, vacc1_lo); - - vacc0_lo = _mm256_extractf128_ps(vacc0, 1); - vacc1_lo = _mm256_extractf128_ps(vacc1, 1); - - o0 += 4; - o1 += 4; - } - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0_lo); - _mm_storel_pi((__m64*) o1, vacc1_lo); - - vacc0_lo = _mm_movehl_ps(vacc0_lo, vacc0_lo); - vacc1_lo = _mm_movehl_ps(vacc1_lo, vacc1_lo); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0_lo); - _mm_store_ss(o1, vacc1_lo); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-avx512f-2x16.c b/src/f32-prelu/gen/f32-prelu-avx512f-2x16.c deleted file mode 100644 index 4897ecbf064..00000000000 --- a/src/f32-prelu/gen/f32-prelu-avx512f-2x16.c +++ /dev/null @@ -1,97 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/avx512f.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx512f_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m512 vzero = _mm512_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const __m512 vw0123456789ABCDEF = _mm512_load_ps(w); - w += 16; - - const __m512 vi0x0123456789ABCDEF = _mm512_loadu_ps(i0); - i0 += 16; - const __m512 vi1x0123456789ABCDEF = _mm512_loadu_ps(i1); - i1 += 16; - - const __mmask16 vsign0x0123456789ABCDEF = _mm512_cmp_ps_mask(vi0x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc0x0123456789ABCDEF = _mm512_mask_mul_ps(vi0x0123456789ABCDEF, vsign0x0123456789ABCDEF, vi0x0123456789ABCDEF, vw0123456789ABCDEF); - const __mmask16 vsign1x0123456789ABCDEF = _mm512_cmp_ps_mask(vi1x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc1x0123456789ABCDEF = _mm512_mask_mul_ps(vi1x0123456789ABCDEF, vsign1x0123456789ABCDEF, vi1x0123456789ABCDEF, vw0123456789ABCDEF); - - _mm512_storeu_ps(o0, vacc0x0123456789ABCDEF); - o0 += 16; - _mm512_storeu_ps(o1, vacc1x0123456789ABCDEF); - o1 += 16; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 15 * sizeof(float)); - // Prepare mask for valid 32-bit elements (depends on c). - const __mmask16 vmask = _cvtu32_mask16((uint32_t) (UINT32_C(1) << (c >> XNN_LOG2_SIZEOF_FLOAT)) - UINT32_C(1)); - - const __m512 vw = _mm512_maskz_loadu_ps(vmask, w); - - const __m512 vi0 = _mm512_maskz_loadu_ps(vmask, i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m512 vi1 = _mm512_maskz_loadu_ps(vmask, i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __mmask16 vsign0 = _mm512_cmp_ps_mask(vi0, vzero, _CMP_LT_OQ); - const __m512 vacc0 = _mm512_mask_mul_ps(vi0, vsign0, vi0, vw); - const __mmask16 vsign1 = _mm512_cmp_ps_mask(vi1, vzero, _CMP_LT_OQ); - const __m512 vacc1 = _mm512_mask_mul_ps(vi1, vsign1, vi1, vw); - - _mm512_mask_storeu_ps(o0, vmask, vacc0); - o0 = (float*) ((uintptr_t) o0 + c); - _mm512_mask_storeu_ps(o1, vmask, vacc1); - o1 = (float*) ((uintptr_t) o1 + c); - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-avx512f-2x32.c b/src/f32-prelu/gen/f32-prelu-avx512f-2x32.c deleted file mode 100644 index a2f64e7c6c9..00000000000 --- a/src/f32-prelu/gen/f32-prelu-avx512f-2x32.c +++ /dev/null @@ -1,125 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-prelu/avx512f.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx512f_2x32( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m512 vzero = _mm512_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 32 * sizeof(float); c -= 32 * sizeof(float)) { - const __m512 vw0123456789ABCDEF = _mm512_load_ps(w); - const __m512 vwGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 16); - w += 32; - - const __m512 vi0x0123456789ABCDEF = _mm512_loadu_ps(i0); - const __m512 vi0xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i0 + 16); - i0 += 32; - const __m512 vi1x0123456789ABCDEF = _mm512_loadu_ps(i1); - const __m512 vi1xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i1 + 16); - i1 += 32; - - const __mmask16 vsign0x0123456789ABCDEF = _mm512_cmp_ps_mask(vi0x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc0x0123456789ABCDEF = _mm512_mask_mul_ps(vi0x0123456789ABCDEF, vsign0x0123456789ABCDEF, vi0x0123456789ABCDEF, vw0123456789ABCDEF); - const __mmask16 vsign0xGHIJKLMNOPQRSTUV = _mm512_cmp_ps_mask(vi0xGHIJKLMNOPQRSTUV, vzero, _CMP_LT_OQ); - const __m512 vacc0xGHIJKLMNOPQRSTUV = _mm512_mask_mul_ps(vi0xGHIJKLMNOPQRSTUV, vsign0xGHIJKLMNOPQRSTUV, vi0xGHIJKLMNOPQRSTUV, vwGHIJKLMNOPQRSTUV); - const __mmask16 vsign1x0123456789ABCDEF = _mm512_cmp_ps_mask(vi1x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc1x0123456789ABCDEF = _mm512_mask_mul_ps(vi1x0123456789ABCDEF, vsign1x0123456789ABCDEF, vi1x0123456789ABCDEF, vw0123456789ABCDEF); - const __mmask16 vsign1xGHIJKLMNOPQRSTUV = _mm512_cmp_ps_mask(vi1xGHIJKLMNOPQRSTUV, vzero, _CMP_LT_OQ); - const __m512 vacc1xGHIJKLMNOPQRSTUV = _mm512_mask_mul_ps(vi1xGHIJKLMNOPQRSTUV, vsign1xGHIJKLMNOPQRSTUV, vi1xGHIJKLMNOPQRSTUV, vwGHIJKLMNOPQRSTUV); - - _mm512_storeu_ps(o0, vacc0x0123456789ABCDEF); - _mm512_storeu_ps(o0 + 16, vacc0xGHIJKLMNOPQRSTUV); - o0 += 32; - _mm512_storeu_ps(o1, vacc1x0123456789ABCDEF); - _mm512_storeu_ps(o1 + 16, vacc1xGHIJKLMNOPQRSTUV); - o1 += 32; - } - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const __m512 vw = _mm512_load_ps(w); - w += 16; - - const __m512 vi0 = _mm512_loadu_ps(i0); - i0 += 16; - const __m512 vi1 = _mm512_loadu_ps(i1); - i1 += 16; - - const __mmask16 vsign0 = _mm512_cmp_ps_mask(vi0, vzero, _CMP_LT_OQ); - const __m512 vacc0 = _mm512_mask_mul_ps(vi0, vsign0, vi0, vw); - const __mmask16 vsign1 = _mm512_cmp_ps_mask(vi1, vzero, _CMP_LT_OQ); - const __m512 vacc1 = _mm512_mask_mul_ps(vi1, vsign1, vi1, vw); - - _mm512_storeu_ps(o0, vacc0); - o0 += 16; - _mm512_storeu_ps(o1, vacc1); - o1 += 16; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 15 * sizeof(float)); - // Prepare mask 
for valid 32-bit elements (depends on c). - const __mmask16 vmask = _cvtu32_mask16((uint32_t) (UINT32_C(1) << (c >> XNN_LOG2_SIZEOF_FLOAT)) - UINT32_C(1)); - - const __m512 vw = _mm512_maskz_loadu_ps(vmask, w); - - const __m512 vi0 = _mm512_maskz_loadu_ps(vmask, i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m512 vi1 = _mm512_maskz_loadu_ps(vmask, i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __mmask16 vsign0 = _mm512_cmp_ps_mask(vi0, vzero, _CMP_LT_OQ); - const __m512 vacc0 = _mm512_mask_mul_ps(vi0, vsign0, vi0, vw); - const __mmask16 vsign1 = _mm512_cmp_ps_mask(vi1, vzero, _CMP_LT_OQ); - const __m512 vacc1 = _mm512_mask_mul_ps(vi1, vsign1, vi1, vw); - - _mm512_mask_storeu_ps(o0, vmask, vacc0); - o0 = (float*) ((uintptr_t) o0 + c); - _mm512_mask_storeu_ps(o1, vmask, vacc1); - o1 = (float*) ((uintptr_t) o1 + c); - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-1x16.c b/src/f32-prelu/gen/f32-prelu-neon-1x16.c deleted file mode 100644 index 8a43be75c30..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-1x16.c +++ /dev/null @@ -1,109 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - const float32x4_t vw89AB = vld1q_f32(w); w += 4; - const float32x4_t vwCDEF = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0xCDEF = vld1q_f32(i0); i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc0x89AB = vmulq_f32(vi0x89AB, vw89AB); - const uint32x4_t vm0x89AB = vcltq_s32(vreinterpretq_s32_f32(vi0x89AB), vmovq_n_s32(0)); - float32x4_t vacc0xCDEF = vmulq_f32(vi0xCDEF, vwCDEF); - const uint32x4_t vm0xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi0xCDEF), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc0x89AB = vbslq_f32(vm0x89AB, vacc0x89AB, vi0x89AB); - vacc0xCDEF = vbslq_f32(vm0xCDEF, 
vacc0xCDEF, vi0xCDEF); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o0, vacc0x89AB); o0 += 4; - vst1q_f32(o0, vacc0xCDEF); o0 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-1x4.c b/src/f32-prelu/gen/f32-prelu-neon-1x4.c deleted file mode 100644 index cac72266b83..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-1x4.c +++ /dev/null @@ -1,78 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-1x8.c b/src/f32-prelu/gen/f32-prelu-neon-1x8.c deleted file mode 100644 index 0674b56d772..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-1x8.c +++ /dev/null @@ -1,97 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-2x16.c b/src/f32-prelu/gen/f32-prelu-neon-2x16.c deleted file mode 100644 index 68d7f95ef22..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-2x16.c +++ /dev/null @@ -1,152 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - const float32x4_t vw89AB = vld1q_f32(w); w += 4; - const float32x4_t vwCDEF = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0xCDEF = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1xCDEF = vld1q_f32(i1); i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc0x89AB = vmulq_f32(vi0x89AB, vw89AB); - const uint32x4_t vm0x89AB = vcltq_s32(vreinterpretq_s32_f32(vi0x89AB), vmovq_n_s32(0)); - float32x4_t vacc0xCDEF = vmulq_f32(vi0xCDEF, vwCDEF); - const uint32x4_t vm0xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi0xCDEF), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - float32x4_t vacc1x89AB = vmulq_f32(vi1x89AB, vw89AB); - const uint32x4_t vm1x89AB = vcltq_s32(vreinterpretq_s32_f32(vi1x89AB), vmovq_n_s32(0)); - float32x4_t vacc1xCDEF = vmulq_f32(vi1xCDEF, vwCDEF); - const uint32x4_t vm1xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi1xCDEF), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc0x89AB = vbslq_f32(vm0x89AB, vacc0x89AB, vi0x89AB); - vacc0xCDEF = vbslq_f32(vm0xCDEF, vacc0xCDEF, vi0xCDEF); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - vacc1x89AB = vbslq_f32(vm1x89AB, vacc1x89AB, vi1x89AB); - vacc1xCDEF = vbslq_f32(vm1xCDEF, vacc1xCDEF, vi1xCDEF); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o0, vacc0x89AB); o0 += 4; - vst1q_f32(o0, vacc0xCDEF); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o1, vacc1x4567); o1 += 4; - vst1q_f32(o1, vacc1x89AB); o1 += 4; - vst1q_f32(o1, vacc1xCDEF); o1 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const 
float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-2x4.c b/src/f32-prelu/gen/f32-prelu-neon-2x4.c deleted file mode 100644 index 6a0d0d42a18..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-2x4.c +++ /dev/null @@ -1,100 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-2x8.c b/src/f32-prelu/gen/f32-prelu-neon-2x8.c deleted file mode 100644 index 2077ade9aaa..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-2x8.c +++ /dev/null @@ -1,130 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o1, vacc1x4567); o1 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = 
vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-4x16.c b/src/f32-prelu/gen/f32-prelu-neon-4x16.c deleted file mode 100644 index 531c27c58d5..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-4x16.c +++ /dev/null @@ -1,238 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - const float32x4_t vw89AB = vld1q_f32(w); w += 4; - const float32x4_t vwCDEF = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0xCDEF = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1xCDEF = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2xCDEF = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4; - const float32x4_t 
vi3xCDEF = vld1q_f32(i3); i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc0x89AB = vmulq_f32(vi0x89AB, vw89AB); - const uint32x4_t vm0x89AB = vcltq_s32(vreinterpretq_s32_f32(vi0x89AB), vmovq_n_s32(0)); - float32x4_t vacc0xCDEF = vmulq_f32(vi0xCDEF, vwCDEF); - const uint32x4_t vm0xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi0xCDEF), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - float32x4_t vacc1x89AB = vmulq_f32(vi1x89AB, vw89AB); - const uint32x4_t vm1x89AB = vcltq_s32(vreinterpretq_s32_f32(vi1x89AB), vmovq_n_s32(0)); - float32x4_t vacc1xCDEF = vmulq_f32(vi1xCDEF, vwCDEF); - const uint32x4_t vm1xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi1xCDEF), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc2x4567 = vmulq_f32(vi2x4567, vw4567); - const uint32x4_t vm2x4567 = vcltq_s32(vreinterpretq_s32_f32(vi2x4567), vmovq_n_s32(0)); - float32x4_t vacc2x89AB = vmulq_f32(vi2x89AB, vw89AB); - const uint32x4_t vm2x89AB = vcltq_s32(vreinterpretq_s32_f32(vi2x89AB), vmovq_n_s32(0)); - float32x4_t vacc2xCDEF = vmulq_f32(vi2xCDEF, vwCDEF); - const uint32x4_t vm2xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi2xCDEF), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - float32x4_t vacc3x4567 = vmulq_f32(vi3x4567, vw4567); - const uint32x4_t vm3x4567 = vcltq_s32(vreinterpretq_s32_f32(vi3x4567), vmovq_n_s32(0)); - float32x4_t vacc3x89AB = vmulq_f32(vi3x89AB, vw89AB); - const uint32x4_t vm3x89AB = vcltq_s32(vreinterpretq_s32_f32(vi3x89AB), vmovq_n_s32(0)); - float32x4_t vacc3xCDEF = vmulq_f32(vi3xCDEF, vwCDEF); - const uint32x4_t vm3xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi3xCDEF), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc0x89AB = vbslq_f32(vm0x89AB, vacc0x89AB, vi0x89AB); - vacc0xCDEF = vbslq_f32(vm0xCDEF, vacc0xCDEF, vi0xCDEF); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - vacc1x89AB = vbslq_f32(vm1x89AB, vacc1x89AB, vi1x89AB); - vacc1xCDEF = vbslq_f32(vm1xCDEF, vacc1xCDEF, vi1xCDEF); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc2x4567 = vbslq_f32(vm2x4567, vacc2x4567, vi2x4567); - vacc2x89AB = vbslq_f32(vm2x89AB, vacc2x89AB, vi2x89AB); - vacc2xCDEF = vbslq_f32(vm2xCDEF, vacc2xCDEF, vi2xCDEF); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - vacc3x4567 = vbslq_f32(vm3x4567, vacc3x4567, vi3x4567); - vacc3x89AB = vbslq_f32(vm3x89AB, vacc3x89AB, vi3x89AB); - vacc3xCDEF = vbslq_f32(vm3xCDEF, vacc3xCDEF, vi3xCDEF); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o0, vacc0x89AB); o0 += 4; - vst1q_f32(o0, vacc0xCDEF); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o1, vacc1x4567); o1 += 4; - vst1q_f32(o1, 
vacc1x89AB); o1 += 4; - vst1q_f32(o1, vacc1xCDEF); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o2, vacc2x4567); o2 += 4; - vst1q_f32(o2, vacc2x89AB); o2 += 4; - vst1q_f32(o2, vacc2xCDEF); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - vst1q_f32(o3, vacc3x4567); o3 += 4; - vst1q_f32(o3, vacc3x89AB); o3 += 4; - vst1q_f32(o3, vacc3xCDEF); o3 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); - float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - vst1_f32(o2, vacc2x01); o2 += 2; - vst1_f32(o3, vacc3x01); o3 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - vacc2x01 = vget_high_f32(vacc2x0123); - vacc3x01 = vget_high_f32(vacc3x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 
0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - vst1_lane_f32(o2, vacc2x01, 0); o2 += 1; - vst1_lane_f32(o3, vacc3x01, 0); o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-4x4.c b/src/f32-prelu/gen/f32-prelu-neon-4x4.c deleted file mode 100644 index 495bde3e955..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-4x4.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, 
vacc1x0123); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); - float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - vst1_f32(o2, vacc2x01); o2 += 2; - vst1_f32(o3, vacc3x01); o3 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - vacc2x01 = vget_high_f32(vacc2x0123); - vacc3x01 = vget_high_f32(vacc3x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - vst1_lane_f32(o2, vacc2x01, 0); o2 += 1; - vst1_lane_f32(o3, vacc3x01, 0); o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-4x8.c b/src/f32-prelu/gen/f32-prelu-neon-4x8.c deleted file mode 100644 index 8f14872b6bb..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-4x8.c +++ /dev/null @@ -1,196 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc2x4567 = vmulq_f32(vi2x4567, vw4567); - const uint32x4_t vm2x4567 = vcltq_s32(vreinterpretq_s32_f32(vi2x4567), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - float32x4_t vacc3x4567 = vmulq_f32(vi3x4567, vw4567); - const uint32x4_t vm3x4567 = vcltq_s32(vreinterpretq_s32_f32(vi3x4567), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc2x4567 = vbslq_f32(vm2x4567, vacc2x4567, vi2x4567); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - vacc3x4567 = vbslq_f32(vm3x4567, vacc3x4567, vi3x4567); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; 
- vst1q_f32(o1, vacc1x4567); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o2, vacc2x4567); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - vst1q_f32(o3, vacc3x4567); o3 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); - float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - vst1_f32(o2, vacc2x01); o2 += 2; - vst1_f32(o3, vacc3x01); o3 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - vacc2x01 = vget_high_f32(vacc2x0123); - vacc3x01 = vget_high_f32(vacc3x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - vst1_lane_f32(o2, vacc2x01, 0); o2 += 1; - vst1_lane_f32(o3, vacc3x01, 0); o3 += 1; - } - } - i0 = (const float*) 
((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-scalar-2x1.c b/src/f32-prelu/gen/f32-prelu-scalar-2x1.c deleted file mode 100644 index 7d1b7c3fdf2..00000000000 --- a/src/f32-prelu/gen/f32-prelu-scalar-2x1.c +++ /dev/null @@ -1,65 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__scalar_2x1( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - do { - const float vw = *w++; - - const float vi0 = *i0++; - const float vi1 = *i1++; - - const float vacc0 = XNN_UNPREDICTABLE(vi0 < 0.0f) ? vi0 * vw : vi0; - const float vacc1 = XNN_UNPREDICTABLE(vi1 < 0.0f) ? vi1 * vw : vi1; - - *o0++ = vacc0; - *o1++ = vacc1; - - c -= sizeof(float); - } while (c != 0); - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-scalar-2x4.c b/src/f32-prelu/gen/f32-prelu-scalar-2x4.c deleted file mode 100644 index b1bd371446b..00000000000 --- a/src/f32-prelu/gen/f32-prelu-scalar-2x4.c +++ /dev/null @@ -1,102 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
- -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__scalar_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float vw0 = w[0]; - const float vw1 = w[1]; - const float vw2 = w[2]; - const float vw3 = w[3]; - - const float vi0x0 = i0[0]; - const float vi0x1 = i0[1]; - const float vi0x2 = i0[2]; - const float vi0x3 = i0[3]; - i0 += 4; - const float vi1x0 = i1[0]; - const float vi1x1 = i1[1]; - const float vi1x2 = i1[2]; - const float vi1x3 = i1[3]; - i1 += 4; - - const float vacc0x0 = XNN_UNPREDICTABLE(vi0x0 < 0.0f) ? vi0x0 * vw0 : vi0x0; - const float vacc0x1 = XNN_UNPREDICTABLE(vi0x1 < 0.0f) ? vi0x1 * vw1 : vi0x1; - const float vacc0x2 = XNN_UNPREDICTABLE(vi0x2 < 0.0f) ? vi0x2 * vw2 : vi0x2; - const float vacc0x3 = XNN_UNPREDICTABLE(vi0x3 < 0.0f) ? vi0x3 * vw3 : vi0x3; - const float vacc1x0 = XNN_UNPREDICTABLE(vi1x0 < 0.0f) ? vi1x0 * vw0 : vi1x0; - const float vacc1x1 = XNN_UNPREDICTABLE(vi1x1 < 0.0f) ? vi1x1 * vw1 : vi1x1; - const float vacc1x2 = XNN_UNPREDICTABLE(vi1x2 < 0.0f) ? vi1x2 * vw2 : vi1x2; - const float vacc1x3 = XNN_UNPREDICTABLE(vi1x3 < 0.0f) ? vi1x3 * vw3 : vi1x3; - - o0[0] = vacc0x0; - o0[1] = vacc0x1; - o0[2] = vacc0x2; - o0[3] = vacc0x3; - o0 += 4; - o1[0] = vacc1x0; - o1[1] = vacc1x1; - o1[2] = vacc1x2; - o1[3] = vacc1x3; - o1 += 4; - - w += 4; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - const float vi0 = *i0++; - const float vi1 = *i1++; - - const float vacc0 = XNN_UNPREDICTABLE(vi0 < 0.0f) ? vi0 * vw : vi0; - const float vacc1 = XNN_UNPREDICTABLE(vi1 < 0.0f) ? vi1 * vw : vi1; - - *o0++ = vacc0; - *o1++ = vacc1; - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse-2x4.c b/src/f32-prelu/gen/f32-prelu-sse-2x4.c deleted file mode 100644 index f3a5063f78d..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse-2x4.c +++ /dev/null @@ -1,111 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m128 vzero = _mm_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse-2x8.c b/src/f32-prelu/gen/f32-prelu-sse-2x8.c deleted file mode 100644 index b361d5a293c..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse-2x8.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m128 vzero = _mm_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - const __m128 vw4567 = _mm_load_ps(w + 4); - w += 8; - - __m128 vi0x0123 = _mm_loadu_ps(i0); - __m128 vi0x4567 = _mm_loadu_ps(i0 + 4); - i0 += 8; - __m128 vi1x0123 = _mm_loadu_ps(i1); - __m128 vi1x4567 = _mm_loadu_ps(i1 + 4); - i1 += 8; - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc0x4567 = _mm_max_ps(_mm_setzero_ps(), vi0x4567); - vi0x4567 = _mm_min_ps(vi0x4567, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - __m128 vacc1x4567 = _mm_max_ps(_mm_setzero_ps(), vi1x4567); - vi1x4567 = _mm_min_ps(vi1x4567, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(vi0x4567, vw4567)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(vi1x4567, vw4567)); - - _mm_storeu_ps(o0, vacc0x0123); - _mm_storeu_ps(o0 + 4, vacc0x4567); - o0 += 8; - _mm_storeu_ps(o1, vacc1x0123); - _mm_storeu_ps(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = 
_mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse2-2x4.c b/src/f32-prelu/gen/f32-prelu-sse2-2x4.c deleted file mode 100644 index a4d3ca533da..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse2-2x4.c +++ /dev/null @@ -1,110 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse2_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - const __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - const __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - if (c & (2 
* sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse2-2x8.c b/src/f32-prelu/gen/f32-prelu-sse2-2x8.c deleted file mode 100644 index 87f3ebbd99f..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse2-2x8.c +++ /dev/null @@ -1,143 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse2_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - const __m128 vw4567 = _mm_load_ps(w + 4); - w += 8; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4); - i0 += 8; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4); - i1 += 8; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod0x4567 = _mm_mul_ps(vi0x4567, vw4567); - const __m128 vmask0x4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x4567))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - const __m128 vprod1x4567 = _mm_mul_ps(vi1x4567, vw4567); - const __m128 vmask1x4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x4567))); - - const __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - const __m128 vacc0x4567 = _mm_or_ps(_mm_and_ps(vprod0x4567, vmask0x4567), _mm_andnot_ps(vmask0x4567, vi0x4567)); - const __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - const __m128 vacc1x4567 = _mm_or_ps(_mm_and_ps(vprod1x4567, vmask1x4567), _mm_andnot_ps(vmask1x4567, vi1x4567)); - - _mm_storeu_ps(o0, vacc0x0123); - _mm_storeu_ps(o0 + 4, vacc0x4567); - 
o0 += 8; - _mm_storeu_ps(o1, vacc1x0123); - _mm_storeu_ps(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse41-2x4.c b/src/f32-prelu/gen/f32-prelu-sse41-2x4.c deleted file mode 100644 index 1bcec873fe7..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse41-2x4.c +++ /dev/null @@ -1,106 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse41_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - const __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - const __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse41-2x8.c b/src/f32-prelu/gen/f32-prelu-sse41-2x8.c deleted file mode 100644 index 21548d5c193..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse41-2x8.c +++ /dev/null @@ -1,135 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse41_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - const __m128 vw4567 = _mm_load_ps(w + 4); - w += 8; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4); - i0 += 8; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4); - i1 += 8; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod0x4567 = _mm_mul_ps(vi0x4567, vw4567); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vprod1x4567 = _mm_mul_ps(vi1x4567, vw4567); - - const __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - const __m128 vacc0x4567 = _mm_blendv_ps(vi0x4567, vprod0x4567, vi0x4567); - const __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - const __m128 vacc1x4567 = _mm_blendv_ps(vi1x4567, vprod1x4567, vi1x4567); - - _mm_storeu_ps(o0, vacc0x0123); - _mm_storeu_ps(o0 + 4, vacc0x4567); - o0 += 8; - _mm_storeu_ps(o1, vacc1x0123); - _mm_storeu_ps(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) 
((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasm-2x1.c b/src/f32-prelu/gen/f32-prelu-wasm-2x1.c deleted file mode 100644 index f4d21934848..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasm-2x1.c +++ /dev/null @@ -1,71 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasm.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasm_2x1( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const float vzero = 0.0f; - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - do { - const float vw = *w++; - - float vi0 = *i0++; - float vi1 = *i1++; - - float vacc0 = __builtin_wasm_max_f32(vi0, vzero); - vi0 = __builtin_wasm_min_f32(vi0, vzero); - float vacc1 = __builtin_wasm_max_f32(vi1, vzero); - vi1 = __builtin_wasm_min_f32(vi1, vzero); - - vacc0 += vi0 * vw; - vacc1 += vi1 * vw; - - *o0++ = vacc0; - *o1++ = vacc1; - - c -= sizeof(float); - } while (c != 0); - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasm-2x4.c b/src/f32-prelu/gen/f32-prelu-wasm-2x4.c deleted file mode 100644 index beee640db7f..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasm-2x4.c +++ /dev/null @@ -1,125 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasm.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasm_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const float vzero = 0.0f; - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float vw0 = w[0]; - const float vw1 = w[1]; - const float vw2 = w[2]; - const float vw3 = w[3]; - - float vi0x0 = i0[0]; - float vi0x1 = i0[1]; - float vi0x2 = i0[2]; - float vi0x3 = i0[3]; - i0 += 4; - float vi1x0 = i1[0]; - float vi1x1 = i1[1]; - float vi1x2 = i1[2]; - float vi1x3 = i1[3]; - i1 += 4; - - float vacc0x0 = __builtin_wasm_max_f32(vi0x0, vzero); - vi0x0 = __builtin_wasm_min_f32(vi0x0, vzero); - float vacc0x1 = __builtin_wasm_max_f32(vi0x1, vzero); - vi0x1 = __builtin_wasm_min_f32(vi0x1, vzero); - float vacc0x2 = __builtin_wasm_max_f32(vi0x2, vzero); - vi0x2 = __builtin_wasm_min_f32(vi0x2, vzero); - float vacc0x3 = __builtin_wasm_max_f32(vi0x3, vzero); - vi0x3 = __builtin_wasm_min_f32(vi0x3, vzero); - float vacc1x0 = __builtin_wasm_max_f32(vi1x0, vzero); - vi1x0 = __builtin_wasm_min_f32(vi1x0, vzero); - float vacc1x1 = __builtin_wasm_max_f32(vi1x1, vzero); - vi1x1 = __builtin_wasm_min_f32(vi1x1, vzero); - float vacc1x2 = __builtin_wasm_max_f32(vi1x2, vzero); - vi1x2 = __builtin_wasm_min_f32(vi1x2, vzero); - float vacc1x3 = __builtin_wasm_max_f32(vi1x3, vzero); - vi1x3 = __builtin_wasm_min_f32(vi1x3, vzero); - - vacc0x0 += vi0x0 * vw0; - vacc0x1 += vi0x1 * vw1; - vacc0x2 += vi0x2 * vw2; - vacc0x3 += vi0x3 * vw3; - vacc1x0 += vi1x0 * vw0; - vacc1x1 += vi1x1 * vw1; - vacc1x2 += vi1x2 * vw2; - vacc1x3 += vi1x3 * vw3; - - o0[0] = vacc0x0; - o0[1] = vacc0x1; - o0[2] = vacc0x2; - o0[3] = vacc0x3; - o0 += 4; - o1[0] = vacc1x0; - o1[1] = vacc1x1; - o1[2] = vacc1x2; - o1[3] = vacc1x3; - o1 += 4; - - w += 4; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - float vi0 = *i0++; - float vi1 = *i1++; - - float vacc0 = __builtin_wasm_max_f32(vi0, vzero); - vi0 = __builtin_wasm_min_f32(vi0, vzero); - float vacc1 = __builtin_wasm_max_f32(vi1, vzero); - vi1 = __builtin_wasm_min_f32(vi1, vzero); - - vacc0 += vi0 * vw; - vacc1 += vi1 * vw; - - *o0++ = vacc0; - *o1++ = vacc1; - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c deleted file mode 100644 index c3f3c18d6a6..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c +++ /dev/null @@ -1,119 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc0x89AB = wasm_f32x4_relaxed_madd(vi0x89AB, vw89AB, vacc0x89AB); - vacc0xCDEF = wasm_f32x4_relaxed_madd(vi0xCDEF, vwCDEF, vacc0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c 
b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c deleted file mode 100644 index 36bb65e0b3c..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c +++ /dev/null @@ -1,86 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c deleted file mode 100644 index 5fd22f53efa..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c +++ /dev/null @@ -1,107 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c deleted file mode 100644 index 473d4b5273e..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c +++ /dev/null @@ -1,166 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc0x89AB = wasm_f32x4_relaxed_madd(vi0x89AB, vw89AB, vacc0x89AB); - vacc0xCDEF = wasm_f32x4_relaxed_madd(vi0xCDEF, vwCDEF, vacc0xCDEF); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - vacc1x89AB = wasm_f32x4_relaxed_madd(vi1x89AB, vw89AB, vacc1x89AB); - vacc1xCDEF = wasm_f32x4_relaxed_madd(vi1xCDEF, vwCDEF, vacc1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - 
v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c deleted file mode 100644 index ddf7efb3685..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c +++ /dev/null @@ -1,111 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c deleted file mode 100644 index cd4ce1f33e7..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = 
wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c deleted file mode 100644 index 084aa6997e3..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c +++ /dev/null @@ -1,260 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - v128_t vi2x89AB = wasm_v128_load(i2 + 8); - v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - v128_t vi3x89AB = wasm_v128_load(i3 + 8); - v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = 
wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc2x89AB = wasm_i32x4_max(vi2x89AB, vzero); - vi2x89AB = wasm_i32x4_min(vi2x89AB, vzero); - v128_t vacc2xCDEF = wasm_i32x4_max(vi2xCDEF, vzero); - vi2xCDEF = wasm_i32x4_min(vi2xCDEF, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - v128_t vacc3x89AB = wasm_i32x4_max(vi3x89AB, vzero); - vi3x89AB = wasm_i32x4_min(vi3x89AB, vzero); - v128_t vacc3xCDEF = wasm_i32x4_max(vi3xCDEF, vzero); - vi3xCDEF = wasm_i32x4_min(vi3xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc0x89AB = wasm_f32x4_relaxed_madd(vi0x89AB, vw89AB, vacc0x89AB); - vacc0xCDEF = wasm_f32x4_relaxed_madd(vi0xCDEF, vwCDEF, vacc0xCDEF); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - vacc1x89AB = wasm_f32x4_relaxed_madd(vi1x89AB, vw89AB, vacc1x89AB); - vacc1xCDEF = wasm_f32x4_relaxed_madd(vi1xCDEF, vwCDEF, vacc1xCDEF); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc2x4567 = wasm_f32x4_relaxed_madd(vi2x4567, vw4567, vacc2x4567); - vacc2x89AB = wasm_f32x4_relaxed_madd(vi2x89AB, vw89AB, vacc2x89AB); - vacc2xCDEF = wasm_f32x4_relaxed_madd(vi2xCDEF, vwCDEF, vacc2xCDEF); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - vacc3x4567 = wasm_f32x4_relaxed_madd(vi3x4567, vw4567, vacc3x4567); - vacc3x89AB = wasm_f32x4_relaxed_madd(vi3x89AB, vw89AB, vacc3x89AB); - vacc3xCDEF = wasm_f32x4_relaxed_madd(vi3xCDEF, vwCDEF, vacc3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - 
v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c deleted file mode 100644 index 2606faa27e4..00000000000 --- 
a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c +++ /dev/null @@ -1,161 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - 
vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c deleted file mode 100644 index 1a4595e0c98..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c +++ /dev/null @@ -1,218 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc2x4567 = wasm_f32x4_relaxed_madd(vi2x4567, vw4567, vacc2x4567); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - vacc3x4567 = wasm_f32x4_relaxed_madd(vi3x4567, vw4567, vacc3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - o3 += 8; - } - for (; c >= 4 * 
sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); 
- rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c deleted file mode 100644 index 974dab42c07..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c +++ /dev/null @@ -1,118 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_i32x4_relaxed_laneselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_i32x4_relaxed_laneselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - 
- if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c deleted file mode 100644 index 88c927a6504..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c deleted file mode 100644 index 5e0deffdbcb..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c +++ /dev/null @@ -1,106 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c deleted file mode 100644 index dad82d1178e..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c +++ /dev/null @@ -1,165 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_i32x4_relaxed_laneselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_i32x4_relaxed_laneselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_i32x4_relaxed_laneselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_i32x4_relaxed_laneselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - 
const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c deleted file mode 100644 index fc94f57f6ef..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c +++ /dev/null @@ -1,110 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c deleted file mode 100644 index 22ba498bae0..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c +++ /dev/null @@ -1,143 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = 
wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c deleted file mode 100644 index 029d5b5aad1..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c +++ /dev/null @@ -1,259 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - const v128_t vi2x89AB = wasm_v128_load(i2 + 8); - const v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = 
wasm_v128_load(i3 + 4); - const v128_t vi3x89AB = wasm_v128_load(i3 + 8); - const v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc2x89AB = wasm_f32x4_mul(vi2x89AB, vw89AB); - const v128_t vmask2x89AB = wasm_i32x4_shr(vi2x89AB, 31); - v128_t vacc2xCDEF = wasm_f32x4_mul(vi2xCDEF, vwCDEF); - const v128_t vmask2xCDEF = wasm_i32x4_shr(vi2xCDEF, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - v128_t vacc3x89AB = wasm_f32x4_mul(vi3x89AB, vw89AB); - const v128_t vmask3x89AB = wasm_i32x4_shr(vi3x89AB, 31); - v128_t vacc3xCDEF = wasm_f32x4_mul(vi3xCDEF, vwCDEF); - const v128_t vmask3xCDEF = wasm_i32x4_shr(vi3xCDEF, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_i32x4_relaxed_laneselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_i32x4_relaxed_laneselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_i32x4_relaxed_laneselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_i32x4_relaxed_laneselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_i32x4_relaxed_laneselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc2x89AB = wasm_i32x4_relaxed_laneselect(vacc2x89AB, vi2x89AB, vmask2x89AB); - vacc2xCDEF = wasm_i32x4_relaxed_laneselect(vacc2xCDEF, vi2xCDEF, vmask2xCDEF); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_i32x4_relaxed_laneselect(vacc3x4567, vi3x4567, vmask3x4567); - vacc3x89AB = wasm_i32x4_relaxed_laneselect(vacc3x89AB, vi3x89AB, vmask3x89AB); - vacc3xCDEF = wasm_i32x4_relaxed_laneselect(vacc3xCDEF, vi3xCDEF, vmask3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, 
vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - 
wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c deleted file mode 100644 index c5a1998e223..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c +++ /dev/null @@ -1,160 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, 
vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c deleted file mode 100644 index 31a49e32df4..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c +++ /dev/null @@ -1,217 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
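// Reference sketch (not from the deleted sources; name is hypothetical): the
// f32-prelu *wasmrelaxedsimd laneselect* ukernels removed in this diff all
// compute the same per-element function. Each lane evaluates x * w, derives an
// all-ones mask from the sign bit of x (wasm_i32x4_shr(x, 31)), and uses
// wasm_i32x4_relaxed_laneselect to keep x where it is non-negative and x * w
// where it is negative. A minimal scalar equivalent, with `channels` as an
// element count (the ukernels take it in bytes) and byte strides as above:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void prelu_laneselect_reference(size_t rows, size_t channels,
                                       const float* input, size_t input_stride,
                                       const float* weights,
                                       float* output, size_t output_stride) {
  for (size_t r = 0; r < rows; r++) {
    const float* x = (const float*) ((const char*) input + r * input_stride);
    float* y = (float*) ((char*) output + r * output_stride);
    for (size_t c = 0; c < channels; c++) {
      uint32_t bits;
      memcpy(&bits, &x[c], sizeof(bits));
      const uint32_t sign = bits >> 31;  // 1 if x is negative (or -0.0f)
      // Negative lanes take x * w, non-negative lanes pass x through.
      y[c] = sign ? x[c] * weights[c] : x[c];
    }
  }
}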
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_i32x4_relaxed_laneselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_i32x4_relaxed_laneselect(vacc3x4567, vi3x4567, vmask3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; 
- wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - o3 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - 
i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c deleted file mode 100644 index 7aba563c465..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c +++ /dev/null @@ -1,119 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc0x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi0x89AB, vw89AB), vacc0x89AB); - vacc0xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi0xCDEF, vwCDEF), vacc0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = 
(const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c deleted file mode 100644 index 56c78fcc29a..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c +++ /dev/null @@ -1,86 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c deleted file mode 100644 index 221d1f4a79c..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c +++ /dev/null @@ -1,107 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c deleted file mode 100644 index 4980c8327cc..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c +++ /dev/null @@ -1,166 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
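// Reference sketch (not from the deleted sources; name is hypothetical): the
// f32-prelu *wasmsimd iminmax* ukernels removed in this diff avoid a lane
// select. They apply integer max/min against zero to the raw float bit
// patterns, which splits each lane by its sign: acc = wasm_i32x4_max(x, 0)
// keeps non-negative lanes, x = wasm_i32x4_min(x, 0) keeps negative lanes,
// and the result is acc + x * w. A scalar sketch of the same arithmetic for
// ordinary values (signed zeros and NaNs aside), with a per-row element count:

#include <stddef.h>

static void prelu_iminmax_reference(size_t n, const float* x,
                                    const float* w, float* y) {
  for (size_t i = 0; i < n; i++) {
    const float pos = x[i] > 0.0f ? x[i] : 0.0f;  // wasm_i32x4_max(x, 0) per lane
    const float neg = x[i] < 0.0f ? x[i] : 0.0f;  // wasm_i32x4_min(x, 0) per lane
    y[i] = neg * w[i] + pos;                      // multiply then add, as in the kernel
  }
}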
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc0x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi0x89AB, vw89AB), vacc0x89AB); - vacc0xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi0xCDEF, vwCDEF), vacc0xCDEF); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - vacc1x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi1x89AB, vw89AB), vacc1x89AB); - vacc1xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi1xCDEF, vwCDEF), vacc1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); 
- vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c deleted file mode 100644 index d1117ad4b3b..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c +++ /dev/null @@ -1,111 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c deleted file mode 100644 index a293e9533e6..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - 
wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c deleted file mode 100644 index 0d01bf6fef6..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c +++ /dev/null @@ -1,260 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - v128_t vi2x89AB = wasm_v128_load(i2 + 8); - v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - v128_t vi3x89AB = wasm_v128_load(i3 + 8); - v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t 
vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc2x89AB = wasm_i32x4_max(vi2x89AB, vzero); - vi2x89AB = wasm_i32x4_min(vi2x89AB, vzero); - v128_t vacc2xCDEF = wasm_i32x4_max(vi2xCDEF, vzero); - vi2xCDEF = wasm_i32x4_min(vi2xCDEF, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - v128_t vacc3x89AB = wasm_i32x4_max(vi3x89AB, vzero); - vi3x89AB = wasm_i32x4_min(vi3x89AB, vzero); - v128_t vacc3xCDEF = wasm_i32x4_max(vi3xCDEF, vzero); - vi3xCDEF = wasm_i32x4_min(vi3xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc0x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi0x89AB, vw89AB), vacc0x89AB); - vacc0xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi0xCDEF, vwCDEF), vacc0xCDEF); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - vacc1x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi1x89AB, vw89AB), vacc1x89AB); - vacc1xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi1xCDEF, vwCDEF), vacc1xCDEF); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vw4567), vacc2x4567); - vacc2x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi2x89AB, vw89AB), vacc2x89AB); - vacc2xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi2xCDEF, vwCDEF), vacc2xCDEF); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vw4567), vacc3x4567); - vacc3x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi3x89AB, vw89AB), vacc3x89AB); - vacc3xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi3xCDEF, vwCDEF), vacc3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = 
wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git 
a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c deleted file mode 100644 index 97e712636f5..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c +++ /dev/null @@ -1,161 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = 
wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c deleted file mode 100644 index a62ad1e8412..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c +++ /dev/null @@ -1,218 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vw4567), vacc2x4567); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vw4567), vacc3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, 
vacc3x4567); - o3 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = 
(const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c deleted file mode 100644 index e0df620d3a0..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c +++ /dev/null @@ -1,118 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_v128_bitselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_v128_bitselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, 
vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c deleted file mode 100644 index 0f8d2658807..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c deleted file mode 100644 index 47cf36c4134..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c +++ /dev/null @@ -1,106 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c deleted file mode 100644 index 0d7bd5537f4..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c +++ /dev/null @@ -1,165 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_v128_bitselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_v128_bitselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_v128_bitselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_v128_bitselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = 
wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c deleted file mode 100644 index a5f295680eb..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c +++ /dev/null @@ -1,110 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c deleted file mode 100644 index b1aa60d17c0..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c +++ /dev/null @@ -1,143 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - 
wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c deleted file mode 100644 index 3cdd0b9793d..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c +++ /dev/null @@ -1,259 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - const v128_t vi2x89AB = wasm_v128_load(i2 + 8); - const v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = wasm_v128_load(i3 + 4); - const v128_t vi3x89AB = wasm_v128_load(i3 + 8); - const v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - 
v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc2x89AB = wasm_f32x4_mul(vi2x89AB, vw89AB); - const v128_t vmask2x89AB = wasm_i32x4_shr(vi2x89AB, 31); - v128_t vacc2xCDEF = wasm_f32x4_mul(vi2xCDEF, vwCDEF); - const v128_t vmask2xCDEF = wasm_i32x4_shr(vi2xCDEF, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - v128_t vacc3x89AB = wasm_f32x4_mul(vi3x89AB, vw89AB); - const v128_t vmask3x89AB = wasm_i32x4_shr(vi3x89AB, 31); - v128_t vacc3xCDEF = wasm_f32x4_mul(vi3xCDEF, vwCDEF); - const v128_t vmask3xCDEF = wasm_i32x4_shr(vi3xCDEF, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_v128_bitselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_v128_bitselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_v128_bitselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_v128_bitselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_v128_bitselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc2x89AB = wasm_v128_bitselect(vacc2x89AB, vi2x89AB, vmask2x89AB); - vacc2xCDEF = wasm_v128_bitselect(vacc2xCDEF, vi2xCDEF, vmask2xCDEF); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vi3x4567, vmask3x4567); - vacc3x89AB = wasm_v128_bitselect(vacc3x89AB, vi3x89AB, vmask3x89AB); - vacc3xCDEF = wasm_v128_bitselect(vacc3xCDEF, vi3xCDEF, vmask3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - 
wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + 
input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c deleted file mode 100644 index 894ab77465b..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c +++ /dev/null @@ -1,160 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = 
wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c deleted file mode 100644 index f160a03e220..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c +++ /dev/null @@ -1,217 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_v128_bitselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vi3x4567, vmask3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - 
wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - o3 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - 
o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/neon.c.in b/src/f32-prelu/neon.c.in deleted file mode 100644 index 3774bbebc97..00000000000 --- a/src/f32-prelu/neon.c.in +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - $for C in range(0, CHANNEL_TILE, 4): - const float32x4_t vw${ABC[C:C+4]} = vld1q_f32(w); w += 4; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - const float32x4_t vi${M}x${ABC[C:C+4]} = vld1q_f32(i${M}); i${M} += 4; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - float32x4_t vacc${M}x${ABC[C:C+4]} = vmulq_f32(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}); - const uint32x4_t vm${M}x${ABC[C:C+4]} = vcltq_s32(vreinterpretq_s32_f32(vi${M}x${ABC[C:C+4]}), vmovq_n_s32(0)); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - vacc${M}x${ABC[C:C+4]} = vbslq_f32(vm${M}x${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - vst1q_f32(o${M}, vacc${M}x${ABC[C:C+4]}); o${M} += 4; - } - $if CHANNEL_TILE != 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - $for M in range(ROW_TILE): - const float32x4_t vi${M}x0123 = vld1q_f32(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - float32x4_t vacc${M}x0123 = vmulq_f32(vi${M}x0123, vw0123); - const uint32x4_t vm${M}x0123 = vcltq_s32(vreinterpretq_s32_f32(vi${M}x0123), vmovq_n_s32(0)); - - $for M in range(ROW_TILE): - vacc${M}x0123 = vbslq_f32(vm${M}x0123, vacc${M}x0123, vi${M}x0123); - - $for M in range(ROW_TILE): - vst1q_f32(o${M}, vacc${M}x0123); o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - $for M in range(ROW_TILE): - const float32x4_t vi${M}x0123 = vld1q_f32(i${M}); - 
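The NEON template in this hunk builds its select mask from the sign bit (vcltq_s32 on the reinterpreted input) and blends with vbslq_f32. A hedged single-vector sketch of that idiom; the helper name prelu_neon_4 is illustrative, not the template's code.

#include <arm_neon.h>

// Illustrative helper: PReLU on one vector of four floats.
static inline float32x4_t prelu_neon_4(float32x4_t vx, float32x4_t vw) {
  float32x4_t vacc = vmulq_f32(vx, vw);
  // All-ones lanes where the sign bit of the input is set (x < 0, including -0.0f).
  const uint32x4_t vmask = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
  // Keep x*w where the mask is set, pass x through elsewhere.
  return vbslq_f32(vmask, vacc, vx);
}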
i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - float32x4_t vacc${M}x0123 = vmulq_f32(vi${M}x0123, vw0123); - const uint32x4_t vm${M}x0123 = vcltq_s32(vreinterpretq_s32_f32(vi${M}x0123), vmovq_n_s32(0)); - - $for M in range(ROW_TILE): - vacc${M}x0123 = vbslq_f32(vm${M}x0123, vacc${M}x0123, vi${M}x0123); - - $for M in range(ROW_TILE): - float32x2_t vacc${M}x01 = vget_low_f32(vacc${M}x0123); - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - vst1_f32(o${M}, vacc${M}x01); o${M} += 2; - - $for M in range(ROW_TILE): - vacc${M}x01 = vget_high_f32(vacc${M}x0123); - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - vst1_lane_f32(o${M}, vacc${M}x01, 0); o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/scalar.c.in b/src/f32-prelu/scalar.c.in deleted file mode 100644 index ba3f11fbaf7..00000000000 --- a/src/f32-prelu/scalar.c.in +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE > 0 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__scalar_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - $if CHANNEL_TILE > 1: - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - $for C in range(CHANNEL_TILE): - const float vw${ABC[C]} = w[${C}]; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - const float vi${M}x${ABC[C]} = i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - const float vacc${M}x${ABC[C]} = XNN_UNPREDICTABLE(vi${M}x${ABC[C]} < 0.0f) ? vi${M}x${ABC[C]} * vw${ABC[C]} : vi${M}x${ABC[C]}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - o${M}[${C}] = vacc${M}x${ABC[C]}; - o${M} += ${CHANNEL_TILE}; - - w += ${CHANNEL_TILE}; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - $for M in range(ROW_TILE): - const float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - const float vacc${M} = XNN_UNPREDICTABLE(vi${M} < 0.0f) ? 
vi${M} * vw : vi${M}; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - } - $else: - do { - const float vw = *w++; - - $for M in range(ROW_TILE): - const float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - const float vacc${M} = XNN_UNPREDICTABLE(vi${M} < 0.0f) ? vi${M} * vw : vi${M}; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - - c -= sizeof(float); - } while (c != 0); - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/sse.c.in b/src/f32-prelu/sse.c.in deleted file mode 100644 index 7e51e3a496d..00000000000 --- a/src/f32-prelu/sse.c.in +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$assert SSE in [1, 2, 4] -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -$SSE_HEADER = {1: "xmmintrin.h", 2: "emmintrin.h", 4: "smmintrin.h"}[SSE] -#include - -#include <${SSE_HEADER}> - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -$ISA = {1: "sse", 2: "sse2", 4: "sse41"}[SSE] -void xnn_f32_prelu_ukernel__${ISA}_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - $if SSE == 1: - const __m128 vzero = _mm_setzero_ps(); - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const __m128 vw${ABC[0:4]} = _mm_load_ps(w); - $for C in range(4, CHANNEL_TILE, 4): - const __m128 vw${ABC[C:C+4]} = _mm_load_ps(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vi${M}x${ABC[0:4]} = _mm_loadu_ps(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - __m128 vi${M}x${ABC[C:C+4]} = _mm_loadu_ps(i${M} + ${C}); - $else: - const __m128 vi${M}x${ABC[0:4]} = _mm_loadu_ps(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - const __m128 vi${M}x${ABC[C:C+4]} = _mm_loadu_ps(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - $if SSE == 1: - __m128 vacc${M}x${ABC[C:C+4]} = _mm_max_ps(_mm_setzero_ps(), vi${M}x${ABC[C:C+4]}); - vi${M}x${ABC[C:C+4]} = _mm_min_ps(vi${M}x${ABC[C:C+4]}, vzero); - $else: - const __m128 vprod${M}x${ABC[C:C+4]} = _mm_mul_ps(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}); - $if SSE == 2: - const __m128 vmask${M}x${ABC[C:C+4]} = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi${M}x${ABC[C:C+4]}))); - - $for M in range(ROW_TILE): 
- $for C in range(0, CHANNEL_TILE, 4): - $if SSE == 1: - vacc${M}x${ABC[C:C+4]} = _mm_add_ps(vacc${M}x${ABC[C:C+4]}, _mm_mul_ps(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]})); - $elif SSE == 2: - const __m128 vacc${M}x${ABC[C:C+4]} = _mm_or_ps(_mm_and_ps(vprod${M}x${ABC[C:C+4]}, vmask${M}x${ABC[C:C+4]}), _mm_andnot_ps(vmask${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]})); - $elif SSE == 4: - const __m128 vacc${M}x${ABC[C:C+4]} = _mm_blendv_ps(vi${M}x${ABC[C:C+4]}, vprod${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - _mm_storeu_ps(o${M}, vacc${M}x${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - _mm_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+4]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - $else: - const __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vacc${M}x0123 = _mm_max_ps(_mm_setzero_ps(), vi${M}x0123); - vi${M}x0123 = _mm_min_ps(vi${M}x0123, vzero); - $else: - const __m128 vprod${M}x0123 = _mm_mul_ps(vi${M}x0123, vw0123); - $if SSE == 2: - const __m128 vmask${M}x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi${M}x0123))); - - $for M in range(ROW_TILE): - $if SSE == 1: - vacc${M}x0123 = _mm_add_ps(vacc${M}x0123, _mm_mul_ps(vi${M}x0123, vw0123)); - $elif SSE == 2: - __m128 vacc${M}x0123 = _mm_or_ps(_mm_and_ps(vprod${M}x0123, vmask${M}x0123), _mm_andnot_ps(vmask${M}x0123, vi${M}x0123)); - $elif SSE == 4: - __m128 vacc${M}x0123 = _mm_blendv_ps(vi${M}x0123, vprod${M}x0123, vi${M}x0123); - - $for M in range(ROW_TILE): - _mm_storeu_ps(o${M}, vacc${M}x0123); - o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - $else: - const __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vacc${M}x0123 = _mm_max_ps(_mm_setzero_ps(), vi${M}x0123); - vi${M}x0123 = _mm_min_ps(vi${M}x0123, vzero); - $else: - const __m128 vprod${M}x0123 = _mm_mul_ps(vi${M}x0123, vw0123); - $if SSE == 2: - const __m128 vmask${M}x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi${M}x0123))); - - $for M in range(ROW_TILE): - $if SSE == 1: - vacc${M}x0123 = _mm_add_ps(vacc${M}x0123, _mm_mul_ps(vi${M}x0123, vw0123)); - $elif SSE == 2: - __m128 vacc${M}x0123 = _mm_or_ps(_mm_and_ps(vprod${M}x0123, vmask${M}x0123), _mm_andnot_ps(vmask${M}x0123, vi${M}x0123)); - $elif SSE == 4: - __m128 vacc${M}x0123 = _mm_blendv_ps(vi${M}x0123, vprod${M}x0123, vi${M}x0123); - - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_storel_pi((__m64*) o${M}, vacc${M}x0123); - - $for M in range(ROW_TILE): - vacc${M}x0123 = _mm_movehl_ps(vacc${M}x0123, vacc${M}x0123); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_store_ss(o${M}, vacc${M}x0123); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/wasm.c.in 
b/src/f32-prelu/wasm.c.in deleted file mode 100644 index fd6801ffa3b..00000000000 --- a/src/f32-prelu/wasm.c.in +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE > 0 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasm_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - const float vzero = 0.0f; - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - $if CHANNEL_TILE > 1: - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - $for C in range(CHANNEL_TILE): - const float vw${ABC[C]} = w[${C}]; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - float vi${M}x${ABC[C]} = i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - float vacc${M}x${ABC[C]} = __builtin_wasm_max_f32(vi${M}x${ABC[C]}, vzero); - vi${M}x${ABC[C]} = __builtin_wasm_min_f32(vi${M}x${ABC[C]}, vzero); - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - vacc${M}x${ABC[C]} += vi${M}x${ABC[C]} * vw${ABC[C]}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - o${M}[${C}] = vacc${M}x${ABC[C]}; - o${M} += ${CHANNEL_TILE}; - - w += ${CHANNEL_TILE}; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - $for M in range(ROW_TILE): - float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - float vacc${M} = __builtin_wasm_max_f32(vi${M}, vzero); - vi${M} = __builtin_wasm_min_f32(vi${M}, vzero); - - $for M in range(ROW_TILE): - vacc${M} += vi${M} * vw; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - } - $else: - do { - const float vw = *w++; - - $for M in range(ROW_TILE): - float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - float vacc${M} = __builtin_wasm_max_f32(vi${M}, vzero); - vi${M} = __builtin_wasm_min_f32(vi${M}, vzero); - - $for M in range(ROW_TILE): - vacc${M} += vi${M} * vw; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - - c -= sizeof(float); - } while (c != 0); - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/wasmsimd-iminmax.c.in b/src/f32-prelu/wasmsimd-iminmax.c.in deleted file mode 100644 index 6a9920b5317..00000000000 --- a/src/f32-prelu/wasmsimd-iminmax.c.in +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is 
licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -$ISA = "wasmrelaxedsimd" if RELAXED else "wasmsimd" -void xnn_f32_prelu_ukernel__${ISA}_iminmax_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const v128_t vw${ABC[0:4]} = wasm_v128_load(w); - $for C in range(4, CHANNEL_TILE, 4): - const v128_t vw${ABC[C:C+4]} = wasm_v128_load(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - v128_t vi${M}x${ABC[0:4]} = wasm_v128_load(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - v128_t vi${M}x${ABC[C:C+4]} = wasm_v128_load(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - v128_t vacc${M}x${ABC[C:C+4]} = wasm_i32x4_max(vi${M}x${ABC[C:C+4]}, vzero); - vi${M}x${ABC[C:C+4]} = wasm_i32x4_min(vi${M}x${ABC[C:C+4]}, vzero); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - $if RELAXED: - vacc${M}x${ABC[C:C+4]} = wasm_f32x4_relaxed_madd(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}); - $else: - vacc${M}x${ABC[C:C+4]} = wasm_f32x4_add(wasm_f32x4_mul(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}), vacc${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - wasm_v128_store(o${M} + ${C}, vacc${M}x${ABC[C:C+4]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - $for M in range(ROW_TILE): - v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_i32x4_max(vi${M}x0123, vzero); - vi${M}x0123 = wasm_i32x4_min(vi${M}x0123, vzero); - - $for M in range(ROW_TILE): - $if RELAXED: - vacc${M}x0123 = wasm_f32x4_relaxed_madd(vi${M}x0123, vw0123, vacc${M}x0123); - $else: - vacc${M}x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi${M}x0123, vw0123), vacc${M}x0123); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x0123); - o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - $for M in range(ROW_TILE): - v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} = (const float*) 
((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_i32x4_max(vi${M}x0123, vzero); - vi${M}x0123 = wasm_i32x4_min(vi${M}x0123, vzero); - - $for M in range(ROW_TILE): - $if RELAXED: - vacc${M}x0123 = wasm_f32x4_relaxed_madd(vi${M}x0123, vw0123, vacc${M}x0123); - $else: - vacc${M}x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi${M}x0123, vw0123), vacc${M}x0123); - - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store64_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - vacc${M}x0123 = wasm_v64x2_shuffle(vacc${M}x0123, vacc${M}x0123, 1, 1); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store32_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/wasmsimd-laneselect.c.in b/src/f32-prelu/wasmsimd-laneselect.c.in deleted file mode 100644 index c5b81fbaafd..00000000000 --- a/src/f32-prelu/wasmsimd-laneselect.c.in +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -$WASM_V32X4_LANESELECT = "wasm_i32x4_relaxed_laneselect" if RELAXED else "wasm_v128_bitselect" -$ISA = "wasmrelaxedsimd" if RELAXED else "wasmsimd" -void xnn_f32_prelu_ukernel__${ISA}_laneselect_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const v128_t vw${ABC[0:4]} = wasm_v128_load(w); - $for C in range(4, CHANNEL_TILE, 4): - const v128_t vw${ABC[C:C+4]} = wasm_v128_load(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const v128_t vi${M}x${ABC[0:4]} = wasm_v128_load(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - const v128_t vi${M}x${ABC[C:C+4]} = wasm_v128_load(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - v128_t vacc${M}x${ABC[C:C+4]} = wasm_f32x4_mul(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}); - const v128_t vmask${M}x${ABC[C:C+4]} = wasm_i32x4_shr(vi${M}x${ABC[C:C+4]}, 31); - - $for M in range(ROW_TILE): 
- $for C in range(0, CHANNEL_TILE, 4): - vacc${M}x${ABC[C:C+4]} = ${WASM_V32X4_LANESELECT}(vacc${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]}, vmask${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - wasm_v128_store(o${M} + ${C}, vacc${M}x${ABC[C:C+4]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - $for M in range(ROW_TILE): - const v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_f32x4_mul(vi${M}x0123, vw0123); - const v128_t vmask${M}x0123 = wasm_i32x4_shr(vi${M}x0123, 31); - - $for M in range(ROW_TILE): - vacc${M}x0123 = ${WASM_V32X4_LANESELECT}(vacc${M}x0123, vi${M}x0123, vmask${M}x0123); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x0123); - o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - $for M in range(ROW_TILE): - const v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_f32x4_mul(vi${M}x0123, vw0123); - const v128_t vmask${M}x0123 = wasm_i32x4_shr(vi${M}x0123, 31); - - $for M in range(ROW_TILE): - vacc${M}x0123 = ${WASM_V32X4_LANESELECT}(vacc${M}x0123, vi${M}x0123, vmask${M}x0123); - - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store64_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - vacc${M}x0123 = wasm_v64x2_shuffle(vacc${M}x0123, vacc${M}x0123, 1, 1); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store32_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-qs8-vcvt/avx.c.in b/src/f32-qs8-vcvt/avx.c.in index adf7dc83940..2cdd41eff6a 100644 --- a/src/f32-qs8-vcvt/avx.c.in +++ b/src/f32-qs8-vcvt/avx.c.in @@ -18,7 +18,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( size_t batch, const float* input, @@ -33,13 +33,11 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - 
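The f32-qs8-vcvt/avx.c.in changes in this area drop the separate output_min clamp and fold the output maximum into a compile-time constant (127 for QS8, 255 for QU8). A hedged scalar model of the conversion, assuming the default round-to-nearest mode and that the operator always passes output_min equal to the type minimum, so the saturating packs make an explicit low-side clamp redundant; f32_to_qs8_reference is an illustrative name, not the template's code.

#include <math.h>
#include <stdint.h>

// Illustrative scalar model of the QS8 path.
static inline int8_t f32_to_qs8_reference(float x, float scale, int32_t zero_point) {
  float v = x * scale;
  // Upper clamp in the float domain, as in the template: 127 is the QS8 maximum.
  const float max_less_zero_point = (float) (127 - zero_point);
  if (v > max_less_zero_point) v = max_less_zero_point;
  long y = lrintf(v) + zero_point;
  // The SIMD kernels get this low-side saturation for free from the saturating packs.
  if (y < INT8_MIN) y = INT8_MIN;
  return (int8_t) y;
}

For QU8 the same shape applies with 255 as the maximum and _mm_packus_epi16 saturating the low side at 0.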
XNN_FORCE_REALIZATION(voutput_min); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -69,12 +67,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( $else: vy${ABC[N:N+8]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min); - _mm_storeu_si128((__m128i*) output, vy${ABC[0:16]}); $for N in range(16, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: @@ -94,7 +86,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = ${_MM_PACKXS_EPI16}(vy, vy); - vy = ${_MM_MAX_EPX8}(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -113,7 +104,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = ${_MM_PACKXS_EPI16}(vy, vy); - vy = ${_MM_MAX_EPX8}(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/avx2.c.in b/src/f32-qs8-vcvt/avx2.c.in index f6c965d62c9..dc9c9e5b5ca 100644 --- a/src/f32-qs8-vcvt/avx2.c.in +++ b/src/f32-qs8-vcvt/avx2.c.in @@ -20,8 +20,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM256_PACKXS_EPI16 = {"QS8": "_mm256_packs_epi16", "QU8": "_mm256_packus_epi16"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM256_MAX_EPX8 = {"QS8": "_mm256_max_epi8", "QU8": "_mm256_max_epu8"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( size_t batch, const float* input, @@ -36,18 +35,14 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); $if BATCH_TILE > 16: XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); - $else: - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { __m256 vx${ABC[0:2]} = _mm256_loadu_ps(input); @@ -82,14 +77,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( $else: __m128i vy${ABC[N:N+4]} = 
_mm_shuffle_epi32(vy${ABC[N]}${ABC[N+2]}${ABC[N+1]}${ABC[N+3]}, _MM_SHUFFLE(3, 1, 2, 0)); - $for N in range(0, SIMD_TILE, 8): - $if N + 4 < SIMD_TILE: - vy${ABC[N:N+8]} = ${_MM256_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min); - $elif BATCH_TILE > 16: - vy${ABC[N:N+4]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+4]}, _mm256_castsi256_si128(voutput_min)); - $else: - vy${ABC[N:N+4]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+4]}, voutput_min); - $if SIMD_TILE > 4: _mm256_storeu_si256((__m256i*) output, vy${ABC[0:8]}); $else: @@ -112,10 +99,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if BATCH_TILE > 16: - vy = ${_MM_MAX_EPX8}(vy, _mm256_castsi256_si128(voutput_min)); - $else: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -134,10 +117,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if BATCH_TILE > 16: - vy = ${_MM_MAX_EPX8}(vy, _mm256_castsi256_si128(voutput_min)); - $else: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/avx512skx.c.in b/src/f32-qs8-vcvt/avx512skx.c.in index 50728697c93..23b78a6f922 100644 --- a/src/f32-qs8-vcvt/avx512skx.c.in +++ b/src/f32-qs8-vcvt/avx512skx.c.in @@ -21,9 +21,7 @@ $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM512_PACKXS_EPI16 = {"QS8": "_mm512_packs_epi16", "QU8": "_mm512_packus_epi16"}[DATATYPE] $_MM256_PACKXS_EPI16 = {"QS8": "_mm256_packs_epi16", "QU8": "_mm256_packus_epi16"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM512_MAX_EPX8 = {"QS8": "_mm512_max_epi8", "QU8": "_mm512_max_epu8"}[DATATYPE] -$_MM256_MAX_EPX8 = {"QS8": "_mm256_max_epi8", "QU8": "_mm256_max_epu8"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( size_t batch, const float* input, @@ -42,20 +40,15 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); $if SIMD_TILE > 8: const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); $if SIMD_TILE % 16 != 0: const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - $if SIMD_TILE > 8: - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); - $else: - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - 
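Both the AVX2 and AVX512SKX templates touched here pack 32-bit values down to bytes with per-128-bit-lane pack instructions and then restore element order with a cross-lane permute (the {0, 4, 1, 5, 2, 6, 3, 7} shuffle mask). A hedged sketch of that ordering fix-up for 32 elements; pack_i32_to_i8_ordered is an illustrative helper, not the template's code.

#include <immintrin.h>

// a, b, c, d hold 32 int32 values 0..31 in order; zero_point is an int16
// broadcast (e.g. _mm256_set1_epi16(zp)). Requires AVX2.
static inline __m256i pack_i32_to_i8_ordered(__m256i a, __m256i b, __m256i c, __m256i d,
                                             __m256i zero_point) {
  // 256-bit packs work per 128-bit lane: lo = {a.lo, b.lo | a.hi, b.hi} as int16.
  __m256i lo = _mm256_adds_epi16(_mm256_packs_epi32(a, b), zero_point);
  __m256i hi = _mm256_adds_epi16(_mm256_packs_epi32(c, d), zero_point);
  // Bytes come out dword-interleaved across lanes...
  __m256i packed = _mm256_packs_epi16(lo, hi);
  // ...so a cross-lane dword permute restores 0..31 order (same mask as the template).
  const __m256i order = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
  return _mm256_permutevar8x32_epi32(packed, order);
}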
XNN_FORCE_REALIZATION(voutput_min); for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); $for N in range(4, SIMD_TILE, 4): @@ -83,14 +76,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( $else: __m256i vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]} = ${_MM256_PACKXS_EPI16}(_mm512_castsi512_si256(vacc${ABC[N]}${ABC[N+4]}${ABC[N+1]}${ABC[N+5]}${ABC[N+2]}${ABC[N+6]}${ABC[N+3]}${ABC[N+7]}), _mm512_extracti32x8_epi32(vacc${ABC[N]}${ABC[N+4]}${ABC[N+1]}${ABC[N+5]}${ABC[N+2]}${ABC[N+6]}${ABC[N+3]}${ABC[N+7]}, 1)); - $for N in range(0, SIMD_TILE, 16): - $if N + 8 < SIMD_TILE: - vy${ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}${ABC[N+6]}${ABC[N+10]}${ABC[N+14]}${ABC[N+3]}${ABC[N+7]}${ABC[N+11]}${ABC[N+15]} = ${_MM512_MAX_EPX8}(vy${ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}${ABC[N+6]}${ABC[N+10]}${ABC[N+14]}${ABC[N+3]}${ABC[N+7]}${ABC[N+11]}${ABC[N+15]}, voutput_min); - $elif SIMD_TILE > 8: - vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]} = ${_MM256_MAX_EPX8}(vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]}, _mm512_castsi512_si256(voutput_min)); - $else: - vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]} = ${_MM256_MAX_EPX8}(vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]}, voutput_min); - $for N in range(0, SIMD_TILE, 16): $if N + 8 < SIMD_TILE: const __m512i vy${ABC[N:N+16]} = _mm512_permutexvar_epi32(vshuffle512_mask, vy${ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}${ABC[N+6]}${ABC[N+10]}${ABC[N+14]}${ABC[N+3]}${ABC[N+7]}${ABC[N+11]}${ABC[N+15]}); @@ -120,10 +105,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = ${_MM_PACKXS_EPI16}(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - $if SIMD_TILE > 8: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm512_castsi512_si128(voutput_min)); - $else: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -146,10 +127,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = ${_MM_PACKXS_EPI16}(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - $if SIMD_TILE > 8: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm512_castsi512_si128(voutput_min)); - $else: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/f32-qs8-vcvt.h b/src/f32-qs8-vcvt/f32-qs8-vcvt.h index cca362a9af2..a21fc308d84 100644 --- a/src/f32-qs8-vcvt/f32-qs8-vcvt.h +++ b/src/f32-qs8-vcvt/f32-qs8-vcvt.h @@ -51,11 +51,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u1 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u32, 32, false, float, int8_t, struct xnn_f32_qs8_cvt_params, 
xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u48, 48, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u64, 64, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u32, 32, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u64, 64, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u96, 96, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u128, 128, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8, 8, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c index 077b88cd582..7b199c3a48e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, 
vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c index 6ad7909cc74..d81aae2ef9e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -67,9 +65,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -85,7 +80,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -104,7 +98,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c index c249138d905..b82a02e540f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -73,9 +71,6 @@ void 
xnn_f32_qs8_vcvt_ukernel__avx_u32( __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -91,7 +86,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -110,7 +104,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c index 193acb70e13..2272d45a3be 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m256 vx = _mm256_loadu_ps(input); @@ -49,7 +47,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -68,7 +65,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c index 122cbfac25b..a4267455b14 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point 
= _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c index de671df9956..b8faf2cbc8e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c @@ -30,15 +30,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -72,8 +70,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); - vy01234567 = _mm256_max_epi8(vy01234567, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); output += 32; } @@ -88,7 +84,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -107,7 +102,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), 
_mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c index cb6aab374fa..351cac10264 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c @@ -30,15 +30,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 48 * sizeof(float); batch -= 48 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -84,9 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m128i vy89AB = _mm_shuffle_epi32(vy8A9B, _MM_SHUFFLE(3, 1, 2, 0)); - vy01234567 = _mm256_max_epi8(vy01234567, voutput_min); - vy89AB = _mm_max_epi8(vy89AB, _mm256_castsi256_si128(voutput_min)); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm_storeu_si128((__m128i*) (output + 32), vy89AB); output += 48; @@ -102,7 +97,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -121,7 +115,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c index a20495ab83b..c1cfc2a3d18 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c @@ -30,15 +30,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i 
voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -94,9 +92,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask); - vy01234567 = _mm256_max_epi8(vy01234567, voutput_min); - vy89ABCDEF = _mm256_max_epi8(vy89ABCDEF, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm256_storeu_si256((__m256i*) (output + 32), vy89ABCDEF); output += 64; @@ -112,7 +107,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -131,7 +125,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c index 850a3cd2489..6ee078c00dd 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c @@ -31,14 +31,12 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -90,9 +88,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( __m512i vy048C159D26AE37BF = _mm512_packs_epi16(vacc04152637, vacc8C9DAEBF); __m512i vyGKOSHLPTIMQUJNRV = _mm512_packs_epi16(vaccGKHLIMJN, vaccOSPTQURV); - vy048C159D26AE37BF = _mm512_max_epi8(vy048C159D26AE37BF, voutput_min); - vyGKOSHLPTIMQUJNRV = _mm512_max_epi8(vyGKOSHLPTIMQUJNRV, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m512i vyGHIJKLMNOPQRSTUV = 
_mm512_permutexvar_epi32(vshuffle512_mask, vyGKOSHLPTIMQUJNRV); @@ -112,7 +107,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -135,7 +129,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c index d732a7f539f..43a38fceec4 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c @@ -31,14 +31,12 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -59,8 +57,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( __m256i vy04261537 = _mm256_packs_epi16(_mm512_castsi512_si256(vacc04152637), _mm512_extracti32x8_epi32(vacc04152637, 1)); - vy04261537 = _mm256_max_epi8(vy04261537, voutput_min); - const __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy04261537, vshuffle256_mask); _mm256_storeu_si256((__m256i*) output, vy01234567); @@ -78,7 +74,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -101,7 +96,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm256_castsi256_si128(voutput_min)); 
_mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c index 2534b3c70f3..40a5bd1d02e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c @@ -31,14 +31,12 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -69,8 +67,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( __m512i vy048C159D26AE37BF = _mm512_packs_epi16(vacc04152637, vacc8C9DAEBF); - vy048C159D26AE37BF = _mm512_max_epi8(vy048C159D26AE37BF, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); _mm512_storeu_si512(output, vy0123456789ABCDEF); @@ -88,7 +84,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -111,7 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c index 2c34fc6864b..943e467a2c9 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c @@ -32,15 +32,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const 
__m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -82,9 +80,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( __m512i vy048C159D26AE37BF = _mm512_packs_epi16(vacc04152637, vacc8C9DAEBF); __m256i vyGKIMHLJN = _mm256_packs_epi16(_mm512_castsi512_si256(vaccGKHLIMJN), _mm512_extracti32x8_epi32(vaccGKHLIMJN, 1)); - vy048C159D26AE37BF = _mm512_max_epi8(vy048C159D26AE37BF, voutput_min); - vyGKIMHLJN = _mm256_max_epi8(vyGKIMHLJN, _mm512_castsi512_si256(voutput_min)); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m256i vyGHIJKLMN = _mm256_permutevar8x32_epi32(vyGKIMHLJN, vshuffle256_mask); @@ -104,7 +99,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -127,7 +121,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c index 277f5a0a102..686e1eaf431 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -53,9 +51,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( // narrowing 16-bit to 8-bit HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h1, vacc_h0); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - *((HVX_UVector *) output) = vy0; output += 128; } @@ -71,9 +66,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -90,9 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of 
elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c index 9d96a73e6d6..7aa784bb36f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 256 * sizeof(float); batch -= 256 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -68,11 +66,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h1, vacc_h0); HVX_Vector vy1 = Q6_Vb_vpack_VhVh_sat(vacc_h3, vacc_h2); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - vy1 = Q6_Vb_vmax_VbVb(voutput_min, vy1); - vy1 = Q6_Vb_vmin_VbVb(voutput_max, vy1); - *((HVX_UVector *) output) = vy0; output += 128; *((HVX_UVector *) output) = vy1; @@ -90,9 +83,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -109,9 +99,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c index 80c7c050e96..02a2540be9f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u32( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector vx = xnn_loadu_f32(input); @@ -41,9 +39,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u32( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -60,9 +55,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u32( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). 
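Editor's note: throughout these conversion kernels the upper clamp bound is now folded to the literal 127 instead of being read from params->scalar.output_max. The sketch below is plain C and not part of the patch (the scale and zero point are made-up values); it only illustrates the arithmetic this relies on: clamping the scaled value to (127 - output_zero_point) before rounding means the result can never exceed 127, the top of the qs8 range, once the zero point is added back.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const float scale = 0.5f;              /* hypothetical quantization scale */
  const int32_t output_zero_point = 5;   /* hypothetical zero point */
  const float output_max_less_zero_point = (float) (127 - output_zero_point);

  const float x = 1000.0f;               /* deliberately far out of range */
  float scaled = x * scale;
  if (scaled > output_max_less_zero_point) {
    scaled = output_max_less_zero_point; /* float-side clamp, as in the kernels */
  }
  const int32_t y = (int32_t) lrintf(scaled) + output_zero_point;
  printf("%d\n", y);                     /* prints 127, the qs8 maximum */
  return 0;
}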
diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c index 234b656aa4c..95e41c9055f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -46,9 +44,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( // narrowing 16-bit to 8-bit HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h0, vacc_h0); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - Q6_V_vstu_variable(output, 64, vy0); output += 64; } @@ -64,9 +59,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -83,9 +75,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c index 131e51562ba..3b336d79e1c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -50,9 +48,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( // narrowing 16-bit to 8-bit HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h1, vacc_h0); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - Q6_V_vstu_variable(output, 96, vy0); output += 96; } @@ -68,9 +63,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -87,9 +79,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). 
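Editor's note: the voutput_min / voutput_max clamps removed above all sit after a saturating 16-to-8-bit narrowing (_mm_packs_epi16 / _mm512_packs_epi16 on x86, Q6_Vb_vpack_VhVh_sat on HVX, vqmovn_s16 in the NEON kernels that follow). Those instructions already confine every lane to [-128, 127], so re-clamping against that same range is a no-op. The standalone C sketch below (not part of the patch) demonstrates the saturation with the SSE2 intrinsic, which is the easiest of these to run on a typical host.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  /* Values deliberately outside the int8 range. */
  const __m128i wide = _mm_setr_epi16(300, -300, 127, -128, 0, 42, 200, -200);
  const __m128i packed = _mm_packs_epi16(wide, wide);  /* saturating 16->8 pack */
  signed char out[16];
  _mm_storeu_si128((__m128i*) out, packed);
  for (int i = 0; i < 8; i++) {
    printf("%d ", out[i]);  /* prints: 127 -128 127 -128 0 42 127 -128 */
  }
  printf("\n");
  return 0;
}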
diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c index aa77d56645c..ddd09f979de 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -59,10 +57,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -81,8 +75,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -104,8 +96,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c index 0014b184788..332a6a22358 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -69,12 +67,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x8_t vyGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_s8(vyGHIJKLMN, vget_low_s8(voutput_min)); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_s8(vyGHIJKLMN, vget_low_s8(voutput_max)); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1_s8(output, vyGHIJKLMN); output += 8; } @@ -94,8 +86,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy,
vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -117,8 +107,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c index c269620c21c..4e8e50c6f2c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -78,12 +76,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t vyGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_s8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_s8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1q_s8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -103,8 +95,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -126,8 +116,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c index ebca2f0cd62..bac5161d0d2 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u8( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(&params->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; @@ -49,8 +47,6 @@ void 
xnn_f32_qs8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -72,8 +68,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c index 7288a72069d..acb1398d7aa 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -55,10 +53,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -75,8 +69,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -96,8 +88,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c index d10a83ea2a4..2cd5a932404 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -64,12 +62,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x8_t vyGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_s8(vyGHIJKLMN, vget_low_s8(voutput_min)); - -
vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_s8(vyGHIJKLMN, vget_low_s8(voutput_max)); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1_s8(output, vyGHIJKLMN); output += 8; } @@ -87,8 +79,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -108,8 +98,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c index c0ae721e9de..05229f96051 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -72,12 +70,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t vyGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_s8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_s8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1q_s8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -95,8 +87,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -116,8 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c index b695ce35e83..04aec02db27 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u8( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(&params->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(&params->scalar.output_max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t 
vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; @@ -45,8 +43,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -66,8 +62,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c index 3b34cf53db2..d321f60096c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u1v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c index 007cfc2674c..7c705028874 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u2v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c index 1dc5f95390d..f7b097b8dab 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u4v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... 
+ const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c index 825ab646212..a697c9a51f9 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u8v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c index c8b2d40a1bc..7977f9d3c48 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c index 43d349249a5..8ffe971eef2 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c 
b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c index 2464f6f929a..174cd884515 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c index 5f79a10ac3f..e3963ebe017 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c index e9d3f8c00ef..6a6f8ccfcd1 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u1( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c index ec02afcb18f..3842ef98972 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c +++ 
b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u2( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c index d71b65c92f4..cc2043221c5 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u3( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c index 55a3b724597..ba3508948b6 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u4( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git 
a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c index ff91a4a5a76..772a1b9ac21 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u1( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; do { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c index 117536d2fe0..656e12af70c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u2( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c index f479dbcbd8e..c6867a7366e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u3( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 3 * sizeof(float); batch -= 3 * sizeof(float)) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c index 60d73a29ed1..67e2adef52d 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u4( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) 
params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c index 18715c32046..9498d5e23c9 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -64,12 +62,8 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point); vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); - vy01234567 = _mm_max_epi16(vy01234567, voutput_min); - vy89ABCDEF = _mm_max_epi16(vy89ABCDEF, voutput_min); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -89,7 +83,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -111,7 +104,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c index c2dd1fab9bf..64cfa2b64b5 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); 
XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -74,14 +72,9 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); - vy01234567 = _mm_max_epi16(vy01234567, voutput_min); - vy89ABCDEF = _mm_max_epi16(vy89ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi16(vyGHIJKLMN, voutput_min); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -102,7 +95,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -124,7 +116,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c index 9404c0fde05..eed7b436f63 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -84,15 +82,9 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point); - vy01234567 = _mm_max_epi16(vy01234567, voutput_min); - vy89ABCDEF = _mm_max_epi16(vy89ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi16(vyGHIJKLMN, voutput_min); - vyOPQRSTUV = _mm_max_epi16(vyOPQRSTUV, voutput_min); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -113,7 +105,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -135,7 +126,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if 
(batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c index a13f7e24a6e..b11c2fed70f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u8( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m128 vx_lo = _mm_loadu_ps(input); @@ -52,7 +50,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -74,7 +71,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c index 48c217c81fa..c155b181f09 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -64,11 +62,8 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point); vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -89,7 +84,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -111,7 +105,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = 
_mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c index 54944780d4c..6f07a0f425b 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -74,13 +72,9 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -102,7 +96,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -124,7 +117,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c index 5cede1dbd14..3d050f75c58 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u32( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -84,13 +82,9 @@ void 
xnn_f32_qs8_vcvt_ukernel__sse41_u32( vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -112,7 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -134,7 +127,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c index c029b2edebf..fca10deddab 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u8( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m128 vx_lo = _mm_loadu_ps(input); @@ -53,7 +51,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -75,7 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c index 45da09e9419..79a2d37748b 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point 
= (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c index 85c306cbf9f..e33221c736f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c index 6611776a42d..9f3b8b5c77c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c index 41428e83021..cc9f407cd5a 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c 
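The fmagic/imagic scalar kernels rely on the 12582912.0f (bit pattern 0x4B400000) magic-bias trick rather than an explicit float-to-int conversion: for inputs already clamped to a small range, adding the bias leaves the rounded integer in the low mantissa bits, so a bit reinterpretation plus one constant subtraction that also folds in the zero point yields the quantized value. A self-contained sketch, assuming float_as_uint32 is a plain bit copy:

#include <stdint.h>
#include <string.h>

// Round-to-nearest and zero-point addition via the magic bias: valid while
// (bias + v) stays in [2^23, 2^24), where one mantissa ULP equals 1.
static inline int32_t magic_round_add_zero_point(float v, int32_t zero_point) {
  const float vmagic_bias = 12582912.0f;  // bit pattern 0x4B400000
  const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  const float vbiased = vmagic_bias + v;
  uint32_t bits;
  memcpy(&bits, &vbiased, sizeof(bits));  // float_as_uint32
  return (int32_t) bits - vmagic_bias_less_zero_point;
}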
b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c index 5f52592f181..bb183e7b855 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -65,10 +61,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( v128_t vy0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); - vy0123456789ABCDEF = wasm_i8x16_max(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = wasm_i8x16_min(vy0123456789ABCDEF, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); output += 16; } @@ -90,8 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -116,8 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c index 8529640968a..f7dfe1ae059 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -76,12 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( v128_t vy0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMN = wasm_i8x16_narrow_i16x8(vaccGHIJKLMN, vaccGHIJKLMN); - vy0123456789ABCDEF = wasm_i8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = wasm_i8x16_max(vyGHIJKLMN, voutput_min); - - vy0123456789ABCDEF = wasm_i8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = wasm_i8x16_min(vyGHIJKLMN, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store64_lane(output + 16, vyGHIJKLMN, 0); output += 24; @@ -104,8 +94,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = 
wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -130,8 +118,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c index ec9fae4c169..fc1d0a8ec23 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -86,12 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( v128_t vy0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMNOPQRSTUV = wasm_i8x16_narrow_i16x8(vaccGHIJKLMN, vaccOPQRSTUV); - vy0123456789ABCDEF = wasm_i8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = wasm_i8x16_max(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = wasm_i8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = wasm_i8x16_min(vyGHIJKLMNOPQRSTUV, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store(output + 16, vyGHIJKLMNOPQRSTUV); output += 32; @@ -114,8 +104,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -140,8 +128,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c index 2bff6a3f06b..9323c1b82e7 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 8 * sizeof(float); batch -= 8 * 
sizeof(float)) { v128_t vx_lo = wasm_v128_load(input); v128_t vx_hi = wasm_v128_load(input + 4); @@ -53,8 +49,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -79,8 +73,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c index 3f62f774212..e73be322f5d 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c @@ -28,12 +28,12 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u16( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c index 87078726928..4dff7b9324a 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c @@ -28,12 +28,12 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u24( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c index 5fbbeb7a1a7..d392c0b8997 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c @@ -28,12 +28,12 @@ void 
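In the WAsm SIMD cvt kernels the removed i8x16 max/min pair was guarding a range the narrowing instruction already enforces: wasm_i8x16_narrow_i16x8 saturates every lane to [-128, 127], and wasm_u8x16_narrow_i16x8 to [0, 255] in the QU8 twins, so with the bounds fixed at the full type range the clamps could not change any lane. A tiny sketch of the behavior being relied on, assuming <wasm_simd128.h>:

#include <wasm_simd128.h>

// The saturating narrow is the only clamp still needed once
// output_min/output_max cover the whole int8 range.
static inline v128_t narrow_to_qs8(v128_t vacc_lo, v128_t vacc_hi) {
  return wasm_i8x16_narrow_i16x8(vacc_lo, vacc_hi);
}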
xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u32( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c index d822d804fa8..4b67d131af7 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c @@ -28,12 +28,12 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u8( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/hvx.c.in b/src/f32-qs8-vcvt/hvx.c.in index 3971581110a..b66363ad7d8 100644 --- a/src/f32-qs8-vcvt/hvx.c.in +++ b/src/f32-qs8-vcvt/hvx.c.in @@ -25,8 +25,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); $if BATCH_TILE > 32: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -55,10 +53,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( $else: HVX_Vector vy${int(N/128)} = Q6_Vb_vpack_VhVh_sat(vacc_h${int(N/64)}, vacc_h${int(N/64)}); - $for N in range(0, BATCH_TILE, 128): - vy${int(N/128)} = Q6_Vb_vmax_VbVb(voutput_min, vy${int(N/128)}); - vy${int(N/128)} = Q6_Vb_vmin_VbVb(voutput_max, vy${int(N/128)}); - $for N in range(0, BATCH_TILE, 128): $if N + 128 <= BATCH_TILE: *((HVX_UVector *) output) = vy${int(N/128)}; @@ -79,9 +73,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vy = 
Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -98,9 +89,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/neon.c.in b/src/f32-qs8-vcvt/neon.c.in index 4ceffd80b50..fb391ad4300 100644 --- a/src/f32-qs8-vcvt/neon.c.in +++ b/src/f32-qs8-vcvt/neon.c.in @@ -29,10 +29,6 @@ $VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE] $VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE] $VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE] $VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE] -$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE] -$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE] -$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE] -$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( size_t batch, const float* input, @@ -47,12 +43,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - $if BATCH_TILE > 8: - const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(¶ms->scalar.output_max); - $else: - const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -77,18 +67,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( $else: ${XINT8X8_T} vy${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMAXQ_X8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${VMAX_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min)); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMINQ_X8}(vy${ABC[N:N+16]}, voutput_max); - $else: - vy${ABC[N:N+8]} = ${VMIN_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max)); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: ${VST1Q_X8}(output, vy${ABC[N:N+16]}); output += 16; @@ -111,12 +89,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); ${VST1_X8}(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -138,12 +110,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( const int16x8_t vacc = 
vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/neonv8.c.in b/src/f32-qs8-vcvt/neonv8.c.in index dc21df69461..998aa16bdaf 100644 --- a/src/f32-qs8-vcvt/neonv8.c.in +++ b/src/f32-qs8-vcvt/neonv8.c.in @@ -29,10 +29,6 @@ $VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE] $VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE] $VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE] $VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE] -$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE] -$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE] -$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE] -$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( size_t batch, const float* input, @@ -46,12 +42,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - $if BATCH_TILE > 8: - const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(¶ms->scalar.output_max); - $else: - const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(¶ms->scalar.output_max); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { $for N in range(0, BATCH_TILE, 4): @@ -75,18 +65,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( $else: ${XINT8X8_T} vy${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMAXQ_X8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${VMAX_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min)); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMINQ_X8}(vy${ABC[N:N+16]}, voutput_max); - $else: - vy${ABC[N:N+8]} = ${VMIN_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max)); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: ${VST1Q_X8}(output, vy${ABC[N:N+16]}); output += 16; @@ -107,12 +85,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); ${VST1_X8}(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -132,12 +104,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); if (batch 
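The NEON and NEONv8 templates lose their VMAX/VMIN macro tables for the same reason: the saturating narrows they already use (vqmovn_s16 for QS8, vqmovun_s16 for QU8) clamp to the full 8-bit range, so the per-tile max/min against output_min/output_max became dead code. A short sketch, assuming <arm_neon.h>:

#include <arm_neon.h>

// Saturating narrows already bound the result to [-128, 127] / [0, 255].
static inline int8x8_t narrow_to_qs8(int16x8_t vacc) {
  return vqmovn_s16(vacc);
}
static inline uint8x8_t narrow_to_qu8(int16x8_t vacc) {
  return vqmovun_s16(vacc);
}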
& (4 * sizeof(float))) { vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/rvv.c.in b/src/f32-qs8-vcvt/rvv.c.in index a725a9d4f0e..c389a826ee2 100755 --- a/src/f32-qs8-vcvt/rvv.c.in +++ b/src/f32-qs8-vcvt/rvv.c.in @@ -15,6 +15,8 @@ $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] #include "xnnpack/vcvt.h" +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__rvv_u${LMUL}v( size_t batch, const float* input, @@ -29,8 +31,9 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__rvv_u${LMUL}v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/scalar-fmagic.c.in b/src/f32-qs8-vcvt/scalar-fmagic.c.in index 94218c1745a..72a150faea9 100644 --- a/src/f32-qs8-vcvt/scalar-fmagic.c.in +++ b/src/f32-qs8-vcvt/scalar-fmagic.c.in @@ -17,6 +17,8 @@ $INPUT_T = {"F16": "xnn_float16", "F32": "float"}[IDATATYPE] $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[ODATATYPE] $MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" $MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[ODATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[ODATATYPE] void xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_fmagic_u${BATCH_TILE}( size_t batch, const ${INPUT_T}* input, @@ -34,8 +36,8 @@ void xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WAS $else: const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/scalar-imagic.c.in b/src/f32-qs8-vcvt/scalar-imagic.c.in index 2d83fb24ed0..a14d728bf58 100644 --- a/src/f32-qs8-vcvt/scalar-imagic.c.in +++ b/src/f32-qs8-vcvt/scalar-imagic.c.in @@ -15,6 +15,8 @@ $assert IDATATYPE == "F16" and ODATATYPE == "QS8" or IDATATYPE == "F32" $INPUT_T = {"F16": "xnn_float16", "F32": "float"}[IDATATYPE] $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[ODATATYPE] +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[ODATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[ODATATYPE] void 
xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_imagic_u${BATCH_TILE}( size_t batch, const ${INPUT_T}* input, @@ -33,8 +35,8 @@ void xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WAS const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/scalar-lrintf.c.in b/src/f32-qs8-vcvt/scalar-lrintf.c.in index bbe2352e5ad..808ae9f2117 100644 --- a/src/f32-qs8-vcvt/scalar-lrintf.c.in +++ b/src/f32-qs8-vcvt/scalar-lrintf.c.in @@ -15,6 +15,8 @@ $assert BATCH_TILE >= 1 $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" $MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_lrintf_u${BATCH_TILE}( size_t batch, const float* input, @@ -27,8 +29,8 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_l assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; $if BATCH_TILE == 1: diff --git a/src/f32-qs8-vcvt/sse.c.in b/src/f32-qs8-vcvt/sse.c.in index 0cf69499c38..4442ccc3a76 100644 --- a/src/f32-qs8-vcvt/sse.c.in +++ b/src/f32-qs8-vcvt/sse.c.in @@ -21,7 +21,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $ISA = {2: "sse2", 4: "sse41"}[SSE] $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const float* input, @@ -34,16 +34,11 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) 
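The RVV and scalar templates switch from reading params->scalar.output_min/output_max to baking the numeric range of the output type into each instantiation through the new ${OUTPUT_MIN}/${OUTPUT_MAX} substitutions. A sketch of the values this produces per datatype (hypothetical helper names, shown only to make the substitution concrete):

#include <stdint.h>

// QS8 instantiation: OUTPUT_MIN = -128, OUTPUT_MAX = 127.
static inline float qs8_output_max_less_zero_point(int32_t zero_point) {
  return (float) (127 - zero_point);
}
// QU8 instantiation: OUTPUT_MIN = 0, OUTPUT_MAX = 255.
static inline float qu8_output_max_less_zero_point(int32_t zero_point) {
  return (float) (255 - zero_point);
}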
params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); - $if DATATYPE == "QU8" or SSE == 4: - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -67,23 +62,12 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( $for N in range(0, BATCH_TILE, 8): vy${ABC[N:N+8]} = _mm_adds_epi16(vy${ABC[N:N+8]}, voutput_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = _mm_max_epi16(vy${ABC[N:N+8]}, voutput_min); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: __m128i vy${ABC[N:N+16]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N+8:N+16]}); $else: vy${ABC[N:N+8]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N:N+8]}); - $if DATATYPE == "QU8" or SSE == 4: - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min); - _mm_storeu_si128((__m128i*) output, vy${ABC[0:16]}); $for N in range(16, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: @@ -108,11 +92,7 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - vy = _mm_max_epi16(vy, voutput_min); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if DATATYPE == "QU8" or SSE == 4: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -133,11 +113,7 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - vy = _mm_max_epi16(vy, voutput_min); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if DATATYPE == "QU8" or SSE == 4: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/wasmsimd-cvt.c.in b/src/f32-qs8-vcvt/wasmsimd-cvt.c.in index bb18a7ee250..318983d5f3f 100644 --- a/src/f32-qs8-vcvt/wasmsimd-cvt.c.in +++ b/src/f32-qs8-vcvt/wasmsimd-cvt.c.in @@ -17,8 +17,6 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE] -$WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE] -$WASM_X8X16_MAX = {"QS8": "wasm_i8x16_max", "QU8": "wasm_u8x16_max"}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( size_t batch, const float* input, @@ -32,12 +30,8 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = 
wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { v128_t vx${ABC[0:4]} = wasm_v128_load(input); @@ -66,18 +60,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( $else: v128_t vy${ABC[N:N+8]} = ${WASM_X8X16_NARROW_I16X8}(vacc${ABC[N:N+8]}, vacc${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${WASM_X8X16_MAX}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${WASM_X8X16_MAX}(vy${ABC[N:N+8]}, voutput_min); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${WASM_X8X16_MIN}(vy${ABC[N:N+16]}, voutput_max); - $else: - vy${ABC[N:N+8]} = ${WASM_X8X16_MIN}(vy${ABC[N:N+8]}, voutput_max); - wasm_v128_store(output, vy${ABC[0:16]}); $for N in range(16, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: @@ -104,8 +86,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = ${WASM_X8X16_NARROW_I16X8}(vacc, vacc); - vy = ${WASM_X8X16_MAX}(vy, voutput_min); - vy = ${WASM_X8X16_MIN}(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -130,8 +110,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = ${WASM_X8X16_NARROW_I16X8}(vacc, vacc); - vy = ${WASM_X8X16_MAX}(vy, voutput_min); - vy = ${WASM_X8X16_MIN}(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/wasmsimd-magic.c.in b/src/f32-qs8-vcvt/wasmsimd-magic.c.in index 5f0bee52414..160157276c2 100644 --- a/src/f32-qs8-vcvt/wasmsimd-magic.c.in +++ b/src/f32-qs8-vcvt/wasmsimd-magic.c.in @@ -19,6 +19,8 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE] $WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE] +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_magic_u${BATCH_TILE}( size_t batch, const float* input, @@ -30,12 +32,12 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_magic_u${BATCH_TILE}( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(${OUTPUT_MAX}); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); 
XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/f32-qu8-vcvt.h b/src/f32-qu8-vcvt/f32-qu8-vcvt.h index 9f664a284d4..7ce3eaec17b 100644 --- a/src/f32-qu8-vcvt/f32-qu8-vcvt.h +++ b/src/f32-qu8-vcvt/f32-qu8-vcvt.h @@ -47,11 +47,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u1 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u32, 32, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u48, 48, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u64, 64, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u32, 32, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u64, 64, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u96, 96, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u128, 128, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8, 8, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c index 0231f8f7614..1b79bfbe5c4 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( __m128i 
vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c index 8f90d0f07ac..700c23d66de 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -67,9 +65,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packus_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epu8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -85,7 +80,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -104,7 +98,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c index 560d212fa8f..c255c8f1d9b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) 
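For the QU8 AVX and AVX2 kernels the argument is the unsigned analogue: _mm_packus_epi16 (and _mm256_packus_epi16) saturate each int16 lane to [0, 255], so with output_min pinned at 0 the removed _mm_max_epu8 against voutput_min was a no-op. Sketch, assuming <immintrin.h>:

#include <immintrin.h>

// The unsigned saturating pack already enforces the [0, 255] range.
static inline __m128i narrow_to_qu8(__m128i vy_lo, __m128i vy_hi) {
  return _mm_packus_epi16(vy_lo, vy_hi);
}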
params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -73,9 +71,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -91,7 +86,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -110,7 +104,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c index c61a5b69643..7e2c779b842 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m256 vx = _mm256_loadu_ps(input); @@ -49,7 +47,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -68,7 +65,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c index e1be2cd4ad6..308d0f224aa 100644 --- 
a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c index ac968a255c2..5e37dc74d4b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c @@ -30,15 +30,13 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -72,8 +70,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); - vy01234567 = _mm256_max_epu8(vy01234567, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); output += 32; } 
@@ -88,7 +84,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -107,7 +102,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c index d7fda136262..fd5add47ae7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c @@ -30,15 +30,13 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 48 * sizeof(float); batch -= 48 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -84,9 +82,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m128i vy89AB = _mm_shuffle_epi32(vy8A9B, _MM_SHUFFLE(3, 1, 2, 0)); - vy01234567 = _mm256_max_epu8(vy01234567, voutput_min); - vy89AB = _mm_max_epu8(vy89AB, _mm256_castsi256_si128(voutput_min)); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm_storeu_si128((__m128i*) (output + 32), vy89AB); output += 48; @@ -102,7 +97,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -121,7 +115,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c index ff915dc9f72..07f2dcd7642 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c @@ -30,15 +30,13 @@ 
void xnn_f32_qu8_vcvt_ukernel__avx2_u64( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -94,9 +92,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u64( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask); - vy01234567 = _mm256_max_epu8(vy01234567, voutput_min); - vy89ABCDEF = _mm256_max_epu8(vy89ABCDEF, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm256_storeu_si256((__m256i*) (output + 32), vy89ABCDEF); output += 64; @@ -112,7 +107,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -131,7 +125,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c index 52503bc5c83..57e686a05e7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c @@ -31,14 +31,12 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -90,9 +88,6 @@ void 
xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( __m512i vy048C159D26AE37BF = _mm512_packus_epi16(vacc04152637, vacc8C9DAEBF); __m512i vyGKOSHLPTIMQUJNRV = _mm512_packus_epi16(vaccGKHLIMJN, vaccOSPTQURV); - vy048C159D26AE37BF = _mm512_max_epu8(vy048C159D26AE37BF, voutput_min); - vyGKOSHLPTIMQUJNRV = _mm512_max_epu8(vyGKOSHLPTIMQUJNRV, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m512i vyGHIJKLMNOPQRSTUV = _mm512_permutexvar_epi32(vshuffle512_mask, vyGKOSHLPTIMQUJNRV); @@ -112,7 +107,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -135,7 +129,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c index 774916be772..ad642cedd2c 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c @@ -31,14 +31,12 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -59,8 +57,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( __m256i vy04261537 = _mm256_packus_epi16(_mm512_castsi512_si256(vacc04152637), _mm512_extracti32x8_epi32(vacc04152637, 1)); - vy04261537 = _mm256_max_epu8(vy04261537, voutput_min); - const __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy04261537, vshuffle256_mask); _mm256_storeu_si256((__m256i*) output, vy01234567); @@ -78,7 +74,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, 
_mm256_castsi256_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -101,7 +96,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c index 90ebb74ed21..ca20f0ca9d2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c @@ -31,14 +31,12 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -69,8 +67,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( __m512i vy048C159D26AE37BF = _mm512_packus_epi16(vacc04152637, vacc8C9DAEBF); - vy048C159D26AE37BF = _mm512_max_epu8(vy048C159D26AE37BF, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); _mm512_storeu_si512(output, vy0123456789ABCDEF); @@ -88,7 +84,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -111,7 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c index 85e158bdc6b..abcf436c122 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c @@ -32,15 +32,13 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 
voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -82,9 +80,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( __m512i vy048C159D26AE37BF = _mm512_packus_epi16(vacc04152637, vacc8C9DAEBF); __m256i vyGKIMHLJN = _mm256_packus_epi16(_mm512_castsi512_si256(vaccGKHLIMJN), _mm512_extracti32x8_epi32(vaccGKHLIMJN, 1)); - vy048C159D26AE37BF = _mm512_max_epu8(vy048C159D26AE37BF, voutput_min); - vyGKIMHLJN = _mm256_max_epu8(vyGKIMHLJN, _mm512_castsi512_si256(voutput_min)); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m256i vyGHIJKLMN = _mm256_permutevar8x32_epi32(vyGKIMHLJN, vshuffle256_mask); @@ -104,7 +99,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -127,7 +121,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c index 800f3c4011c..d6379ac534b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -59,10 +57,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, 
voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -81,8 +75,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -104,8 +96,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c index b24291995a3..9c3c7a37b5b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -69,12 +67,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x8_t vyGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_u8(vyGHIJKLMN, vget_low_u8(voutput_min)); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_u8(vyGHIJKLMN, vget_low_u8(voutput_max)); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1_u8(output, vyGHIJKLMN); output += 8; } @@ -94,8 +86,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -117,8 +107,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c index 7b99af2082f..b7e8ebcae3a 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x16_t voutput_min = 
vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -78,12 +76,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t vyGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_u8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_u8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1q_u8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -103,8 +95,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -126,8 +116,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c index 98a17a6f2ad..031c7ec5792 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u8( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(¶ms->scalar.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; @@ -49,8 +47,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -72,8 +68,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c index f4a83df4736..39c606fdd6d 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x16_t 
voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -55,10 +53,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -75,8 +69,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -96,8 +88,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c index 967b8792a69..4928a0e41ac 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -64,12 +62,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x8_t vyGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_u8(vyGHIJKLMN, vget_low_u8(voutput_min)); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_u8(vyGHIJKLMN, vget_low_u8(voutput_max)); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1_u8(output, vyGHIJKLMN); output += 8; } @@ -87,8 +79,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -108,8 +98,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c index 2908f275a66..175c99cc179 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c +++ 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -72,12 +70,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t vyGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_u8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_u8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1q_u8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -95,8 +87,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -116,8 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c index 5f871cfe28a..aa75c316192 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u8( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(¶ms->scalar.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(¶ms->scalar.output_max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; @@ -45,8 +43,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -66,8 +62,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c index 12c86a5427b..b3ffa68167c 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u1v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - 
const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c index 57d41b0e906..3223238d890 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u2v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c index 4d654460c75..801d4ea26ef 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u4v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c index a344b38cffe..1c897023906 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u8v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... 
+ const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c index 6bf96a303b4..2bf4b8c50f1 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c index d0b4631e28e..6ed343f8d9d 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c index 61d888901c8..15c59c505f5 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c index d140844158b..f6bd1e419b1 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c index bd2387ce372..46ad989c1ce 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u1( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c index df926a44093..61a7edfbac5 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u2( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c index a406101ba2f..14c2e17cdab 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u3( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c index c6e30581518..0b2e85261a2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u4( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c index 74343847a58..5e2e027a2b2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u1( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; do { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c index 
8f06f8c3c99..f95a3eaf2d0 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u2( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c index fd2fe136f5f..ce552f0fab4 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u3( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 3 * sizeof(float); batch -= 3 * sizeof(float)) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c index e16efd930d5..0bd3ba88488 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u4( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c index d8af609659e..4e8538adb6c 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 
voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -64,11 +62,8 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point); vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -89,7 +84,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -111,7 +105,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c index 93981b75964..dcce8e75919 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -74,13 +72,9 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packus_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epu8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -102,7 +96,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -124,7 +117,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, 
vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c index fc028418ac7..d40037131e7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -84,13 +82,9 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -112,7 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -134,7 +127,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c index fcb6e1bed70..6e8c2014b91 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u8( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { 
__m128 vx_lo = _mm_loadu_ps(input); @@ -53,7 +51,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -75,7 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c index c3ae46820a9..42f9be10b53 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c index c4c1570ff6e..65c0619c1db 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c index 3c3951aa44c..11aa8edf2cb 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) 
((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c index b07449669db..08a1bd0c048 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c index 6c9b9d6b5dd..6028ac155f2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -65,10 +61,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( v128_t vy0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); - vy0123456789ABCDEF = wasm_u8x16_max(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = wasm_u8x16_min(vy0123456789ABCDEF, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); output += 16; } @@ -90,8 +82,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -116,8 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c index 5574ea37dba..eddb82fa3e9 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( const v128_t 
vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -76,12 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( v128_t vy0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMN = wasm_u8x16_narrow_i16x8(vaccGHIJKLMN, vaccGHIJKLMN); - vy0123456789ABCDEF = wasm_u8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = wasm_u8x16_max(vyGHIJKLMN, voutput_min); - - vy0123456789ABCDEF = wasm_u8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = wasm_u8x16_min(vyGHIJKLMN, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store64_lane(output + 16, vyGHIJKLMN, 0); output += 24; @@ -104,8 +94,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -130,8 +118,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c index 3aa4990c9ee..839b96b9999 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -86,12 +82,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( v128_t vy0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(vaccGHIJKLMN, vaccOPQRSTUV); - vy0123456789ABCDEF = wasm_u8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = wasm_u8x16_max(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = wasm_u8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = wasm_u8x16_min(vyGHIJKLMNOPQRSTUV, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store(output + 16, vyGHIJKLMNOPQRSTUV); output += 32; @@ -114,8 +104,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = 
wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -140,8 +128,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c index 987cc380756..266c803473b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { v128_t vx_lo = wasm_v128_load(input); v128_t vx_hi = wasm_v128_load(input + 4); @@ -53,8 +49,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -79,8 +73,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c index 25090ca0739..7447d350ba3 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u16( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c index fcb1df590e2..f186cfb1448 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c +++ 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u24( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c index 1247cc6fece..c2e51f0ae64 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u32( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c index 1dfb39b6406..3f8d1a469e7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u8( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-vabs/f32-vabs.h 
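/*
 * Illustrative scalar reference (not one of the generated kernels) for the
 * f32->qu8 conversion after this change: the clamp bounds are the fixed QU8
 * range [0, 255] instead of the runtime output_min/output_max params, and
 * rounding uses the same 12582912.0f "magic bias" trick as the fmagic/magic
 * variants above.
 */
#include <stdint.h>
#include <string.h>

static inline uint8_t f32_to_qu8_ref(float x, float scale, int32_t zero_point) {
  const float lo = (float) (0 - zero_point);    /* output_min == 0 */
  const float hi = (float) (255 - zero_point);  /* output_max == 255 */
  float y = x * scale;
  y = y < lo ? lo : y;
  y = y > hi ? hi : y;
  /* Adding 12582912.0f (0x1.8p+23) leaves round-to-nearest-even(y) in the low
   * mantissa bits; reinterpreting as int32 and subtracting the bias with the
   * zero point folded in yields lrintf(y) + zero_point in one step. */
  y += 12582912.0f;
  int32_t bits;
  memcpy(&bits, &y, sizeof(bits));
  return (uint8_t) (bits - (INT32_C(0x4B400000) - zero_point));
}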
b/src/f32-vabs/f32-vabs.h index a73a35540c2..4bc0b787b98 100644 --- a/src/f32-vabs/f32-vabs.h +++ b/src/f32-vabs/f32-vabs.h @@ -36,10 +36,13 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u12, 12, false, float, str XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vadd.h b/src/f32-vbinary/f32-vadd.h index c0ab4527528..edf8ba1080e 100644 --- a/src/f32-vbinary/f32-vadd.h +++ b/src/f32-vbinary/f32-vadd.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vaddc.h b/src/f32-vbinary/f32-vaddc.h index ca2080ac180..bc6a89ef61b 100644 --- a/src/f32-vbinary/f32-vaddc.h +++ b/src/f32-vbinary/f32-vaddc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_ukernel__avx_u16, 16, false, float, 
struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vaddc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vaddc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vcmul.h b/src/f32-vbinary/f32-vcmul.h index 953bc8752db..49e98be55bf 100644 --- a/src/f32-vbinary/f32-vcmul.h +++ b/src/f32-vbinary/f32-vcmul.h @@ -43,13 +43,15 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u8, 8, fa XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) - -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vcmul_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vcmul_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vcopysign.h b/src/f32-vbinary/f32-vcopysign.h index a7710ca6d74..27f9b1358a1 100644 --- a/src/f32-vbinary/f32-vcopysign.h +++ b/src/f32-vbinary/f32-vcopysign.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u8, 8, 
XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vcopysignc.h b/src/f32-vbinary/f32-vcopysignc.h index 7d989e7674e..c7d189efeb8 100644 --- a/src/f32-vbinary/f32-vcopysignc.h +++ b/src/f32-vbinary/f32-vcopysignc.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u8, 8, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vdiv.h b/src/f32-vbinary/f32-vdiv.h index 23937909a3f..4667ce66563 100644 --- a/src/f32-vbinary/f32-vdiv.h +++ b/src/f32-vbinary/f32-vdiv.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_ukernel__avx512f_u16, 16, false, float, struct 
xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vdivc.h b/src/f32-vbinary/f32-vdivc.h index e776858bab1..9a8d40f378a 100644 --- a/src/f32-vbinary/f32-vdivc.h +++ b/src/f32-vbinary/f32-vdivc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdivc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdivc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmax.h b/src/f32-vbinary/f32-vmax.h index 27538e4e25f..e819669fafd 100644 --- a/src/f32-vbinary/f32-vmax.h +++ b/src/f32-vbinary/f32-vmax.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmax_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmax_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmax_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmax_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmax_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmax_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmaxc.h b/src/f32-vbinary/f32-vmaxc.h index afd0074bd3e..1ef1357039e 100644 --- a/src/f32-vbinary/f32-vmaxc.h +++ b/src/f32-vbinary/f32-vmaxc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmaxc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmaxc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmaxc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmaxc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if 
XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmaxc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmaxc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmin.h b/src/f32-vbinary/f32-vmin.h index b4ccbe1e358..b7435fc1921 100644 --- a/src/f32-vbinary/f32-vmin.h +++ b/src/f32-vbinary/f32-vmin.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmin_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmin_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmin_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmin_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmin_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmin_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vminc.h b/src/f32-vbinary/f32-vminc.h index ae3b8cbfa15..80d45392c4d 100644 --- a/src/f32-vbinary/f32-vminc.h +++ b/src/f32-vbinary/f32-vminc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vminc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vminc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vminc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vminc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vminc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vminc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmul.h b/src/f32-vbinary/f32-vmul.h index 437721a8d80..05090b84382 100644 --- a/src/f32-vbinary/f32-vmul.h +++ b/src/f32-vbinary/f32-vmul.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmul_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, 
xnn_f32_vmul_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmulc.h b/src/f32-vbinary/f32-vmulc.h index ad45df4c31c..e1242b33274 100644 --- a/src/f32-vbinary/f32-vmulc.h +++ b/src/f32-vbinary/f32-vmulc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmulc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmulc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vprelu.h b/src/f32-vbinary/f32-vprelu.h index f990211c6b9..30582c742c5 100644 --- a/src/f32-vbinary/f32-vprelu.h +++ b/src/f32-vbinary/f32-vprelu.h @@ -28,6 +28,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vprelu_ukernel__sse41_u4, 4 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vprelu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vprelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vprelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vprelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vprelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -46,11 +49,15 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u16, 16, fals #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vbinary/f32-vpreluc.h b/src/f32-vbinary/f32-vpreluc.h index 20ed57d1a87..7c7653222a4 100644 --- a/src/f32-vbinary/f32-vpreluc.h +++ b/src/f32-vbinary/f32-vpreluc.h @@ -28,6 +28,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vpreluc_ukernel__sse41_u4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vpreluc_ukernel__sse41_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vpreluc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vpreluc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vpreluc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vpreluc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -46,11 +49,15 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u16, 16, fal #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, 
xnn_f32_vpreluc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__scalar_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vbinary/f32-vrcopysignc.h b/src/f32-vbinary/f32-vrcopysignc.h index 3c2389e91ff..00c5e4aad23 100644 --- a/src/f32-vbinary/f32-vrcopysignc.h +++ b/src/f32-vbinary/f32-vrcopysignc.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u8, 8 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vrdivc.h b/src/f32-vbinary/f32-vrdivc.h index 586ea49ede9..bf15c2a0f15 100644 --- a/src/f32-vbinary/f32-vrdivc.h +++ b/src/f32-vbinary/f32-vrdivc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__sse_u4, 4, false, float, stru XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vrpreluc.h b/src/f32-vbinary/f32-vrpreluc.h index 041043ad8f6..69ee06b5d0e 100644 --- a/src/f32-vbinary/f32-vrpreluc.h +++ b/src/f32-vbinary/f32-vrpreluc.h @@ -28,6 +28,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrpreluc_ukernel__sse41_u4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrpreluc_ukernel__sse41_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
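/*
 * The PReLU tables touched above gain u2 and u8 unroll factors for the scalar
 * and wasm variants.  As an illustrative scalar reference (not the generated
 * ukernels, and assuming the usual vbinary naming where a trailing "c" means
 * the second operand is a single broadcast value and a leading "r" swaps the
 * operands):
 */
#include <stddef.h>

static void f32_vprelu_ref(size_t n, const float* x, const float* slope, float* y) {
  for (size_t i = 0; i < n; i++) {
    y[i] = x[i] < 0.0f ? x[i] * slope[i] : x[i];  /* per-element slope */
  }
}

static void f32_vpreluc_ref(size_t n, const float* x, float slope, float* y) {
  for (size_t i = 0; i < n; i++) {
    y[i] = x[i] < 0.0f ? x[i] * slope : x[i];  /* one slope for all elements */
  }
}

static void f32_vrpreluc_ref(size_t n, const float* slope, float x, float* y) {
  /* Reversed-operand variant: the broadcast value is the input, the vector
   * carries the slopes. */
  for (size_t i = 0; i < n; i++) {
    y[i] = x < 0.0f ? x * slope[i] : x;
  }
}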
XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrpreluc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrpreluc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrpreluc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrpreluc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -46,11 +49,15 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u16, 16, fa #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vbinary/f32-vrsubc.h b/src/f32-vbinary/f32-vrsubc.h index 2fcda551f07..9c599ecd861 100644 --- a/src/f32-vbinary/f32-vrsubc.h +++ b/src/f32-vbinary/f32-vrsubc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__sse_u4, 4, false, float, stru XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsqrdiff.h b/src/f32-vbinary/f32-vsqrdiff.h index b5dbbf2df05..5b3dc78de66 100644 --- a/src/f32-vbinary/f32-vsqrdiff.h +++ b/src/f32-vbinary/f32-vsqrdiff.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiff_ukernel__sse_u4, 4, false, float, st XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiff_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiff_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiff_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiff_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiff_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsqrdiffc.h b/src/f32-vbinary/f32-vsqrdiffc.h index 1aa224f208a..04bf92ae335 100644 --- a/src/f32-vbinary/f32-vsqrdiffc.h +++ b/src/f32-vbinary/f32-vsqrdiffc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiffc_ukernel__sse_u4, 4, false, float, s XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiffc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiffc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiffc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiffc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiffc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsub.h b/src/f32-vbinary/f32-vsub.h index 5208f2d904d..1679792dfc6 100644 --- a/src/f32-vbinary/f32-vsub.h +++ b/src/f32-vbinary/f32-vsub.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, 
xnn_f32_vsub_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsub_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsubc.h b/src/f32-vbinary/f32-vsubc.h index 477f39a5edb..cde2d65a090 100644 --- a/src/f32-vbinary/f32-vsubc.h +++ b/src/f32-vbinary/f32-vsubc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsubc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsubc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vclamp/f32-vclamp.h b/src/f32-vclamp/f32-vclamp.h index cdfb0ac46ba..a801e8efe4a 100644 --- a/src/f32-vclamp/f32-vclamp.h +++ b/src/f32-vclamp/f32-vclamp.h @@ -34,6 +34,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u4, 4, false, float, unio XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-velu/f32-velu.h b/src/f32-velu/f32-velu.h index c5ade61212d..7de1d9e3055 100644 --- a/src/f32-velu/f32-velu.h +++ b/src/f32-velu/f32-velu.h @@ -126,6 +126,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u56 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u64, 64, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u72, 72, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u80, 80, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) +#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_u16, 16, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_u32, 32, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_u48, 48, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) diff --git a/src/f32-vgelu/f32-vgelu.h b/src/f32-vgelu/f32-vgelu.h index a22c8591bcc..8c60243f2bd 100644 --- a/src/f32-vgelu/f32-vgelu.h +++ b/src/f32-vgelu/f32-vgelu.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vhswish/f32-vhswish.h b/src/f32-vhswish/f32-vhswish.h index 440323861ba..fa31e9d5b92 100644 --- a/src/f32-vhswish/f32-vhswish.h +++ b/src/f32-vhswish/f32-vhswish.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u8, 8, fa XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u8, 8, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vlog/f32-vlog.h b/src/f32-vlog/f32-vlog.h index d479e100cb6..c3b5c18673b 100644 --- a/src/f32-vlog/f32-vlog.h +++ b/src/f32-vlog/f32-vlog.h @@ -40,6 +40,9 
@@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vlrelu/f32-vlrelu.h b/src/f32-vlrelu/f32-vlrelu.h index c7b6de61334..4c26184f8a3 100644 --- a/src/f32-vlrelu/f32-vlrelu.h +++ b/src/f32-vlrelu/f32-vlrelu.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vlrelu_ukernel__sse41_u4, 4 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vlrelu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vlrelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vlrelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlrelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlrelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vneg/f32-vneg.h b/src/f32-vneg/f32-vneg.h index 18fc3c56686..491c2491e77 100644 --- a/src/f32-vneg/f32-vneg.h +++ b/src/f32-vneg/f32-vneg.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u12, 12, false, float, str XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
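/*
 * Sketch of how these declaration tables are typically consumed (hypothetical
 * consumer code, not XNNPACK source): the includer defines
 * XNN_UKERNEL_WITH_PARAMS, includes the header, and gets one table row per
 * entry.  The "#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)"
 * guards added throughout remove the AVX512F rows at compile time when AVX512F
 * support is disabled, while the arch_flags column (e.g. xnn_arch_x86_avx512f)
 * still gates the surviving rows at runtime against the detected CPU features.
 * The feature-bit value and row layout below are stand-ins, not the real ones.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ukernel_row {
  uint64_t arch_flags;   /* CPU features the kernel needs (0 = none) */
  const char* name;      /* kernel identifier, stringified for this sketch */
  size_t batch_tile;     /* elements handled per loop iteration */
};

#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
                                datatype, params_type, init_params)           \
  { (uint64_t) (arch_flags), #ukernel, (size_t) (batch_tile) },

#define xnn_arch_x86_avx512f UINT64_C(0x100)  /* stand-in feature bit */

static const struct ukernel_row f32_vneg_rows[] = {
  /* In XNNPACK this block would be: #include "src/f32-vneg/f32-vneg.h" */
  XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, NULL)
  XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, NULL)
};
#undef XNN_UKERNEL_WITH_PARAMS

int main(void) {
  const uint64_t cpu_features = 0;  /* pretend AVX512F is absent */
  for (size_t i = 0; i < sizeof(f32_vneg_rows) / sizeof(f32_vneg_rows[0]); i++) {
    if ((f32_vneg_rows[i].arch_flags & ~cpu_features) == 0) {
      printf("usable: %s (batch tile %zu)\n", f32_vneg_rows[i].name,
             f32_vneg_rows[i].batch_tile);
    }
  }
  return 0;
}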
XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vrelu/f32-vrelu.h b/src/f32-vrelu/f32-vrelu.h index 462dd311111..d6642db19ce 100644 --- a/src/f32-vrelu/f32-vrelu.h +++ b/src/f32-vrelu/f32-vrelu.h @@ -33,6 +33,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndd.h b/src/f32-vrnd/f32-vrndd.h index 4abe7dc7bc7..87da6ddad84 100644 --- a/src/f32-vrnd/f32-vrndd.h +++ b/src/f32-vrnd/f32-vrndd.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u4, 4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndne.h b/src/f32-vrnd/f32-vrndne.h index af62e53fe59..4cf05f41dcc 100644 --- a/src/f32-vrnd/f32-vrndne.h +++ b/src/f32-vrnd/f32-vrndne.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u4, 4 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if 
XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndu.h b/src/f32-vrnd/f32-vrndu.h index 7cb276255da..efaeb9977bb 100644 --- a/src/f32-vrnd/f32-vrndu.h +++ b/src/f32-vrnd/f32-vrndu.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u4, 4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndz.h b/src/f32-vrnd/f32-vrndz.h index 96da3c85cd9..30fa23a9f6b 100644 --- a/src/f32-vrnd/f32-vrndz.h +++ b/src/f32-vrnd/f32-vrndz.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u4, 4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrsqrt/f32-vrsqrt.h b/src/f32-vrsqrt/f32-vrsqrt.h index a56caf31b23..2e4e841e6aa 100644 --- a/src/f32-vrsqrt/f32-vrsqrt.h +++ b/src/f32-vrsqrt/f32-vrsqrt.h @@ -41,6 +41,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, 32, false, 
float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, 64, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) diff --git a/src/f32-vsigmoid/f32-vsigmoid.h b/src/f32-vsigmoid/f32-vsigmoid.h index c1098dea0b8..106a5dc2586 100644 --- a/src/f32-vsigmoid/f32-vsigmoid.h +++ b/src/f32-vsigmoid/f32-vsigmoid.h @@ -187,6 +187,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u72, 72, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) diff --git a/src/f32-vsqr/f32-vsqr.h b/src/f32-vsqr/f32-vsqr.h index e9a4542bb72..2a876685664 100644 --- a/src/f32-vsqr/f32-vsqr.h +++ b/src/f32-vsqr/f32-vsqr.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u12, 12, false, float, str XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u48, 48, false, 
float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vsqrt/f32-vsqrt.h b/src/f32-vsqrt/f32-vsqrt.h index 4696076506c..f036b68c58f 100644 --- a/src/f32-vsqrt/f32-vsqrt.h +++ b/src/f32-vsqrt/f32-vsqrt.h @@ -45,6 +45,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, 48, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) diff --git a/src/f32-vtanh/f32-vtanh.h b/src/f32-vtanh/f32-vtanh.h index 0d447b7082a..7db0a58f588 100644 --- a/src/f32-vtanh/f32-vtanh.h +++ b/src/f32-vtanh/f32-vtanh.h @@ -45,6 +45,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u24, 24, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u48, 48, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) diff --git a/src/microparams-init.c b/src/microparams-init.c index 2d7398c0f84..df4014b7d7e 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -2064,13 +2064,9 @@ size_t xnn_init_qs8_mul_minmax_rndnu_neon_params( size_t xnn_init_f16_qs8_cvt_scalar_params( struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max) + int8_t output_zero_point) { params->scalar.scale = scale; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; params->scalar.output_zero_point = output_zero_point; return 
sizeof(params->scalar); } @@ -2078,14 +2074,10 @@ size_t xnn_init_f16_qs8_cvt_scalar_params( size_t xnn_init_f32_qs8_cvt_scalar_params( struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max) + int8_t output_zero_point) { params->scalar.scale = scale; params->scalar.output_zero_point = (int16_t) output_zero_point; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; return sizeof(params->scalar); } @@ -2120,14 +2112,10 @@ size_t xnn_init_qu8_reduce_minmax_scalar_params( size_t xnn_init_f32_qu8_cvt_scalar_params( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max) + uint8_t output_zero_point) { params->scalar.scale = scale; params->scalar.output_zero_point = (int16_t) output_zero_point; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; return sizeof(params->scalar); } diff --git a/src/operator-run.c b/src/operator-run.c index ec9d1cb0a9f..9bec626c35b 100644 --- a/src/operator-run.c +++ b/src/operator-run.c @@ -1562,19 +1562,6 @@ void xnn_compute_resize_bilinear_chw( context->input_channel_stride); } -void xnn_compute_prelu( - const struct prelu_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_start, - size_t batch_range) -{ - const size_t x_stride = context->x_stride; - const size_t y_stride = context->y_stride; - const void* x = (const void*) ((uintptr_t) context->x + x_stride * batch_start); - void* y = (void*) ((uintptr_t) context->y + y_stride * batch_start); - - context->ukernel(batch_range, context->n, x, x_stride, context->w, y, y_stride); -} - void xnn_compute_pad_5d( const struct pad_context context[restrict XNN_MIN_ELEMENTS(1)], size_t i, size_t j, size_t k, size_t l, size_t m) @@ -2197,7 +2184,7 @@ void xnn_compute_contiguous_reduce( context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&s32_f32_cvt_params); struct xnn_f32_qs8_cvt_params cvt_params; - xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); + xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else if (context->u32_f32_cvt_ukernel) { @@ -2206,7 +2193,7 @@ void xnn_compute_contiguous_reduce( context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&u32_f32_cvt_params); struct xnn_f32_qu8_cvt_params cvt_params; - xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point, 0, UINT8_MAX); + xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else { @@ -2278,7 +2265,7 @@ void xnn_compute_discontiguous_reduce( context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&s32_f32_cvt_params); struct xnn_f32_qs8_cvt_params cvt_params; - 
xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); + xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else if (context->u32_f32_cvt_ukernel) { @@ -2287,7 +2274,7 @@ context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&u32_f32_cvt_params); struct xnn_f32_qu8_cvt_params cvt_params; - xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point, 0, UINT8_MAX); + xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else { @@ -2323,7 +2310,7 @@ void xnn_compute_f16_qd8_convert( context->quantization_params[batch_index] = xnn_f16_qd8_asymmetric_quantization_params(minmax[0], minmax[1], &f16_scale); struct xnn_f16_qs8_cvt_params params; - context->init_params(&params, f16_scale, context->quantization_params[batch_index].zero_point, INT8_MIN, INT8_MAX); + context->init_params(&params, f16_scale, context->quantization_params[batch_index].zero_point); context->convert_ukernel(n, input, output, &params); } @@ -2342,7 +2329,7 @@ void xnn_compute_f32_qd8_convert( context->quantization_params[batch_index] = xnn_f32_qd8_asymmetric_quantization_params(minmax[0], minmax[1]); struct xnn_f32_qs8_cvt_params params; - context->init_params(&params, 1.0f / context->quantization_params[batch_index].inv_scale, context->quantization_params[batch_index].zero_point, INT8_MIN, INT8_MAX); + context->init_params(&params, 1.0f / context->quantization_params[batch_index].inv_scale, context->quantization_params[batch_index].zero_point); context->convert_ukernel(n, input, output, &params); } diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index 99f71611b08..f8a77c842a4 100644 --- a/src/operators/binary-elementwise-nd.c +++ b/src/operators/binary-elementwise-nd.c @@ -141,6 +141,15 @@ static const struct xnn_binary_elementwise_config* init_config( default: return NULL; } + case xnn_binary_prelu: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vprelu_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vprelu_config(); + default: + return NULL; + } default: return NULL; } diff --git a/src/operators/prelu-nc.c b/src/operators/prelu-nc.c deleted file mode 100644 index 50e96c237c0..00000000000 --- a/src/operators/prelu-nc.c +++ /dev/null @@ -1,336 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
- -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/allocator.h" -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/compute.h" -#include "xnnpack/config-types.h" -#include "xnnpack/config.h" -#include "xnnpack/log.h" -#include "xnnpack/math.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator-utils.h" -#include "xnnpack/operator.h" -#include "xnnpack/pack.h" -#include "xnnpack/params.h" -#include "pthreadpool.h" - -static enum xnn_status create_prelu_nc( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const void* negative_slope, - uint32_t flags, - uint32_t log2_weights_element_size, - xnn_pack_prelu_w_fn pack_prelu_w, - enum xnn_operator_type operator_type, - const struct xnn_prelu_config* prelu_config, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out) -{ - xnn_operator_t prelu_op = NULL; - enum xnn_status status = xnn_status_uninitialized; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to setup %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(operator_type)); - return xnn_status_uninitialized; - } - - status = xnn_status_invalid_parameter; - - if (slope_channels == 0) { - xnn_log_error( - "failed to create %s operator with %zu slope channels: number of slope channels must be non-zero", - xnn_operator_type_to_string(operator_type), slope_channels); - goto error; - } - - if (input_channels != slope_channels && slope_channels != 1) { - xnn_log_error( - "failed to create %s operator with input channels of %zu: " - "slope channels (%zu) must be either equal to the number input channels or 1", - xnn_operator_type_to_string(operator_type), slope_channels, input_channels); - goto error; - } - - if (input_stride < input_channels) { - xnn_log_error( - "failed to create %s operator with input element stride of %zu: " - "stride must be at least as large as the number of input channels (%zu)", - xnn_operator_type_to_string(operator_type), input_stride, input_channels); - goto error; - } - - if (output_stride < input_channels) { - xnn_log_error( - "failed to create %s operator with output element stride of %zu: " - "stride must be at least as large as the number of input channels (%zu)", - xnn_operator_type_to_string(operator_type), output_stride, input_channels); - goto error; - } - - status = xnn_status_out_of_memory; - - prelu_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); - if (prelu_op == NULL) { - xnn_log_error( - "failed to allocate %zu bytes for %s operator descriptor", - sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type)); - goto error; - } - - prelu_op->input_pixel_stride = input_stride; - prelu_op->output_pixel_stride = output_stride; - - prelu_op->weights_cache = weights_cache; - - const size_t packed_weights_size = (input_channels << log2_weights_element_size) + XNN_EXTRA_BYTES; - const size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT); - void* weights_ptr = xnn_get_pointer_to_write_weights(prelu_op, aligned_total_weights_size, 0); - xnn_log_debug("allocated %zu bytes for packed weights in %s operator", - aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); - - pack_prelu_w(input_channels, slope_channels, negative_slope, weights_ptr); - - if (use_weights_cache(prelu_op)) { - struct xnn_weights_cache_look_up_key cache_key; - cache_key.seed = 
murmur_hash3(weights_ptr, aligned_total_weights_size, /*seed=*/7); - cache_key.kernel = negative_slope; - cache_key.bias = NULL; - prelu_op->packed_weights.offset = xnn_look_up_or_insert_weights_cache( - prelu_op->weights_cache, &cache_key, weights_ptr, aligned_total_weights_size); - } - - prelu_op->channels = input_channels; - - prelu_op->type = operator_type; - prelu_op->flags = flags; - prelu_op->prelu_config = prelu_config; - - prelu_op->state = xnn_run_state_invalid; - - *prelu_op_out = prelu_op; - return xnn_status_success; - -error: - xnn_delete_operator(prelu_op); - return status; -} - - -enum xnn_status xnn_create_prelu_nc_f16( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const void* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out) -{ - xnn_pack_prelu_w_fn pack_prelu_w = (xnn_pack_prelu_w_fn) xnn_pack_f16_prelu_w; - if (flags & XNN_FLAG_FP32_STATIC_WEIGHTS) { - pack_prelu_w = (xnn_pack_prelu_w_fn) xnn_pack_f32_to_f16_prelu_w; - } - - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_prelu_nc_f16)); - return xnn_status_unsupported_hardware; - } - - return create_prelu_nc( - input_channels, slope_channels, input_stride, - output_stride, negative_slope, flags, - /*log2_weights_element_size=*/XNN_LOG2_SIZEOF_HALF, - pack_prelu_w, - xnn_operator_type_prelu_nc_f16, - prelu_config, - /*code_cache=*/code_cache, - /*weights_cache=*/weights_cache, - prelu_op_out); -} - -enum xnn_status xnn_create_prelu_nc_f32( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const float* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out) -{ - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - if (prelu_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_prelu_nc_f32)); - return xnn_status_unsupported_hardware; - } - - return create_prelu_nc( - input_channels, slope_channels, input_stride, - output_stride, negative_slope, flags, - /*log2_weights_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - (xnn_pack_prelu_w_fn) xnn_pack_f32_prelu_w, - xnn_operator_type_prelu_nc_f32, - prelu_config, - /*code_cache=*/code_cache, - /*weights_cache=*/weights_cache, - prelu_op_out); -} - -static enum xnn_status reshape_prelu_nc( - xnn_operator_t prelu_op, - enum xnn_operator_type expected_operator_type, - size_t batch_size, - uint32_t log2_element_size, - pthreadpool_t threadpool) -{ - if (prelu_op->type != expected_operator_type) { - xnn_log_error("failed to reshape operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(expected_operator_type), - xnn_operator_type_to_string(prelu_op->type)); - return xnn_status_invalid_parameter; - } - prelu_op->state = xnn_run_state_invalid; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to reshape %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(expected_operator_type)); - return xnn_status_uninitialized; - } - - if (batch_size == 0) { - prelu_op->state = xnn_run_state_skip; - return xnn_status_success; - } - - const struct 
xnn_prelu_config* prelu = prelu_op->prelu_config; - - const size_t input_channels = prelu_op->channels; - prelu_op->context.prelu = (struct prelu_context) { - .n = input_channels << log2_element_size, - .x_stride = prelu_op->input_pixel_stride << log2_element_size, - .w = packed_weights(prelu_op), - .y_stride = prelu_op->output_pixel_stride << log2_element_size, - .ukernel = prelu->ukernel, - }; - - size_t batch_tile = batch_size; - const size_t num_threads = pthreadpool_get_threads_count(threadpool); - if (num_threads > 1) { - const size_t target_tiles_per_thread = 5; - const size_t max_batch_tile = divide_round_up(batch_size, num_threads * target_tiles_per_thread); - if (max_batch_tile < batch_tile) { - const uint32_t row_tile = prelu->row_tile; - batch_tile = min(batch_tile, divide_round_up(batch_tile, max_batch_tile * row_tile) * row_tile); - } - } - - prelu_op->compute[0].type = xnn_parallelization_type_1d_tile_1d; - prelu_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_prelu; - prelu_op->compute[0].range[0] = batch_size; - prelu_op->compute[0].tile[0] = batch_tile; - prelu_op->state = xnn_run_state_needs_setup; - - return xnn_status_success; -} - -enum xnn_status xnn_reshape_prelu_nc_f16( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool) -{ - return reshape_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f16, - batch_size, /*log2_element_size=*/XNN_LOG2_SIZEOF_HALF, - threadpool); -} - -enum xnn_status xnn_reshape_prelu_nc_f32( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool) -{ - return reshape_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f32, - batch_size, /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - threadpool); -} - -static enum xnn_status setup_prelu_nc( - xnn_operator_t prelu_op, - enum xnn_operator_type expected_operator_type, - const float* input, - float* output) -{ - if (prelu_op->type != expected_operator_type) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(expected_operator_type), - xnn_operator_type_to_string(prelu_op->type)); - return xnn_status_invalid_parameter; - } - - if (prelu_op->weights_cache != NULL && !xnn_weights_cache_is_finalized(prelu_op->weights_cache)) { - xnn_log_error("failed to setup %s operator: weights cache is not finalized", - xnn_operator_type_to_string(expected_operator_type)); - return xnn_status_invalid_state; - } - - switch (prelu_op->state) { - case xnn_run_state_skip: - return xnn_status_success; - case xnn_run_state_invalid: - xnn_log_error( - "failed to setup %s operator: operator has not been reshaped yet", - xnn_operator_type_to_string(prelu_op->type)); - return xnn_status_invalid_state; - case xnn_run_state_needs_setup: - // Operator has been reshaped, but not setup, continue with setup. - case xnn_run_state_ready: - // Operator has been reshaped, and we are setting up with different pointers. 
- break; - } - - prelu_op->context.prelu.x = input; - prelu_op->context.prelu.y = output; - prelu_op->state = xnn_run_state_ready; - - return xnn_status_success; -} - -enum xnn_status xnn_setup_prelu_nc_f16( - xnn_operator_t prelu_op, - const void* input, - void* output) -{ - return setup_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_prelu_nc_f32( - xnn_operator_t prelu_op, - const float* input, - float* output) -{ - return setup_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f32, - input, output); -} diff --git a/src/operators/rope-nthc.c b/src/operators/rope-nthc.c index 5de450de70f..43d4e33e8fc 100644 --- a/src/operators/rope-nthc.c +++ b/src/operators/rope-nthc.c @@ -22,7 +22,6 @@ #include "pthreadpool.h" static enum xnn_status create_rope_nthc( - size_t max_tokens, uint32_t flags, enum xnn_operator_type operator_type, const struct xnn_cmul_config* config, @@ -39,13 +38,6 @@ static enum xnn_status create_rope_nthc( status = xnn_status_invalid_parameter; - if (max_tokens == 0) { - xnn_log_error( - "failed to create %s operator with %zu max tokens: maximum number of tokens must be non-zero", - xnn_operator_type_to_string(operator_type), max_tokens); - goto error; - } - status = xnn_status_out_of_memory; rope_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); @@ -56,8 +48,6 @@ static enum xnn_status create_rope_nthc( goto error; } - rope_op->max_tokens = max_tokens; - rope_op->type = operator_type; rope_op->flags = flags; rope_op->cmul_config = config; @@ -73,7 +63,6 @@ static enum xnn_status create_rope_nthc( } enum xnn_status xnn_create_rope_nthc_f16( - size_t max_tokens, uint32_t flags, xnn_operator_t* rope_op_out) { @@ -85,7 +74,6 @@ enum xnn_status xnn_create_rope_nthc_f16( } return create_rope_nthc( - max_tokens, flags, xnn_operator_type_rope_nthc_f16, config, @@ -93,7 +81,6 @@ enum xnn_status xnn_create_rope_nthc_f16( } enum xnn_status xnn_create_rope_nthc_f32( - size_t max_tokens, uint32_t flags, xnn_operator_t* rope_op_out) { @@ -105,7 +92,6 @@ enum xnn_status xnn_create_rope_nthc_f32( } return create_rope_nthc( - max_tokens, flags, xnn_operator_type_rope_nthc_f32, config, @@ -138,13 +124,6 @@ static enum xnn_status reshape_rope_nthc( return xnn_status_invalid_parameter; } - if (tokens > rope_op->max_tokens) { - xnn_log_error( - "failed to reshape %s operator with %zu tokens: number of tokens can not exceed the maximum %zu", - xnn_operator_type_to_string(rope_op->type), tokens, rope_op->max_tokens); - return xnn_status_invalid_parameter; - } - if (heads == 0) { xnn_log_error( "failed to reshape %s operator with %zu heads: number of heads must be non-zero", diff --git a/src/operators/softmax-nc.c b/src/operators/softmax-nc.c index 75e57edc9c9..9e9fbb49101 100644 --- a/src/operators/softmax-nc.c +++ b/src/operators/softmax-nc.c @@ -21,6 +21,7 @@ #include "xnnpack/config-types.h" #include "xnnpack/config.h" #include "xnnpack/log.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c index 2270934695a..a0d9764227b 100644 --- a/src/operators/unary-elementwise-nc.c +++ b/src/operators/unary-elementwise-nc.c @@ -18,6 +18,7 @@ #include "xnnpack/config-types.h" #include "xnnpack/config.h" #include "xnnpack/log.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" @@ -516,8 +517,6 
@@ enum xnn_status xnn_create_convert_nc_f32_f16( enum xnn_status xnn_create_convert_nc_f32_qs8( float output_scale, int8_t output_zero_point, - int8_t output_min, - int8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out) { @@ -528,19 +527,12 @@ enum xnn_status xnn_create_convert_nc_f32_qs8( return xnn_status_invalid_parameter; } - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qs8), output_min, output_max); - return xnn_status_invalid_parameter; - } - const struct xnn_unary_elementwise_config* f32_to_qs8_cvt_config = xnn_init_f32_to_qs8_cvt_config(); struct xnn_f32_qs8_cvt_params params; if XNN_LIKELY(f32_to_qs8_cvt_config != NULL) { assert(f32_to_qs8_cvt_config->init.f32_qs8_cvt != NULL); - f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point, output_min, output_max); + f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point); } return create_unary_elementwise_nc( @@ -619,8 +611,6 @@ enum xnn_status xnn_create_convert_nc_f32_qp8(uint32_t flags, enum xnn_status xnn_create_convert_nc_f32_qu8( float output_scale, uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out) { @@ -631,19 +621,12 @@ enum xnn_status xnn_create_convert_nc_f32_qu8( return xnn_status_invalid_parameter; } - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qu8), output_min, output_max); - return xnn_status_invalid_parameter; - } - const struct xnn_unary_elementwise_config* f32_to_qu8_cvt_config = xnn_init_f32_to_qu8_cvt_config(); struct xnn_f32_qu8_cvt_params params; if XNN_LIKELY(f32_to_qu8_cvt_config != NULL) { assert(f32_to_qu8_cvt_config->init.f32_qu8_cvt != NULL); - f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point, output_min, output_max); + f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point); } return create_unary_elementwise_nc( @@ -3420,7 +3403,7 @@ enum xnn_status xnn_run_convert_nc_f32_qs8( struct xnn_f32_qs8_cvt_params params; if XNN_LIKELY(f32_to_qs8_cvt_config != NULL) { assert(f32_to_qs8_cvt_config->init.f32_qs8_cvt != NULL); - f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point, INT8_MIN, INT8_MAX); + f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point); } return run_unary_elementwise_nc( @@ -3458,7 +3441,7 @@ enum xnn_status xnn_run_convert_nc_f32_qu8( struct xnn_f32_qu8_cvt_params params; if XNN_LIKELY(f32_to_qu8_cvt_config != NULL) { assert(f32_to_qu8_cvt_config->init.f32_qu8_cvt != NULL); - f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point, 0, UINT8_MAX); + f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point); } return run_unary_elementwise_nc( diff --git a/src/packing.cc b/src/packing.cc index 39f81b97dbe..19cf4e357af 100644 --- a/src/packing.cc +++ b/src/packing.cc @@ -999,21 +999,11 @@ void xnn_pack_f32_qc4w_gemm_goi_w( } while (--g != 0); } -void xnn_pack_f32_gemm_gio_w( - size_t g, - size_t nc, - size_t kc, - size_t nr, - size_t kr, - size_t sr, - size_t k_stride, - const float* k, - const
float* b, - const void* scale, - float* packed_weights, - size_t extra_bytes, - const void* params) -{ +void xnn_pack_f32_gemm_gio_w(size_t g, size_t nc, size_t kc, size_t nr, + size_t kr, size_t sr, size_t k_stride, + const float* k, const float* b, const void* scale, + float* packed_weights, size_t extra_bytes, + const void* params) { assert(g != 0); assert(nr >= sr); assert(k != nullptr); @@ -1026,20 +1016,39 @@ void xnn_pack_f32_gemm_gio_w( copy_bias(b, nr_block_start, nr_block_size, packed_weights); packed_weights += nr; - for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) { - for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { - const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); - for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { - const size_t kc_idx = kc_begin + kr_block_offset; - if (kc_idx < kc) { - packed_weights[kr_block_offset] = k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + // Special case for trivial packings. + if (skr == 1) { + for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start++) { + const size_t kc_idx = round_down_po2(kr_block_start, skr); + if (kc_idx < kc) { + std::copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size, + packed_weights); + } + packed_weights += nr; + } + + } else { + for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); + kr_block_start += kr) { + for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; + nr_block_offset++) { + const size_t kc_begin = + round_down_po2(kr_block_start, skr) + + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); + for (size_t kr_block_offset = 0; kr_block_offset < kr; + kr_block_offset++) { + const size_t kc_idx = kc_begin + kr_block_offset; + if (kc_idx < kc) { + packed_weights[kr_block_offset] = + k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + } } + packed_weights += kr; } - packed_weights += kr; + packed_weights += (nr - nr_block_size) * kr; } - packed_weights += (nr - nr_block_size) * kr; } - packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes); + packed_weights = (float*)((uintptr_t)packed_weights + extra_bytes); } k += nc * kc; if XNN_UNPREDICTABLE(b != nullptr) { @@ -1048,21 +1057,11 @@ void xnn_pack_f32_gemm_gio_w( } while (--g != 0); } -void xnn_pack_f16_gemm_gio_w( - size_t g, - size_t nc, - size_t kc, - size_t nr, - size_t kr, - size_t sr, - size_t k_stride, - const uint16_t* k, - const uint16_t* b, - const void* scale, - uint16_t* packed_weights, - size_t extra_bytes, - const void* params) -{ +void xnn_pack_f16_gemm_gio_w(size_t g, size_t nc, size_t kc, size_t nr, + size_t kr, size_t sr, size_t k_stride, + const uint16_t* k, const uint16_t* b, + const void* scale, uint16_t* packed_weights, + size_t extra_bytes, const void* params) { assert(g != 0); assert(nr >= sr); assert(k != nullptr); @@ -1075,20 +1074,39 @@ void xnn_pack_f16_gemm_gio_w( copy_bias(b, nr_block_start, nr_block_size, packed_weights); packed_weights += nr; - for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) { - for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { - const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); - for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { - const size_t kc_idx = kc_begin + 
kr_block_offset; - if (kc_idx < kc) { - packed_weights[kr_block_offset] = k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + // Special case for trivial packings. + if (skr == 1) { + for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start++) { + const size_t kc_idx = round_down_po2(kr_block_start, skr); + if (kc_idx < kc) { + std::copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size, + packed_weights); + } + packed_weights += nr; + } + + } else { + for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); + kr_block_start += kr) { + for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; + nr_block_offset++) { + const size_t kc_begin = + round_down_po2(kr_block_start, skr) + + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); + for (size_t kr_block_offset = 0; kr_block_offset < kr; + kr_block_offset++) { + const size_t kc_idx = kc_begin + kr_block_offset; + if (kc_idx < kc) { + packed_weights[kr_block_offset] = + k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + } } + packed_weights += kr; } - packed_weights += kr; + packed_weights += (nr - nr_block_size) * kr; } - packed_weights += (nr - nr_block_size) * kr; } - packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); + packed_weights = (uint16_t*)((uintptr_t)packed_weights + extra_bytes); } k += nc * kc; if XNN_UNPREDICTABLE(b != nullptr) { @@ -4918,66 +4936,6 @@ void xnn_pack_f32_to_f16_vmulcaddc_w( } } -void xnn_pack_f32_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* s, - float* packed_weights) -{ - assert(s != nullptr); - assert(packed_weights != nullptr); - assert(slope_channels == input_channels || slope_channels == 1); - - if (slope_channels == 1) { - do { - *packed_weights++ = *s; - } while (--input_channels != 0); - } else { - memcpy(packed_weights, s, slope_channels * sizeof(float)); - } -} - -void xnn_pack_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const uint16_t* s, - uint16_t* packed_weights) -{ - assert(s != nullptr); - assert(packed_weights != nullptr); - assert(slope_channels == input_channels || slope_channels == 1); - - if (slope_channels == 1) { - do { - *packed_weights++ = *s; - } while (--input_channels != 0); - } else { - memcpy(packed_weights, s, slope_channels * sizeof(uint16_t)); - } -} - -void xnn_pack_f32_to_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* s, - xnn_float16* packed_weights) -{ - assert(s != nullptr); - assert(packed_weights != nullptr); - assert(slope_channels == input_channels || slope_channels == 1); - - if (slope_channels == 1) { - xnn_float16 v = xnn_float16_from_float(*s); - for (size_t i = 0; i < input_channels; ++i) { - packed_weights[i] = v; - } - } else { - do { - *packed_weights++ = xnn_float16_from_float(*s++); - } while (--input_channels != 0); - } -} - void xnn_analyze_f32_spmm_w( size_t group_output_channels, size_t group_input_channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c index e6029459e8b..85821458413 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c index d7928ea6468..80cc8e91903 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c index d7937594ffe..56500d0f7e9 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c index ec59cdca556..3a9906a607b 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c index 7731e2f25e6..50b4797ff28 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c index d3c2c5521cd..6f1849d5446 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c index 24bf861851f..fa6ca75f77c 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c index 32f23a8dfba..d29196ad15e 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c index 034d45ac4c7..36f998d1f28 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c index 2b851a0cbdb..47aa2941df2 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c index 5dad72ff47d..310d6c783e7 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c index b49a44b2c28..9db83f1952a 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c index a9dc7ab79aa..c78c217ddf1 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c index f0a9de4edab..57ae6e8e40f 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c index 4a8871c3381..342843bf218 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c index cf4e8b04060..294ab54efdf 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c index a9229e75b65..0619458007d 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p1c__scalar( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c index c3c235de507..5308c1cfcdd 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c index f50ad5aa52b..ea8c111e9ad 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c index 225cbbd3d05..6df79d4d0d9 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c index 464a40ef827..e25551b317a 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c index 0bf0c224060..115300ce509 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p2c__scalar( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c index c310670400c..ba767d9b162 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c index 9c66853c11b..54145636610 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c index 85a538f1e2c..546582e8cb7 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c index e3b1e6315c2..670aded74fd 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c index 65e67b5810e..bafbd4d6f23 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p4c__scalar( size_t channels, diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h index e595ae13676..87381179a7c 100644 --- a/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h +++ b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h @@ -91,13 +91,16 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9 XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, int8_t, void, 
int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h index f3773aad7bf..bd63b39ce4b 100644 --- a/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h +++ b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h @@ -72,11 +72,14 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c_ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_UNIPASS(0, 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-dwconv/unipass-scalar.c.in b/src/qs8-dwconv/unipass-scalar.c.in index 6e7d0370ac1..a744988b8e4 100644 --- a/src/qs8-dwconv/unipass-scalar.c.in +++ b/src/qs8-dwconv/unipass-scalar.c.in @@ -10,9 +10,13 @@ $assert DATATYPE in ["QC8", "QS8", "QU8"] #include $if VARIANT == "LRINTF": #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" $if CHANNEL_TILE % 4 != 0: #include "xnnpack/unaligned.h" diff --git a/src/qs8-f32-vcvt/qs8-f32-vcvt.h b/src/qs8-f32-vcvt/qs8-f32-vcvt.h index e7de4a98713..0df6e93b83f 100644 --- a/src/qs8-f32-vcvt/qs8-f32-vcvt.h +++ b/src/qs8-f32-vcvt/qs8-f32-vcvt.h @@ -40,11 +40,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u8 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u16, 16, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u24, 24, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u32, 32, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u16, 16, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u32, 32, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u48, 48, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u64, 64, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__wasmsimd_u8, 8, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c index cfe2ec2166a..7e36fddd660 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = 
_mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 
+= 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = 
_mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c index 67091f212a8..c12845e16c8 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, 
*(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = 
_mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, 
*(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 
1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c index 6e2926d419d..e1bbd2b1556 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 
= _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 
= _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, 
*(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); 
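Note on the rewrite above (and repeated in the remaining qs8-packw files below): the old KC-remainder path built each vector with chains of _mm256_insert_epi32/epi16/epi8 gated on the bits of k and three separate pointer bumps, while the new path loads 8 bytes per row unconditionally with unaligned_load_u64, blends four rows into the four 64-bit lanes of a __m256i, masks off the bytes past k, and advances each pointer by k in one step. Because the 8-byte loads can read up to 7 bytes past the last valid weight, the kernels are now annotated XNN_OOB_READS; the mask keeps the over-read bytes out of the packed data and the dpbusd byte-sum accumulation. The following is a rough standalone sketch of that idea, not taken from the generated sources: it assumes AVX2, uses a memcpy-based load_u64 in place of XNNPACK's unaligned_load_u64 helper, and the pack4_remainder helper name is purely illustrative.

/*
 * Sketch: pack the k-byte remainders (1 <= k <= 7) of four rows into one
 * __m256i, one 64-bit lane per row, with the bytes at positions k..7 of
 * each lane zeroed. Assumes AVX2; load_u64/pack4_remainder are hypothetical
 * names used only for this example.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static inline uint64_t load_u64(const void* p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));  /* unaligned 8-byte load; may read past the valid k bytes */
  return v;
}

static inline __m256i pack4_remainder(const int8_t* w0, const int8_t* w1,
                                      const int8_t* w2, const int8_t* w3,
                                      size_t k) {
  /* All-ones in every 64-bit lane, shifted right so only the low k bytes survive. */
  const __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * 8);
  __m256i v = _mm256_set1_epi64x((int64_t) load_u64(w0));                        /* lane 0 */
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w1)), 0x0C);   /* lane 1 */
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w2)), 0x30);   /* lane 2 */
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w3)), 0xC0);   /* lane 3 */
  return _mm256_and_si256(v, vmask);  /* discard the over-read bytes */
}

In the kernels the masked vectors then go straight into _mm256_dpbusd_epi32 / _mm256_dpbusd_avx_epi32 against the all-ones vector vone, so the accumulated byte sums of the remainder columns only ever see the valid k bytes of each row.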
diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c index 8ddf82e3b88..1891bbff0f8 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = 
_mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = 
_mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = 
_mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, 
*(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = 
_mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c index f3f8d2672b4..4a60755892c 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, 
*(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - 
v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 
+= 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c index 7919e5d3561..c25d8598d6c 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const 
int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const 
int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c index 687273d457f..44354081b3f 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t 
*)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const 
int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 
0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c index d7b555ff552..acbbdfdc876 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - 
v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 
= _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c new file mode 100644 index 00000000000..881ea5af2a2 --- /dev/null +++ 
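The hunks above rewrite the KC-remainder path (k = 1..7) of the x8c8 AVX-VNNI packing kernels. Instead of branching on the low bits of k and pulling the tail in with _mm256_insert_epi32/epi16/epi8, each row is now fetched with a single unaligned 8-byte load, four rows are blended into one 256-bit register, and a mask built by shifting all-ones right by (8 - k) bytes per 64-bit lane clears whatever was read past the k valid weights; the row pointers then advance by k in one step instead of by 4/2/1. Because the 8-byte loads may touch bytes beyond the row, the kernel signatures gain the XNN_OOB_READS annotation. A minimal sketch of the per-row building block, with load_u64() standing in for XNNPACK's unaligned_load_u64() and the helper name purely illustrative:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Unaligned 8-byte load; stands in for XNNPACK's unaligned_load_u64().
static inline uint64_t load_u64(const void* address) {
  uint64_t value;
  memcpy(&value, address, sizeof(value));
  return value;
}

// Gather the 1..7 remaining weights of four rows into one __m256i,
// zeroing the bytes that were over-read past each row's k valid weights.
static inline __m256i load_remainder_x4(const int8_t* w0, const int8_t* w1,
                                        const int8_t* w2, const int8_t* w3,
                                        size_t k) {
  // All-ones, shifted right by (8 - k) bytes in every 64-bit lane:
  // keeps the low k bytes of each lane, clears the over-read tail.
  const __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * 8);
  // Broadcast each row's 8 bytes, then blend rows into 64-bit lanes 0..3.
  __m256i v = _mm256_set1_epi64x((int64_t) load_u64(w0));
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w1)), 0x0C);
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w2)), 0x30);
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w3)), 0xC0);
  return _mm256_and_si256(v, vmask);
}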
b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c @@ -0,0 +1,370 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/packw.h" + + +void xnn_qs8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) XNN_OOB_READS +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + const v128_t vone = wasm_i8x16_splat(1); + const v128_t vzero = wasm_i32x4_splat(0); + XNN_FORCE_REALIZATION(vone); + XNN_FORCE_REALIZATION(vzero); + int8_t* out = (int8_t*) packed_weights; + const uint32_t* b = (const uint32_t*) bias; + const uint32_t izp = (uint32_t) (params ? (((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + v128_t vzeropoint = wasm_i32x4_splat((int32_t) izp); + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + const v128_t vb0 = wasm_v128_load(b + 0); + wasm_v128_store(out + 0, vb0); + const v128_t vb1 = wasm_v128_load(b + 4); + wasm_v128_store(out + 16, vb1); + b += 8; + } else { + wasm_v128_store(out + 0, vzero); + wasm_v128_store(out + 16, vzero); + } + out += 8 * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + const int8_t* w2 = w1 + kc; + const int8_t* w3 = w2 + kc; + const int8_t* w4 = w3 + kc; + const int8_t* w5 = w4 + kc; + const int8_t* w6 = w5 + kc; + const int8_t* w7 = w6 + kc; + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 16; k -= 16) { + v128_t v0_01 = wasm_v128_load(w0); + v128_t v1_01 = wasm_v128_load(w1); + v128_t v2_01 = wasm_v128_load(w2); + v128_t v3_01 = wasm_v128_load(w3); + v128_t v4_01 = wasm_v128_load(w4); + v128_t v5_01 = wasm_v128_load(w5); + v128_t v6_01 = wasm_v128_load(w6); + v128_t v7_01 = wasm_v128_load(w7); + + v128_t v01_0 = wasm_i64x2_shuffle(v0_01, v1_01, 0, 2); + v128_t v01_1 = wasm_i64x2_shuffle(v0_01, v1_01, 1, 3); + v128_t v23_0 = wasm_i64x2_shuffle(v2_01, v3_01, 0, 2); + v128_t v23_1 = wasm_i64x2_shuffle(v2_01, v3_01, 1, 3); + v128_t v45_0 = wasm_i64x2_shuffle(v4_01, v5_01, 0, 2); + v128_t v45_1 = wasm_i64x2_shuffle(v4_01, v5_01, 1, 3); + v128_t v67_0 = wasm_i64x2_shuffle(v6_01, v7_01, 0, 2); + v128_t v67_1 = wasm_i64x2_shuffle(v6_01, v7_01, 1, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_0, vone, vacc01); + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_1, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_0, vone, vacc23); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_1, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_0, vone, vacc45); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_1, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_0, vone, vacc67); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_1, vone, 
vacc67); + + wasm_v128_store(out + 0, v01_0); + wasm_v128_store(out + 16, v23_0); + wasm_v128_store(out + 32, v45_0); + wasm_v128_store(out + 48, v67_0); + + wasm_v128_store(out + 64, v01_1); + wasm_v128_store(out + 80, v23_1); + wasm_v128_store(out + 96, v45_1); + wasm_v128_store(out + 112, v67_1); + + w0 += 16; + w1 += 16; + w2 += 16; + w3 += 16; + w4 += 16; + w5 += 16; + w6 += 16; + w7 += 16; + out += 128; + } + + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder 1..KR-1 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + 
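In the new wasmrelaxedsimd kernel above, the KR=8 layout pairs rows two at a time: in the 8-at-a-time step each row's next 8 weights are broadcast into both 64-bit lanes with wasm_v128_load64_splat, and wasm_i64x2_shuffle(v0, v1, 0, 3) keeps row n in the low lane and row n+1 in the high lane of the stored 128-bit group. A sketch of that pairing step (helper name illustrative, assumes at least 8 readable bytes per row):

#include <stdint.h>
#include <wasm_simd128.h>

// Pair the next 8 weights of two rows into one 128-bit group:
// low 64-bit lane = row0 bytes, high 64-bit lane = row1 bytes.
static inline v128_t pack_row_pair(const int8_t* row0, const int8_t* row1) {
  const v128_t v0 = wasm_v128_load64_splat(row0);  // row0 bytes in both lanes
  const v128_t v1 = wasm_v128_load64_splat(row1);  // row1 bytes in both lanes
  // Shuffle indices 0..1 select from v0, 2..3 from v1; picking 0 and 3
  // keeps row0 low and row1 high.
  return wasm_i64x2_shuffle(v0, v1, 0, 3);
}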
wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w7; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + assert(n >= 1 && n <= 7); + + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((uint32_t*) out) = *b++; + out += sizeof(uint32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((uint32_t*) out) = 0; + out += sizeof(uint32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + if XNN_UNPREDICTABLE(n < 2) { + w1 = w0; + } + const int8_t* w2 = w1 + kc; + if XNN_UNPREDICTABLE(n <= 2) { + w2 = w1; + } + const int8_t* w3 = w2 + kc; + if XNN_UNPREDICTABLE(n < 4) { + w3 = w2; + } + const int8_t* w4 = w3 + kc; + if XNN_UNPREDICTABLE(n <= 4) { + w4 = w3; + } + const int8_t* w5 = w4 + kc; + if XNN_UNPREDICTABLE(n < 6) { + w5 = w4; + } + const int8_t* w6 = w5 + kc; + if XNN_UNPREDICTABLE(n <= 6) { + w6 = w5; + } + const int8_t* w7 = w6 + kc; + if XNN_UNPREDICTABLE(n < 8) { + w7 = w6; + } + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); 
+ vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-packw/qs8-packw.h b/src/qs8-packw/qs8-packw.h index 21088b9e763..24fc85abf05 100644 --- a/src/qs8-packw/qs8-packw.h +++ b/src/qs8-packw/qs8-packw.h @@ -38,3 +38,8 @@ XNN_QS8_UKERNEL(xnn_arch_x86_avx256vnni, xnn_qs8_packw_gemm_goi_ukernel_x16c8__a XNN_QS8_UKERNEL(xnn_arch_x86_avx256vnni, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni, 16, 8, 1, 8, 1, 128) XNN_QS8_UKERNEL(xnn_arch_x86_avx256vnni, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm, 16, 8, 1, 8, 1, 128) #endif + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_QS8_UKERNEL(0, xnn_qs8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd, 8, 8, 1, 8, 1, 0) +XNN_QS8_UKERNEL(0, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd, 8, 8, 1, 8, 1, 128) +#endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c index 0c68f57a3a6..94407ec9dcb 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c index 1bcc3f67e72..e14f5a01bf8 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
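The new x8c8 wasmrelaxedsimd packers (registered just above in qs8-packw.h under XNN_ARCH_WASMRELAXEDSIMD) also fold the input-zero-point correction into the packed bias: while copying weights they accumulate each row's column sum with wasm_i32x4_relaxed_dot_i8x16_i7x16_add against a vector of ones, multiply the sums by the input zero point, and subtract the result from the bias words written at the start of the block, so the matching GEMM can skip the per-row zero-point term at run time. A scalar reference of that arithmetic, with the function name and layout illustrative rather than XNNPACK API:

#include <stddef.h>
#include <stdint.h>

// bias'[n] = bias[n] - input_zero_point * sum_k weights[n][k]
static void adjust_packed_bias(int32_t* packed_bias,   // one entry per row
                               const int8_t* weights,  // rows x kc, row-major
                               size_t rows, size_t kc,
                               int32_t input_zero_point) {
  for (size_t n = 0; n < rows; n++) {
    int32_t ksum = 0;
    for (size_t k = 0; k < kc; k++) {
      ksum += (int32_t) weights[n * kc + k];  // column sum of row n
    }
    packed_bias[n] -= input_zero_point * ksum;
  }
}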
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c index 0430a0900b7..f97e7b295ba 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c index b18c4ad52b8..c1c4dbe9d39 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c index cc41ca8cd2d..06a39ff90e8 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c index c762f870f3d..a0ff8a51dec 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c index addfe971325..ce932cc4423 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c index d47df6e330d..6d78543d04c 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c index a810c5da499..b8c4d56a4d2 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c index d4c4c571d15..499f670fb42 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c index e43c03815f1..0c8925e1854 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c index 43d3632997a..0a0fbaf8b04 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c index b47e330f6fd..ca03406b25f 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c index 5cf82696e6c..71afee0f507 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c index 68a72856c97..c1ad1d55718 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c index bf7c0aae4e8..3065b899216 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c index 1fcf3acecfc..64a064d3c71 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c index 5030009c408..2fc424d2acc 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c index e0971ab002c..8dfa3021075 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c index d8ebc7fffec..07cb8cbe367 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c index 97a16c1b510..d132f26b328 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c index 9b0a23e3d71..db08261c7ff 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c index 1a00cc7b501..7b22beb58d3 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c index a14d6fece87..0c77229c630 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c index 65ba30f7590..1f648aab110 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c index 570c2a0b0c6..e126d0484a4 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c index 708fd85cbdc..a67aaba7bf4 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
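The qs8-qc8w dwconv hunks in this stretch all make the same edit to the generated sources: alongside the existing headers they add "xnnpack/common.h" and "xnnpack/microparams.h" plus the standard integer/size headers, so each translation unit names the types and params structs it uses instead of relying on transitive includes. A representative include block after the change is sketched below; the quoted xnnpack headers come straight from the hunks, while the exact angle-bracket system headers are an assumption:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "xnnpack/common.h"
#include "xnnpack/dwconv.h"
#include "xnnpack/math.h"
#include "xnnpack/microparams.h"
#include "xnnpack/unaligned.h"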
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c index 428bcb6b067..ba90d727b0f 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c index 877d026b07b..5acc1a34a33 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h index 2787de0b4a4..429fc86b505 100644 --- a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h +++ b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h @@ -127,13 +127,16 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) 
XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h index 5281e1b2243..81174b6ffbc 100644 --- a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h +++ b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h @@ -118,12 +118,15 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25 XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32, 32, false, 32, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && 
(XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c index 4b40552f071..e66654f7dbf 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, 
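The two dwconv declaration headers above stop listing the AVX512SKX micro-kernels under the bare x86/x86_64 block: those entries now sit in their own section that is compiled only when the build also enables AVX512SKX. The resulting pattern, with one entry reproduced from the unipass header:

#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32, 32, false, 32, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params)
// ... remaining AVX512SKX entries ...
#endif  // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)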
*(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - 
v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t 
*)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = 
_mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * 
sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c index 357f922f971..5072bde3fab 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const 
int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 
28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = 
_mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, 
*(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c index a63ac278656..0c7edfb76fa 100644 --- 
a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = 
_mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = 
_mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, 
*(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = 
_mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c index f11674bc7a3..15041c4f0ba 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = 
_mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - 
v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = 
_mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c index 56153c32777..7eca738746c 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC 
remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = 
_mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c index 53ca5f206d1..dfac2ab8885 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, 
*(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 
= _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 
12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c index 6bf982a99e5..ef9ea4338d0 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 
4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, 
*(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c index 9c6c539df97..6fdc2cc2940 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = 
_mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += 
k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = 
_mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c new file mode 100644 index 00000000000..f479706779f --- /dev/null +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c @@ -0,0 +1,370 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/packw.h" + + +void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) XNN_OOB_READS +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + const v128_t vone = wasm_i8x16_splat(1); + const v128_t vzero = wasm_i32x4_splat(0); + XNN_FORCE_REALIZATION(vone); + XNN_FORCE_REALIZATION(vzero); + int8_t* out = (int8_t*) packed_weights; + const uint32_t* b = (const uint32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 128): 128); + v128_t vzeropoint = wasm_i32x4_splat((int32_t) izp); + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + const v128_t vb0 = wasm_v128_load(b + 0); + wasm_v128_store(out + 0, vb0); + const v128_t vb1 = wasm_v128_load(b + 4); + wasm_v128_store(out + 16, vb1); + b += 8; + } else { + wasm_v128_store(out + 0, vzero); + wasm_v128_store(out + 16, vzero); + } + out += 8 * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + const int8_t* w2 = w1 + kc; + const int8_t* w3 = w2 + kc; + const int8_t* w4 = w3 + kc; + const int8_t* w5 = w4 + kc; + const int8_t* w6 = w5 + kc; + const int8_t* w7 = w6 + kc; + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 16; k -= 16) { + v128_t v0_01 = wasm_v128_load(w0); + v128_t v1_01 = wasm_v128_load(w1); + v128_t v2_01 = wasm_v128_load(w2); + v128_t v3_01 = wasm_v128_load(w3); + v128_t v4_01 = wasm_v128_load(w4); + v128_t v5_01 = wasm_v128_load(w5); + v128_t v6_01 = wasm_v128_load(w6); + v128_t v7_01 = wasm_v128_load(w7); + + v128_t v01_0 = wasm_i64x2_shuffle(v0_01, v1_01, 0, 2); + v128_t v01_1 = wasm_i64x2_shuffle(v0_01, v1_01, 1, 3); + v128_t v23_0 = wasm_i64x2_shuffle(v2_01, v3_01, 0, 2); + v128_t v23_1 = wasm_i64x2_shuffle(v2_01, v3_01, 1, 3); + v128_t v45_0 = wasm_i64x2_shuffle(v4_01, v5_01, 0, 2); + v128_t v45_1 = wasm_i64x2_shuffle(v4_01, v5_01, 1, 3); + v128_t v67_0 = wasm_i64x2_shuffle(v6_01, v7_01, 0, 2); + v128_t v67_1 = wasm_i64x2_shuffle(v6_01, v7_01, 1, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_0, vone, vacc01); + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_1, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_0, vone, vacc23); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_1, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_0, vone, vacc45); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_1, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_0, vone, vacc67); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_1, vone, vacc67); + + wasm_v128_store(out + 0, v01_0); + wasm_v128_store(out + 16, v23_0); + wasm_v128_store(out + 32, v45_0); + wasm_v128_store(out + 48, v67_0); + + wasm_v128_store(out + 64, v01_1); + wasm_v128_store(out + 80, v23_1); + wasm_v128_store(out + 96, v45_1); + wasm_v128_store(out + 112, v67_1); + + w0 += 16; + w1 += 16; + w2 += 16; + w3 += 16; + w4 += 16; + w5 += 16; + w6 += 16; + w7 += 16; + out += 128; + } + + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder 1..KR-1 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w7; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + assert(n >= 1 && n <= 7); + + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((uint32_t*) out) = *b++; + out += sizeof(uint32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((uint32_t*) out) = 0; + out += sizeof(uint32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + if XNN_UNPREDICTABLE(n < 2) { + w1 = w0; + } + const int8_t* w2 = w1 + kc; + if XNN_UNPREDICTABLE(n <= 2) { + w2 = w1; + } + const int8_t* w3 = w2 + kc; + if XNN_UNPREDICTABLE(n < 4) { + w3 = w2; + } + const int8_t* w4 = w3 + kc; + if XNN_UNPREDICTABLE(n <= 4) { + w4 = w3; + } + const int8_t* w5 = w4 + kc; + if XNN_UNPREDICTABLE(n < 6) { + w5 = w4; + } + const int8_t* w6 = w5 + kc; + if XNN_UNPREDICTABLE(n <= 6) { + w6 = w5; + } + const int8_t* w7 = w6 + kc; + if XNN_UNPREDICTABLE(n < 8) { + w7 = w6; + } + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = 
wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-requantization/qs8-requantization-rndna-neon.c b/src/qs8-requantization/qs8-requantization-rndna-neon.c deleted file mode 100644 index 4a1ac199959..00000000000 --- 
a/src/qs8-requantization/qs8-requantization-rndna-neon.c +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__neon( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const int32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - -#if defined(__aarch64__) - const int32x4_t vmultiplier = vdupq_n_s32(multiplier); -#else - const int32x2_t vmultiplier = vdup_n_s32(multiplier); -#endif - const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point); - const int64x2_t vshift = vdupq_n_s64(-shift); - const int8x16_t vqmin = vdupq_n_s8(qmin); - const int8x16_t vqmax = vdupq_n_s8(qmax); - for (; n != 0; n -= 16) { - const int32x4_t x = vld1q_s32(input); - const int32x4_t y = vld1q_s32(input + 4); - const int32x4_t z = vld1q_s32(input + 8); - const int32x4_t w = vld1q_s32(input + 12); - input += 16; - - const uint32x4_t x_neg_mask = vcltq_s32(x, vmovq_n_s32(0)); - const uint32x4_t y_neg_mask = vcltq_s32(y, vmovq_n_s32(0)); - const uint32x4_t z_neg_mask = vcltq_s32(z, vmovq_n_s32(0)); - const uint32x4_t w_neg_mask = vcltq_s32(w, vmovq_n_s32(0)); - -#if defined(__aarch64__) - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier)); - const int64x2_t x23_product = vmull_high_s32(x, vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier)); - const int64x2_t y23_product = vmull_high_s32(y, vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier)); - const int64x2_t z23_product = vmull_high_s32(z, vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier)); - const int64x2_t w23_product = vmull_high_s32(w, vmultiplier); -#else - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vmultiplier); - const int64x2_t x23_product = vmull_s32(vget_high_s32(x), vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vmultiplier); - const int64x2_t y23_product = vmull_s32(vget_high_s32(y), vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vmultiplier); - const int64x2_t z23_product = vmull_s32(vget_high_s32(z), vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vmultiplier); - const int64x2_t w23_product = vmull_s32(vget_high_s32(w), vmultiplier); -#endif - -#if defined(__aarch64__) - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_high_s32(x23_product, vreinterpretq_s32_u32(x_neg_mask)); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_high_s32(y23_product, vreinterpretq_s32_u32(y_neg_mask)); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, 
vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_high_s32(z23_product, vreinterpretq_s32_u32(z_neg_mask)); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_high_s32(w23_product, vreinterpretq_s32_u32(w_neg_mask)); -#else - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_s32(x23_product, vreinterpret_s32_u32(vget_high_u32(x_neg_mask))); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_s32(y23_product, vreinterpret_s32_u32(vget_high_u32(y_neg_mask))); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_s32(z23_product, vreinterpret_s32_u32(vget_high_u32(z_neg_mask))); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_s32(w23_product, vreinterpret_s32_u32(vget_high_u32(w_neg_mask))); -#endif - - const int64x2_t x01_scaled = vrshlq_s64(x01_adjusted_product, vshift); - const int64x2_t x23_scaled = vrshlq_s64(x23_adjusted_product, vshift); - const int64x2_t y01_scaled = vrshlq_s64(y01_adjusted_product, vshift); - const int64x2_t y23_scaled = vrshlq_s64(y23_adjusted_product, vshift); - const int64x2_t z01_scaled = vrshlq_s64(z01_adjusted_product, vshift); - const int64x2_t z23_scaled = vrshlq_s64(z23_adjusted_product, vshift); - const int64x2_t w01_scaled = vrshlq_s64(w01_adjusted_product, vshift); - const int64x2_t w23_scaled = vrshlq_s64(w23_adjusted_product, vshift); - -#ifdef __aarch64__ - const int32x4_t x_scaled = vuzp1q_s32(vreinterpretq_s32_s64(x01_scaled), vreinterpretq_s32_s64(x23_scaled)); - const int32x4_t y_scaled = vuzp1q_s32(vreinterpretq_s32_s64(y01_scaled), vreinterpretq_s32_s64(y23_scaled)); - const int32x4_t z_scaled = vuzp1q_s32(vreinterpretq_s32_s64(z01_scaled), vreinterpretq_s32_s64(z23_scaled)); - const int32x4_t w_scaled = vuzp1q_s32(vreinterpretq_s32_s64(w01_scaled), vreinterpretq_s32_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point); - const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed); -#else - const int32x4_t x_scaled = vcombine_s32(vmovn_s64(x01_scaled), vmovn_s64(x23_scaled)); - const int32x4_t y_scaled = vcombine_s32(vmovn_s64(y01_scaled), vmovn_s64(y23_scaled)); - const int32x4_t z_scaled = vcombine_s32(vmovn_s64(z01_scaled), vmovn_s64(z23_scaled)); - const int32x4_t w_scaled = vcombine_s32(vmovn_s64(w01_scaled), vmovn_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point); - const int8x16_t xyzw_packed = vcombine_s8(vqmovn_s16(xy_packed), vqmovn_s16(zw_packed)); -#endif - - const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin); - - // AArch32 version: - // 4x VCLT.S32 Qd, Qm, #0 - // 8x VMULL.S32 Qd, Dm, Dn - // 8x VADDW.S32 Qd, Qm, Dn - // 8x VRSHL.S32 Qd, Qm, Qn - 
// 8x VMOVN.S64 Dd, Qm - // 4x VQMOVN.S32 Dd, Qm - // 2x VQADD.S16 Qd, Qm, Qn - // 2x VQMOVUN.S16 Dd, Qm - // 1x VMAX.U8 Qd, Qm, Qn - // 1x VMIN.U8 Qd, Qm, Qn - // --------------------- - // 46 instructions total - // - // AArch64 version: - // 4x CMLT Vd.4S, Vn.4S, #0 - // 4x SMULL Vd.2D, Vn.2S, Vm.2S - // 4x SMULL2 Vd.2D, Vn.4S, Vm.4S - // 4x SADDW Vd.2D, Vn.2D, Vm.2S - // 4x SADDW2 Vd.2D, Vn.2D, Vm.4S - // 8x SRSHL Vd.2D, Vn.2D, Vm.2D - // 4x UZP1 Vd.4S, Vn.4S, Vm.4S - // 2x SQXTN Vd.4H, Vn.4S - // 2x SQXTN2 Vd.8H, Vn.4S - // 2x SQADD Vd.8H, Vn.8H, Vm.8H - // 1x SQXTN Vd.8B, Vn.8H - // 1x SQXTN2 Vd.16B, Vn.8H - // 1x SMIN Vd.16B, Vn.16B, Vm.16B - // 1x SMAX Vd.16B, Vn.16B, Vm.16B - // --------------------- - // 42 instructions total - - vst1q_s8(output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c b/src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c deleted file mode 100644 index bc190737b57..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__scalar_signed64( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const int64_t rounding = INT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) qmin - (int32_t) zero_point; - const int32_t smax = (int32_t) qmax - (int32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute full 64-bit product of signed 32-bit factors. - // - // Note: multiplier can be treated as either signed or unsigned. - const int64_t x_product = (int64_t) x * (int64_t) multiplier; - const int64_t y_product = (int64_t) y * (int64_t) multiplier; - const int64_t z_product = (int64_t) z * (int64_t) multiplier; - const int64_t w_product = (int64_t) w * (int64_t) multiplier; - - // Adjust product before subsequent shift with rounding up to simulate shift with rounding away from zero. - const int64_t x_adjusted_product = x_product - (int64_t)(x < 0); - const int64_t y_adjusted_product = y_product - (int64_t)(y < 0); - const int64_t z_adjusted_product = z_product - (int64_t)(z < 0); - const int64_t w_adjusted_product = w_product - (int64_t)(w < 0); - - // Arithmetically shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up. - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD). 
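// A minimal standalone sketch (not taken from the kernels in this diff) of the
// rounding-away-from-zero ("rndna") requantization that the deleted scalar
// code implements; the fixed-point decomposition and the -1 adjustment for
// negative products are the same. It assumes an arithmetic right shift for
// negative int64_t values, which the library obtains via math_asr_s64().
#include <assert.h>
#include <stdint.h>
#include <string.h>

static int32_t rndna_requantize_one(int32_t x, float scale) {
  // Reinterpret the float: scale == multiplier * 2**-shift with
  // multiplier in [2**23, 2**24) and, for 0x1.0p-32f <= scale < 1.0f,
  // shift in [24, 56).
  uint32_t scale_bits;
  memcpy(&scale_bits, &scale, sizeof(scale_bits));
  const int64_t multiplier = (int64_t) ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000));
  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
  assert(shift >= 24 && shift < 56);
  const int64_t rounding = INT64_C(1) << (shift - 1);
  // Subtracting 1 from negative products turns the round-half-up shift below
  // into round-half-away-from-zero.
  const int64_t product = (int64_t) x * multiplier - (int64_t) (x < 0);
  return (int32_t) ((product + rounding) >> shift);
}
// Example: x = 100, scale = 0.25f gives multiplier = 0x800000, shift = 25,
// and a result of 25; x = -2 gives -0.5, which rounds away from zero to -1.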
- const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift); - const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift); - const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift); - const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (int8_t) x_biased; - output[1] = (int8_t) y_biased; - output[2] = (int8_t) z_biased; - output[3] = (int8_t) w_biased; - output += 4; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c b/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c deleted file mode 100644 index 0d8c27109b8..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__scalar_unsigned32( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const uint32_t rounding_hi = (uint32_t)(rounding >> 32); - const uint32_t rounding_lo = (uint32_t) rounding; - const uint32_t shift_minus_32 = shift - 32; - const int32_t smin = (int32_t) qmin - (int32_t) zero_point; - const int32_t smax = (int32_t) qmax - (int32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? 
(uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. - const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace - // 64-bit operations with 32-bit operations. - // - // To avoid full 64-bit addition we make use of three facts: - // - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set. - // - When 0x1.0p-32f <= scale < 0x1.0p-31f, then the non-zero bit in rounding is in the low 32 bits, and - // rounding is exactly 0x80000000 (2**31), because rounding is 2**(scale-1) and scale >= 32. In this case, - // addition of rounding can affect high 32 bits of the product only through overflow, which happens if - // low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition - // as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part - // of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the - // bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as - // (int32_t) (LOW(product) & LOW(rounding)) < 0 - // - When 0x1.0p-31f <= scale < 1.0f, then the non-zero bit is in the high 32 bits of rounding. We just need - // to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never - // overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(scale-1) <= 2**62. - // - // To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps: - // - Shift by 32, which can be implemented by extacting the high 32-bit word on 32-bit systems. - // - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result. - const uint32_t x_carry_lo = (uint32_t) ((int32_t) ((uint32_t) x_product & rounding_lo) < 0); - const uint32_t y_carry_lo = (uint32_t) ((int32_t) ((uint32_t) y_product & rounding_lo) < 0); - const uint32_t z_carry_lo = (uint32_t) ((int32_t) ((uint32_t) z_product & rounding_lo) < 0); - const uint32_t w_carry_lo = (uint32_t) ((int32_t) ((uint32_t) w_product & rounding_lo) < 0); - - const uint32_t x_product_hi = (uint32_t) (x_product >> 32); - const uint32_t y_product_hi = (uint32_t) (y_product >> 32); - const uint32_t z_product_hi = (uint32_t) (z_product >> 32); - const uint32_t w_product_hi = (uint32_t) (w_product >> 32); - - const uint32_t x_abs_scaled = (uint32_t) (x_product_hi + rounding_hi + x_carry_lo) >> shift_minus_32; - const uint32_t y_abs_scaled = (uint32_t) (y_product_hi + rounding_hi + y_carry_lo) >> shift_minus_32; - const uint32_t z_abs_scaled = (uint32_t) (z_product_hi + rounding_hi + z_carry_lo) >> shift_minus_32; - const uint32_t w_abs_scaled = (uint32_t) (w_product_hi + rounding_hi + w_carry_lo) >> shift_minus_32; - - // Copy the sign of input to scaled absolute input value. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? 
y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (int8_t) x_biased; - output[1] = (int8_t) y_biased; - output[2] = (int8_t) z_biased; - output[3] = (int8_t) w_biased; - output += 4; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c b/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c deleted file mode 100644 index 71d3a0a1f65..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__scalar_unsigned64( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) qmin - (int32_t) zero_point; - const int32_t smax = (int32_t) qmax - (int32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. 
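// Note on the sign-magnitude handling in these scalar kernels: the absolute
// value is taken as "(x >= 0) ? (uint32_t) x : -(uint32_t) x" rather than with
// abs()/-x because the negation is performed on an unsigned value. Unsigned
// negation is defined modulo 2**32, so INT32_MIN maps to 2**31 (its true
// magnitude), whereas negating INT32_MIN as a signed int would be undefined
// behaviour. The multiply/shift steps then stay in unsigned arithmetic and the
// sign is copied back only at the end.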
- const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD). - const uint32_t x_abs_scaled = (uint32_t) ((x_product + rounding) >> shift); - const uint32_t y_abs_scaled = (uint32_t) ((y_product + rounding) >> shift); - const uint32_t z_abs_scaled = (uint32_t) ((z_product + rounding) >> shift); - const uint32_t w_abs_scaled = (uint32_t) ((w_product + rounding) >> shift); - - // Copy the sign of input to scaled absolute input value. - // - // On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (int8_t) x_biased; - output[1] = (int8_t) y_biased; - output[2] = (int8_t) z_biased; - output[3] = (int8_t) w_biased; - output += 4; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-sse2.c b/src/qs8-requantization/qs8-requantization-rndna-sse2.c deleted file mode 100644 index a9158b17659..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-sse2.c +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
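// Note on the SSE2 kernel that follows: SSE2 has neither PABSD nor PSIGND, so
// absolute values are formed and signs restored with the xor/subtract
// identity, shown here in scalar form:
//   mask = (x < 0) ? -1 : 0;       // _mm_cmpgt_epi32(_mm_setzero_si128(), x)
//   abs  = (x ^ mask) - mask;      // xor flips the bits, subtracting -1 adds 1
//   res  = (scaled ^ mask) - mask; // the same identity re-applies the sign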
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__sse2( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) zero_point); - const __m128i vqmin = _mm_set1_epi8((short) qmin); - const __m128i vqmax = _mm_set1_epi8((short) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x); - const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y); - const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z); - const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w); - - const __m128i x_abs0123 = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask); - const __m128i y_abs0123 = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask); - const __m128i z_abs0123 = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask); - const __m128i w_abs0123 = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, 
vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sub_epi32(_mm_xor_si128(x_abs_scaled, x_neg_mask), x_neg_mask); - const __m128i y_scaled = _mm_sub_epi32(_mm_xor_si128(y_abs_scaled, y_neg_mask), y_neg_mask); - const __m128i z_scaled = _mm_sub_epi32(_mm_xor_si128(z_abs_scaled, z_neg_mask), z_neg_mask); - const __m128i w_scaled = _mm_sub_epi32(_mm_xor_si128(w_abs_scaled, w_neg_mask), w_neg_mask); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin); - const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin); - const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped); - - // 4x PXOR (setzero) - // 8x PSUBD - // 8x PXOR - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 2x PACKSSDW - // 2x PADDSW - // 2x PMAXSW - // 2x PMINSW - // 1x PACKSSWB - // --------------------- - // 63 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-sse41.c b/src/qs8-requantization/qs8-requantization-rndna-sse41.c deleted file mode 100644 index 1801e3642ca..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-sse41.c +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
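// Note on the SSE4.1 kernel that follows: it uses the 32-bit-multiplier form
// of the same decomposition, multiplier = (scale_bits << 8) | 0x80000000 with
// shift in [32, 64), and replaces the xor/subtract sign handling with
// PABSD/PSIGND (_mm_abs_epi32/_mm_sign_epi32).
// Worked example: scale = 0.25f has scale_bits = 0x3E800000, so
// multiplier = 0x80000000, shift = 127 + 31 - 125 = 33, and indeed
// 0x80000000 * 2**-33 == 0.25.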
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__sse41( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshiftlo = _mm_cvtsi32_si128((int) shift); - const __m128i vshifthi = _mm_cvtsi32_si128((int) shift - 32); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshiftlo); - const __m128i x_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(x_absmul13, vrounding), vshifthi); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshiftlo); - const __m128i y_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(y_absmul13, vrounding), vshifthi); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshiftlo); - const __m128i z_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(z_absmul13, vrounding), vshifthi); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshiftlo); - const __m128i w_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(w_absmul13, vrounding), vshifthi); - - const __m128i x_abs_scaled = _mm_blend_epi16(x_abs_scaled02, x_abs_scaled13, 0xCC); - const __m128i y_abs_scaled = _mm_blend_epi16(y_abs_scaled02, y_abs_scaled13, 0xCC); - const __m128i z_abs_scaled = _mm_blend_epi16(z_abs_scaled02, z_abs_scaled13, 0xCC); - const __m128i w_abs_scaled = _mm_blend_epi16(w_abs_scaled02, w_abs_scaled13, 0xCC); - - const 
__m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin); - - // 4x PABSD - // 4x PSHUFD - // 8x PMULUDQ - // 4x PSRLQ - // 4x PSRLD - // 8x PADDQ - // 4x PBLENDW - // 4x PSIGND - // 2x PACKSSDW - // 2x PADDSW - // 1x PACKSSWB - // 1x PMAXSB - // 1x PMINSB - // --------------------- - // 47 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-ssse3.c b/src/qs8-requantization/qs8-requantization-rndna-ssse3.c deleted file mode 100644 index 7ef62fca878..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-ssse3.c +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__ssse3( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const 
__m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin); - const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin); - const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped); - - // 4x PABSD - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 4x PSIGND - // 2x PACKSSDW - // 2x PADDSW - // 2x PMAXSW - // 2x PMINSW - // 1x PACKSSWB - // --------------------- - // 53 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-rsum/avxvnni.c.in b/src/qs8-rsum/avxvnni.c.in index a9360704950..3144dadd551 100644 --- a/src/qs8-rsum/avxvnni.c.in +++ b/src/qs8-rsum/avxvnni.c.in @@ -22,7 +22,7 @@ void xnn_qs8_rsum_ukernel__${ISA}_u${CHANNEL_TILE}${ACC_SUFFIX}( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); 
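// Note on the qs8-rsum hunks above and below: the only change is appending the
// XNN_OOB_READS annotation to each ukernel definition. In XNNPACK this
// annotation marks kernels that may read (but never write) a few bytes past
// the end of their input buffers, which lets sanitizer builds tolerate the
// wide vector loads used for the tail of the reduction; the generated per-ISA
// files below simply pick up the annotation from the avxvnni.c.in template.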
diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c index 1966a24f5a4..cad57ae03ae 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u128_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c index bd0696d9a69..b15d29b403c 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u128_acc4( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c index 40d68d7348f..875122eaebe 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u32( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c index 0bd11e29652..b14679e6fb7 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u64_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c index 36ab9b51333..12c1106f1a1 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u128_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c index 95eaa1d0152..8f8f83a184f 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u128_acc4( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c index 
9c9dbcc02d8..5f941b2e3aa 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u32( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c index 49c1e249ead..b4da645e710 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u64_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-vadd/qs8-vadd-minmax.h b/src/qs8-vadd/qs8-vadd-minmax.h index 7dc637cb967..b7733a26eac 100644 --- a/src/qs8-vadd/qs8-vadd-minmax.h +++ b/src/qs8-vadd/qs8-vadd-minmax.h @@ -50,9 +50,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u24, 24, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vadd_minmax_ukernel__wasmsimd_u8, 8, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) diff --git a/src/qs8-vaddc/qs8-vaddc-minmax.h b/src/qs8-vaddc/qs8-vaddc-minmax.h index 4b355506b4a..ad1d3378c1a 100644 --- a/src/qs8-vaddc/qs8-vaddc-minmax.h +++ b/src/qs8-vaddc/qs8-vaddc-minmax.h @@ -50,9 +50,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mu XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u24, 24, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u8, 8, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) diff --git a/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c index 07502a89c30..db71888fb26 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__avx_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c index 320d57cd955..678b0673069 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__avx_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c index 6cf502bc837..b3283a53465 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__avx_u8( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c index 8bf521a0793..424b59aba47 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__neon_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c index 0147208accc..f1e85ba2102 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__neon_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c index ff85fb858b9..ab486679d21 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__neon_u8( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c index 73a73e3011d..69cb0456dc5 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include 
"xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__scalar_u1( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c index 47bad8d6d64..133c27dec2f 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__scalar_u2( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c index 4f1bf9f7c36..9bd75968885 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__scalar_u4( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c index a2e32679b81..9ccf642d7e9 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c index 08496a330d4..0fdc1eb6304 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c index 217ce4dd5f2..3ede288bf8b 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__sse41_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c index bf1eb5a0ee8..35530dbc19a 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__sse41_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c index ab4d2aa9975..20469e949df 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__sse41_u8( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c index e68869b4c45..edd712abac1 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c index cc12ba79780..9aff74aaaaf 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include 
"xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c index ad4147be867..ee083d92a84 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__wasmsimd_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c index 551309852f1..6181ed75c96 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__wasmsimd_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c index 20a8a9e6487..7ccd5946192 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__wasmsimd_u8( diff --git a/src/qs8-vhswish/neon.c.in b/src/qs8-vhswish/neon.c.in index 5002bd892b9..f0efc5e3578 100644 --- a/src/qs8-vhswish/neon.c.in +++ b/src/qs8-vhswish/neon.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vhswish/scalar.c.in b/src/qs8-vhswish/scalar.c.in index 1e3cae3e2ec..fdc1e6d7c94 100644 --- a/src/qs8-vhswish/scalar.c.in +++ b/src/qs8-vhswish/scalar.c.in @@ -9,7 +9,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vhswish/sse2.c.in b/src/qs8-vhswish/sse2.c.in index bc6a64e18cb..926e7a04dbe 100644 --- a/src/qs8-vhswish/sse2.c.in +++ b/src/qs8-vhswish/sse2.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/sse4.c.in b/src/qs8-vhswish/sse4.c.in index 61c44b9f562..dff7aa0d1e3 100644 --- a/src/qs8-vhswish/sse4.c.in +++ b/src/qs8-vhswish/sse4.c.in @@ -13,7 +13,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vhswish/ssse3.c.in b/src/qs8-vhswish/ssse3.c.in index 62b9015227f..75307832f3b 100644 --- a/src/qs8-vhswish/ssse3.c.in +++ b/src/qs8-vhswish/ssse3.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/wasmsimd.c.in b/src/qs8-vhswish/wasmsimd.c.in index 3ad456c9ff2..adf3174a928 100644 --- a/src/qs8-vhswish/wasmsimd.c.in +++ b/src/qs8-vhswish/wasmsimd.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include 
"xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/armsimd32.c.in b/src/qs8-vlrelu/armsimd32.c.in index 19489f37365..65c520a527e 100644 --- a/src/qs8-vlrelu/armsimd32.c.in +++ b/src/qs8-vlrelu/armsimd32.c.in @@ -14,7 +14,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/avx2.c.in b/src/qs8-vlrelu/avx2.c.in index ff38529403a..2fe1c86a099 100644 --- a/src/qs8-vlrelu/avx2.c.in +++ b/src/qs8-vlrelu/avx2.c.in @@ -13,7 +13,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c index 2efa8844130..0c92404a11a 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__armsimd32_u4( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c index edb86301ce5..6149d36f0c7 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__armsimd32_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c index 4d48f8973ad..b414b773dbc 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx_u16( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c index 6b6a000d02f..029d5369874 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c index 3086f3b11e9..24345b6bc9a 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c index bf1c50718c6..4bdef541773 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx2_u16( diff --git 
a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c index 103a671075e..a691a97eb87 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx2_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c index e5e93e65d66..d20d0fc3ec9 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx2_u64( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c index fd89575a7e7..8ddd5a73ad9 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__neon_u16( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c index ad5ad976c69..2d001633e9e 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__neon_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c index c053ac1624e..bd2035afcd8 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__neon_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c index 7e9159c6b5f..2abe0424efb 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__rvv_u1v( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c index 1e14268b149..a51e57991f4 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__rvv_u2v( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c index ebf633a0fe0..5c6e2d46695 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_andxor_u1( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c index d2e392cf879..df20df9c56c 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_andxor_u2( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c 
b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c index 4344baf8141..d905e5bee90 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_andxor_u4( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c index 471d652eb7f..5f9c21e99f2 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_select_u1( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c index 6ae6793d00f..529450bd9b5 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_select_u2( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c index 2b283b3abd7..04f73cb758d 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_select_u4( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c index 520bba77505..1ac03c29fee 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c index 8305b457d0b..e25383cc0a0 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c index be708252465..a45744d6230 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__sse41_u16( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c index 5fb36ef81c2..25d8161e4e5 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__sse41_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c index 4aa4856d5bb..a93d4d11e32 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include 
"xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__sse41_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c index 735a71bf082..acad3770453 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c index 4980cf680fb..6d5741569c4 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/neon.c.in b/src/qs8-vlrelu/neon.c.in index 9ad887f9972..7f17862634b 100644 --- a/src/qs8-vlrelu/neon.c.in +++ b/src/qs8-vlrelu/neon.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/qs8-vlrelu.h b/src/qs8-vlrelu/qs8-vlrelu.h index c384e6b0d73..c0d5c3e3618 100644 --- a/src/qs8-vlrelu/qs8-vlrelu.h +++ b/src/qs8-vlrelu/qs8-vlrelu.h @@ -17,59 +17,59 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u64, 64, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, 
xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u64, 64, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) 
+XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u4, 4, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u4, 4, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u1, 1, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u2, 2, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u4, 4, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u1, 1, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u2, 2, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u4, 4, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u1, 1, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u2, 2, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u4, 4, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u1, 1, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u2, 2, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u4, 4, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/qs8-vlrelu/rvv.c.in b/src/qs8-vlrelu/rvv.c.in index fee935ed324..2d0bf1470a9 100755 --- a/src/qs8-vlrelu/rvv.c.in +++ b/src/qs8-vlrelu/rvv.c.in @@ -9,7 +9,7 @@ $assert DATATYPE in ["QS8", "QU8"] #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/scalar-andxor.c.in b/src/qs8-vlrelu/scalar-andxor.c.in index 3518adb4e23..6ce4a95e4a4 100644 --- a/src/qs8-vlrelu/scalar-andxor.c.in +++ b/src/qs8-vlrelu/scalar-andxor.c.in @@ -9,7 +9,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": 
"uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/scalar-select.c.in b/src/qs8-vlrelu/scalar-select.c.in index efe0dcd684f..3786a811f50 100644 --- a/src/qs8-vlrelu/scalar-select.c.in +++ b/src/qs8-vlrelu/scalar-select.c.in @@ -9,7 +9,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/sse2.c.in b/src/qs8-vlrelu/sse2.c.in index e2460673953..f825a6e0ac3 100644 --- a/src/qs8-vlrelu/sse2.c.in +++ b/src/qs8-vlrelu/sse2.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/sse4.c.in b/src/qs8-vlrelu/sse4.c.in index 1b8a415bca2..5cec07fd300 100644 --- a/src/qs8-vlrelu/sse4.c.in +++ b/src/qs8-vlrelu/sse4.c.in @@ -13,7 +13,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/ssse3.c.in b/src/qs8-vlrelu/ssse3.c.in index b71c09d05b3..5788f59c8e5 100644 --- a/src/qs8-vlrelu/ssse3.c.in +++ b/src/qs8-vlrelu/ssse3.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c index 86d44a0222c..e75dcc32f21 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c index 7e948c97a95..a5b70578988 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c index 025ae55081f..97bf0989f8e 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c index 9c11113fa64..7eb023b47a4 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c index bb929b1573e..c3f5251b63c 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c index bbaf53878aa..902ed641c6c 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c index 445d754bc5a..15985a3ba14 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c index 5c67ddc0476..c19ba42c44f 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c index bd19f753b58..b0f3c4a748a 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c index 14d66ab5298..f46751a6172 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c index aaa3b70875f..8bbdd03a3ea 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c index c93594a988e..7ae102c01ce 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c index 70de9952f2c..3884c87a98d 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c index 0e04aba98b0..f73c56c7f4f 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c index 89a648ae835..a0d0b8ac541 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c index d39d8f0d504..c64e8a6f206 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c index fcaf6b1ce78..e00f88a6900 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p1c__scalar( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c index 7336f5547fa..21fa3e4bf87 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c index 5a660c13bf6..fd70457629b 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c index 0baaf4ed0e2..bdc10caea70 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c index 40f39ee703e..f36f84fad11 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c index cadf0b45273..69f7b747400 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p2c__scalar( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c index 9a586d062c6..a9a229e4c09 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c index 89d1eb17be7..076dd656fde 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c index 643e55a7d55..37a0b640173 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c index cde61f2cafc..710c51f5c74 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c index 87ca4d6b3fd..0a2edcc6e52 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p4c__scalar( size_t channels, diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h index 667780c8ef7..25b35d8b4a4 100644 --- a/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h +++ b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h @@ -61,13 +61,16 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9 XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, 8, 8, 9, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, 8, 8, 9, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, uint8_t, void, int32_t, union 
xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h index 0a0cd7e224c..8db3b279bf8 100644 --- a/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h +++ b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h @@ -48,11 +48,14 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_UNIPASS(0, 
xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) diff --git a/src/qu8-f32-vcvt/qu8-f32-vcvt.h b/src/qu8-f32-vcvt/qu8-f32-vcvt.h index 3bcbc008f05..628eed7ec1e 100644 --- a/src/qu8-f32-vcvt/qu8-f32-vcvt.h +++ b/src/qu8-f32-vcvt/qu8-f32-vcvt.h @@ -40,11 +40,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u8 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u16, 16, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u24, 24, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u32, 32, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u16, 16, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u32, 32, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u48, 48, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u64, 64, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__wasmsimd_u8, 8, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) diff --git a/src/qu8-requantization/qu8-requantization-rndna-neon.c b/src/qu8-requantization/qu8-requantization-rndna-neon.c deleted file mode 100644 index 1178b1ff5cb..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-neon.c +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__neon( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const int32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - -#if defined(__aarch64__) - const int32x4_t vmultiplier = vdupq_n_s32(multiplier); -#else - const int32x2_t vmultiplier = vdup_n_s32(multiplier); -#endif - const int16x8_t vzero_point = vdupq_n_s16((int16_t)(uint16_t) zero_point); - const int64x2_t vshift = vdupq_n_s64(-shift); - const uint8x16_t vqmin = vdupq_n_u8(qmin); - const uint8x16_t vqmax = vdupq_n_u8(qmax); - for (; n != 0; n -= 16) { - const int32x4_t x = vld1q_s32(input); - const int32x4_t y = vld1q_s32(input + 4); - const int32x4_t z = vld1q_s32(input + 8); - const int32x4_t w = vld1q_s32(input + 12); - input += 16; - - const uint32x4_t x_neg_mask = vcltq_s32(x, vmovq_n_s32(0)); - const uint32x4_t y_neg_mask = vcltq_s32(y, vmovq_n_s32(0)); - const uint32x4_t z_neg_mask = vcltq_s32(z, vmovq_n_s32(0)); - const uint32x4_t w_neg_mask = vcltq_s32(w, vmovq_n_s32(0)); - -#if defined(__aarch64__) - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier)); - const int64x2_t x23_product = vmull_high_s32(x, vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier)); - const int64x2_t y23_product = vmull_high_s32(y, vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier)); - const int64x2_t z23_product = vmull_high_s32(z, vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier)); - const int64x2_t w23_product = vmull_high_s32(w, vmultiplier); -#else - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vmultiplier); - const int64x2_t x23_product = vmull_s32(vget_high_s32(x), vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vmultiplier); - const int64x2_t y23_product = vmull_s32(vget_high_s32(y), vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vmultiplier); - const int64x2_t z23_product = vmull_s32(vget_high_s32(z), vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vmultiplier); - const int64x2_t w23_product = vmull_s32(vget_high_s32(w), vmultiplier); -#endif - -#if defined(__aarch64__) - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_high_s32(x23_product, vreinterpretq_s32_u32(x_neg_mask)); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_high_s32(y23_product, vreinterpretq_s32_u32(y_neg_mask)); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_high_s32(z23_product, vreinterpretq_s32_u32(z_neg_mask)); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_high_s32(w23_product, 
vreinterpretq_s32_u32(w_neg_mask)); -#else - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_s32(x23_product, vreinterpret_s32_u32(vget_high_u32(x_neg_mask))); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_s32(y23_product, vreinterpret_s32_u32(vget_high_u32(y_neg_mask))); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_s32(z23_product, vreinterpret_s32_u32(vget_high_u32(z_neg_mask))); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_s32(w23_product, vreinterpret_s32_u32(vget_high_u32(w_neg_mask))); -#endif - - const int64x2_t x01_scaled = vrshlq_s64(x01_adjusted_product, vshift); - const int64x2_t x23_scaled = vrshlq_s64(x23_adjusted_product, vshift); - const int64x2_t y01_scaled = vrshlq_s64(y01_adjusted_product, vshift); - const int64x2_t y23_scaled = vrshlq_s64(y23_adjusted_product, vshift); - const int64x2_t z01_scaled = vrshlq_s64(z01_adjusted_product, vshift); - const int64x2_t z23_scaled = vrshlq_s64(z23_adjusted_product, vshift); - const int64x2_t w01_scaled = vrshlq_s64(w01_adjusted_product, vshift); - const int64x2_t w23_scaled = vrshlq_s64(w23_adjusted_product, vshift); - -#ifdef __aarch64__ - const int32x4_t x_scaled = vuzp1q_s32(vreinterpretq_s32_s64(x01_scaled), vreinterpretq_s32_s64(x23_scaled)); - const int32x4_t y_scaled = vuzp1q_s32(vreinterpretq_s32_s64(y01_scaled), vreinterpretq_s32_s64(y23_scaled)); - const int32x4_t z_scaled = vuzp1q_s32(vreinterpretq_s32_s64(z01_scaled), vreinterpretq_s32_s64(z23_scaled)); - const int32x4_t w_scaled = vuzp1q_s32(vreinterpretq_s32_s64(w01_scaled), vreinterpretq_s32_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point); - const uint8x16_t xyzw_packed = vqmovun_high_s16(vqmovun_s16(xy_packed), zw_packed); -#else - const int32x4_t x_scaled = vcombine_s32(vmovn_s64(x01_scaled), vmovn_s64(x23_scaled)); - const int32x4_t y_scaled = vcombine_s32(vmovn_s64(y01_scaled), vmovn_s64(y23_scaled)); - const int32x4_t z_scaled = vcombine_s32(vmovn_s64(z01_scaled), vmovn_s64(z23_scaled)); - const int32x4_t w_scaled = vcombine_s32(vmovn_s64(w01_scaled), vmovn_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point); - const uint8x16_t xyzw_packed = vcombine_u8(vqmovun_s16(xy_packed), vqmovun_s16(zw_packed)); -#endif - - const uint8x16_t xyzw_clamped = vmaxq_u8(vminq_u8(xyzw_packed, vqmax), vqmin); - - // AArch32 version: - // 4x VCLT.S32 Qd, Qm, #0 - // 8x VMULL.S32 Qd, Dm, Dn - // 8x VADDW.S32 Qd, Qm, Dn - // 8x VRSHL.S32 Qd, Qm, Qn - // 8x VMOVN.S64 Dd, Qm - // 4x VQMOVN.S32 Dd, Qm - // 2x VQADD.S16 Qd, Qm, Qn - // 2x VQMOVUN.S16 Dd, Qm - // 1x VMAX.U8 Qd, Qm, Qn - // 1x VMIN.U8 Qd, Qm, Qn - // --------------------- - // 46 instructions total - // - // AArch64 version: - // 4x CMLT Vd.4S, Vn.4S, #0 - // 4x SMULL Vd.2D, Vn.2S, Vm.2S - // 4x SMULL2 Vd.2D, 
Vn.4S, Vm.4S - // 4x SADDW Vd.2D, Vn.2D, Vm.2S - // 4x SADDW2 Vd.2D, Vn.2D, Vm.4S - // 8x SRSHL Vd.2D, Vn.2D, Vm.2D - // 4x UZP1 Vd.4S, Vn.4S, Vm.4S - // 2x SQXTN Vd.4H, Vn.4S - // 2x SQXTN2 Vd.8H, Vn.4S - // 2x SQADD Vd.8H, Vn.8H, Vm.8H - // 1x SQXTUN Vd.8B, Vn.8H - // 1x SQXTUN2 Vd.16B, Vn.8H - // 1x UMIN Vd.16B, Vn.16B, Vm.16B - // 1x UMAX Vd.16B, Vn.16B, Vm.16B - // --------------------- - // 42 instructions total - - vst1q_u8(output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c b/src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c deleted file mode 100644 index 2a744620fde..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__scalar_signed64( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const int64_t rounding = INT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point; - const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute full 64-bit product of signed 32-bit factors. - // - // Note: multiplier can be treated as either signed or unsigned. - const int64_t x_product = (int64_t) x * (int64_t) multiplier; - const int64_t y_product = (int64_t) y * (int64_t) multiplier; - const int64_t z_product = (int64_t) z * (int64_t) multiplier; - const int64_t w_product = (int64_t) w * (int64_t) multiplier; - - // Adjust product before subsequent shift with rounding up to simulate shift with rounding away from zero. - const int64_t x_adjusted_product = x_product - (int64_t) (x < 0); - const int64_t y_adjusted_product = y_product - (int64_t) (y < 0); - const int64_t z_adjusted_product = z_product - (int64_t) (z < 0); - const int64_t w_adjusted_product = w_product - (int64_t) (w < 0); - - // Arithmetically shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up. - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD). 
- const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift); - const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift); - const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift); - const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (uint8_t) x_biased; - output[1] = (uint8_t) y_biased; - output[2] = (uint8_t) z_biased; - output[3] = (uint8_t) w_biased; - output += 4; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c b/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c deleted file mode 100644 index 3c803e3b45c..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__scalar_unsigned32( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const uint32_t rounding_hi = (uint32_t) (rounding >> 32); - const uint32_t rounding_lo = (uint32_t) rounding; - const uint32_t shift_minus_32 = shift - 32; - const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point; - const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? 
(uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. - const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace - // 64-bit operations with 32-bit operations. - // - // To avoid full 64-bit addition we make use of three facts: - // - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set. - // - When 0x1.0p-32f <= scale < 0x1.0p-31f, then the non-zero bit in rounding is in the low 32 bits, and - // rounding is exactly 0x80000000 (2**31), because rounding is 2**(scale-1) and scale >= 32. In this case, - // addition of rounding can affect high 32 bits of the product only through overflow, which happens if - // low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition - // as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part - // of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the - // bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as - // (int32_t) (LOW(product) & LOW(rounding)) < 0 - // - When 0x1.0p-31f <= scale < 1.0f, then the non-zero bit is in the high 32 bits of rounding. We just need - // to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never - // overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(scale-1) <= 2**62. - // - // To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps: - // - Shift by 32, which can be implemented by extacting the high 32-bit word on 32-bit systems. - // - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result. - const uint32_t x_carry_lo = (uint32_t) ((int32_t)((uint32_t) x_product & rounding_lo) < 0); - const uint32_t y_carry_lo = (uint32_t) ((int32_t)((uint32_t) y_product & rounding_lo) < 0); - const uint32_t z_carry_lo = (uint32_t) ((int32_t)((uint32_t) z_product & rounding_lo) < 0); - const uint32_t w_carry_lo = (uint32_t) ((int32_t)((uint32_t) w_product & rounding_lo) < 0); - - const uint32_t x_product_hi = (uint32_t) (x_product >> 32); - const uint32_t y_product_hi = (uint32_t) (y_product >> 32); - const uint32_t z_product_hi = (uint32_t) (z_product >> 32); - const uint32_t w_product_hi = (uint32_t) (w_product >> 32); - - const uint32_t x_abs_scaled = (uint32_t) (x_product_hi + rounding_hi + x_carry_lo) >> shift_minus_32; - const uint32_t y_abs_scaled = (uint32_t) (y_product_hi + rounding_hi + y_carry_lo) >> shift_minus_32; - const uint32_t z_abs_scaled = (uint32_t) (z_product_hi + rounding_hi + z_carry_lo) >> shift_minus_32; - const uint32_t w_abs_scaled = (uint32_t) (w_product_hi + rounding_hi + w_carry_lo) >> shift_minus_32; - - // Copy the sign of input to scaled absolute input value. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? 
y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (uint8_t) x_biased; - output[1] = (uint8_t) y_biased; - output[2] = (uint8_t) z_biased; - output[3] = (uint8_t) w_biased; - output += 4; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c b/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c deleted file mode 100644 index cb6e9fd50c7..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__scalar_unsigned64( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point; - const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. 
- const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD). - const uint32_t x_abs_scaled = (uint32_t) ((x_product + rounding) >> shift); - const uint32_t y_abs_scaled = (uint32_t) ((y_product + rounding) >> shift); - const uint32_t z_abs_scaled = (uint32_t) ((z_product + rounding) >> shift); - const uint32_t w_abs_scaled = (uint32_t) ((w_product + rounding) >> shift); - - // Copy the sign of input to scaled absolute input value. - // - // On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (uint8_t) x_biased; - output[1] = (uint8_t) y_biased; - output[2] = (uint8_t) z_biased; - output[3] = (uint8_t) w_biased; - output += 4; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-sse2.c b/src/qu8-requantization/qu8-requantization-rndna-sse2.c deleted file mode 100644 index c4bf952d9ee..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-sse2.c +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
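A minimal sketch of the sign-magnitude workaround that the SSE2 kernel relies on: SSE2 lacks PABSD and PSIGND, so the absolute value is formed as (x ^ mask) - mask from a signed compare mask, and the same identity re-applies the recorded sign after the unsigned multiply-and-shift. The helper names below are hypothetical and an SSE2-capable x86 target is assumed.

#include <emmintrin.h>

// |x| per 32-bit lane without PABSD: mask is -1 where x < 0, so (x ^ -1) - (-1) == ~x + 1 == -x
// for negative lanes and x unchanged elsewhere.
static inline __m128i sse2_abs_epi32(__m128i x) {
  const __m128i neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
  return _mm_sub_epi32(_mm_xor_si128(x, neg_mask), neg_mask);
}

// Re-apply the sign recorded in neg_mask: identity for non-negative lanes, negation otherwise.
static inline __m128i sse2_copysign_epi32(__m128i magnitude, __m128i neg_mask) {
  return _mm_sub_epi32(_mm_xor_si128(magnitude, neg_mask), neg_mask);
}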
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__sse2( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x); - const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y); - const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z); - const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w); - - const __m128i x_abs0123 = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask); - const __m128i y_abs0123 = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask); - const __m128i z_abs0123 = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask); - const __m128i w_abs0123 = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = 
_mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sub_epi32(_mm_xor_si128(x_abs_scaled, x_neg_mask), x_neg_mask); - const __m128i y_scaled = _mm_sub_epi32(_mm_xor_si128(y_abs_scaled, y_neg_mask), y_neg_mask); - const __m128i z_scaled = _mm_sub_epi32(_mm_xor_si128(z_abs_scaled, z_neg_mask), z_neg_mask); - const __m128i w_scaled = _mm_sub_epi32(_mm_xor_si128(w_abs_scaled, w_neg_mask), w_neg_mask); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin); - - // 4x PXOR (setzero) - // 8x PSUBD - // 8x PXOR - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 2x PACKSSDW - // 1x PACKUSWB - // 2x PADDW - // 1x PMAXUB - // 1x PMINUB - // --------------------- - // 63 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-sse41.c b/src/qu8-requantization/qu8-requantization-rndna-sse41.c deleted file mode 100644 index 044725bfe70..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-sse41.c +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__sse41( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshiftlo = _mm_cvtsi32_si128((int) shift); - const __m128i vshifthi = _mm_cvtsi32_si128((int) shift - 32); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshiftlo); - const __m128i x_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(x_absmul13, vrounding), vshifthi); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshiftlo); - const __m128i y_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(y_absmul13, vrounding), vshifthi); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshiftlo); - const __m128i z_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(z_absmul13, vrounding), vshifthi); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshiftlo); - const __m128i w_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(w_absmul13, vrounding), vshifthi); - - const __m128i x_abs_scaled = _mm_blend_epi16(x_abs_scaled02, x_abs_scaled13, 0xCC); - const __m128i y_abs_scaled = _mm_blend_epi16(y_abs_scaled02, y_abs_scaled13, 0xCC); - const __m128i z_abs_scaled = _mm_blend_epi16(z_abs_scaled02, z_abs_scaled13, 0xCC); - const __m128i w_abs_scaled = _mm_blend_epi16(w_abs_scaled02, w_abs_scaled13, 
0xCC); - - const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin); - - // 4x PABSD - // 4x PSHUFD - // 8x PMULUDQ - // 4x PSRLQ - // 4x PSRLD - // 8x PADDQ - // 4x PBLENDW - // 4x PSIGND - // 2x PACKSSDW - // 1x PACKUSWB - // 2x PADDW - // 1x PMAXUB - // 1x PMINUB - // --------------------- - // 47 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-ssse3.c b/src/qu8-requantization/qu8-requantization-rndna-ssse3.c deleted file mode 100644 index 564a4155c44..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-ssse3.c +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__ssse3( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = 
_mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin); - - // 4x PABSD - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 4x PSIGND - // 2x PACKSSDW - // 1x PACKUSWB - // 2x PADDW - // 1x PMAXUB - // 1x PMINUB - // --------------------- - // 51 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-vadd/qu8-vadd-minmax.h b/src/qu8-vadd/qu8-vadd-minmax.h index 2e9658f7fe9..2cc23f195c0 100644 --- a/src/qu8-vadd/qu8-vadd-minmax.h +++ b/src/qu8-vadd/qu8-vadd-minmax.h @@ -36,9 +36,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vadd_minmax_ukernel__avx_mul32 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, 
xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vadd_minmax_ukernel__wasmsimd_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) diff --git a/src/qu8-vaddc/qu8-vaddc-minmax.h b/src/qu8-vaddc/qu8-vaddc-minmax.h index 08c23dd2a97..1d15a4d9774 100644 --- a/src/qu8-vaddc/qu8-vaddc-minmax.h +++ b/src/qu8-vaddc/qu8-vaddc-minmax.h @@ -36,9 +36,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vaddc_minmax_ukernel__avx_mul3 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) diff --git a/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c index d7a5347b72e..632ce7bba2d 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__avx_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c index a08379aebc7..cf2a42d9d9c 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include 
"xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__avx_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c index ea6d75990cf..f9d8bcebdec 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__avx_u8( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c index 5b704e6f655..b689a77d76b 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__neon_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c index 9115b9ac4f3..22d6538fee4 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__neon_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c index 52f922b5f97..613c460c9c2 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__neon_u8( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c index 5db6f72ece8..be9de0e3542 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__scalar_u1( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c index 3fec85a7850..c4fc01757d3 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__scalar_u2( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c index 5215d81cb6b..a77516d39a6 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__scalar_u4( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c index ea3eb20ab72..d80e34b285c 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c index 7f0dfff8124..a797aa7f4d9 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c @@ -12,7 +12,7 @@ #include #include 
"xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c index 0f61f80cdb4..ab6c9fd4703 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__sse41_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c index d77c2daffde..77abc2d5c2f 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__sse41_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c index 3a9274eb06f..f10beb22a06 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__sse41_u8( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c index 5a6009627f1..c5ab8780096 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c index 31d676297f5..ac6c3707eef 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c index ec8a6ad1f7a..87a80df6c00 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__wasmsimd_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c index ae26ff0d007..14e701ba098 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__wasmsimd_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c index 82c6e0140b1..b582ef3a129 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__wasmsimd_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c index 2c90579f979..749c1e52685 100644 --- 
a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__armsimd32_u4( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c index d217b4cd995..f3307de68f7 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__armsimd32_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c index e74c9e5efc0..368dd44b9c1 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c index 95992be39d9..e5066d5ac84 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c index fc7c4ec3841..7245be8b177 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c index 2a3e00ef751..de0700bdd11 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx2_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c index 4495449c4fc..3a1f2aa6f34 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx2_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c index c462e8f27b2..87e650b173d 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx2_u64( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c index a7a7ab3c052..ee89ae561b7 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" 
+#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__neon_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c index 4c7d391689d..74085803790 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__neon_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c index aedd5a1c8df..a5f21080e73 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__neon_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c index 8b8ede5ebc8..d37375fbc44 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__rvv_u1v( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c index 188e17eb953..4222ad94212 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__rvv_u2v( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c index 0c977ced365..bcc7e1784a1 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_andxor_u1( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c index 9c84e95cd25..6ee964d7599 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_andxor_u2( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c index b2c3dac2396..a8dcc85dfe4 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_andxor_u4( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c index 07bda31539f..ee2f92a2b32 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_select_u1( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c index b5b8175e641..b384959fa1a 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include 
"xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_select_u2( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c index 5a7060ca906..2271daee282 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_select_u4( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c index cda0c4f5583..495c1403e8f 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c index 650a3e1450c..82c69bc34e2 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c index 16192fa2f24..b0a0d4b3f41 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__sse41_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c index 4e75c18f848..079f62071be 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__sse41_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c index ffb29eeb65a..60461abe767 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__sse41_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c index 55fa1989df8..e963944edac 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c index 9dc7daa7070..472713d9c3d 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/qu8-vlrelu.h b/src/qu8-vlrelu/qu8-vlrelu.h index 5a404ada04f..ded476d9daf 100644 --- a/src/qu8-vlrelu/qu8-vlrelu.h +++ b/src/qu8-vlrelu/qu8-vlrelu.h @@ -17,59 +17,59 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u8, 8, false, 
uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u64, 64, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u64, 64, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, 
xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u4, 4, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u4, 4, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u1, 1, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u2, 2, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u4, 4, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u1, 1, false, uint8_t, union xnn_qu8_lrelu_minmax_params, 
xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u2, 2, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u4, 4, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u1, 1, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u2, 2, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u4, 4, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u1, 1, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u2, 2, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u4, 4, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/s32-f32-vcvt/s32-f32-vcvt.h b/src/s32-f32-vcvt/s32-f32-vcvt.h index 8d7c1d11a3e..b73438208ca 100644 --- a/src/s32-f32-vcvt/s32-f32-vcvt.h +++ b/src/s32-f32-vcvt/s32-f32-vcvt.h @@ -28,6 +28,9 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u8 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u16, 16, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u24, 24, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u32, 32, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_f32_vcvt_ukernel__avx512f_u16, 16, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_f32_vcvt_ukernel__avx512f_u32, 32, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_f32_vcvt_ukernel__avx512f_u48, 48, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) diff --git a/src/s32-vmul/s32-vmul.h b/src/s32-vmul/s32-vmul.h index 6ccbf827ec4..be5f1618418 100644 --- a/src/s32-vmul/s32-vmul.h +++ b/src/s32-vmul/s32-vmul.h @@ -29,11 +29,14 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u8, 8, fal XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u24, 24, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u32, 32, false, int32_t, struct xnn_s32_default_params, 
((xnn_init_s32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u32, 32, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u48, 48, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u64, 64, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_s32_vmul_ukernel__wasmsimd_u4, 4, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) diff --git a/src/s32-vmul/s32-vmulc.h b/src/s32-vmul/s32-vmulc.h index b478f7323e5..13e6f76ad99 100644 --- a/src/s32-vmul/s32-vmulc.h +++ b/src/s32-vmul/s32-vmulc.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u8, 8, fa XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u24, 24, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u32, 32, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmulc_ukernel__avx512f_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmulc_ukernel__avx512f_u32, 32, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmulc_ukernel__avx512f_u48, 48, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c new file mode 100644 index 00000000000..ce574eee512 --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
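+//
+// int8 clamp micro-kernel using the RISC-V Vector extension with LMUL=1:
+// each __riscv_vsetvl_e8m1() call picks the largest vector length that fits
+// the remaining batch, so a single strip-mined loop covers both the main
+// body and the tail without a separate remainder path.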
+ + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u1v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m1(batch); + vint8m1_t vacc = __riscv_vle8_v_i8m1(input, n); + vacc = __riscv_vmax_vx_i8m1(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m1(vacc, vmax, n); + __riscv_vse8_v_i8m1(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c new file mode 100644 index 00000000000..7b3d979c852 --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u2v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m2(batch); + vint8m2_t vacc = __riscv_vle8_v_i8m2(input, n); + vacc = __riscv_vmax_vx_i8m2(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m2(vacc, vmax, n); + __riscv_vse8_v_i8m2(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c new file mode 100644 index 00000000000..7da079a2e0d --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u4v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m4(batch); + vint8m4_t vacc = __riscv_vle8_v_i8m4(input, n); + vacc = __riscv_vmax_vx_i8m4(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m4(vacc, vmax, n); + __riscv_vse8_v_i8m4(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c new file mode 100644 index 00000000000..4ba23c5333a --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u8v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m8(batch); + vint8m8_t vacc = __riscv_vle8_v_i8m8(input, n); + vacc = __riscv_vmax_vx_i8m8(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m8(vacc, vmax, n); + __riscv_vse8_v_i8m8(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/rvv.c.in b/src/s8-vclamp/rvv.c.in new file mode 100644 index 00000000000..512b531de57 --- /dev/null +++ b/src/s8-vclamp/rvv.c.in @@ -0,0 +1,49 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
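+//
+// xngen template shared by the S8 and U8 clamp micro-kernels: the same body
+// is instantiated for LMUL = 1, 2, 4 and 8, and DATATYPE selects signed
+// (__riscv_vmax/vmin) or unsigned (__riscv_vmaxu/vminu) intrinsics.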
+ +$assert LMUL in [1, 2, 4, 8] +$assert DATATYPE in ["S8", "U8"] + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + +$XINT8_T = {"S8": "int8_t", "U8": "uint8_t"}[DATATYPE] + +void xnn_${DATATYPE.lower()}_vclamp_ukernel__rvv_u${LMUL}v( + size_t batch, + const ${XINT8_T}* input, + ${XINT8_T}* output, + const struct xnn_${DATATYPE.lower()}_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input != NULL); + assert(output != NULL); + + const ${XINT8_T} vmin = params->scalar.min; + const ${XINT8_T} vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m${LMUL}(batch); + $if DATATYPE == "S8": + vint8m${LMUL}_t vacc = __riscv_vle8_v_i8m${LMUL}(input, n); + vacc = __riscv_vmax_vx_i8m${LMUL}(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m${LMUL}(vacc, vmax, n); + __riscv_vse8_v_i8m${LMUL}(output, vacc, n); + $else: + vuint8m${LMUL}_t vacc = __riscv_vle8_v_u8m${LMUL}(input, n); + vacc = __riscv_vmaxu_vx_u8m${LMUL}(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m${LMUL}(vacc, vmax, n); + __riscv_vse8_v_u8m${LMUL}(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/s8-vclamp-avx2-u128.c b/src/s8-vclamp/s8-vclamp-avx2-u128.c new file mode 100644 index 00000000000..263a81d020c --- /dev/null +++ b/src/s8-vclamp/s8-vclamp-avx2-u128.c @@ -0,0 +1,104 @@ +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__avx2_u128( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m256i voutput_min = _mm256_set1_epi8(params->scalar.min); + const __m256i voutput_max = _mm256_set1_epi8(params->scalar.max); + XNN_FORCE_REALIZATION(voutput_min); + XNN_FORCE_REALIZATION(voutput_max); + + for (; batch >= 128; batch -= 128) { + __m256i vacc0 = _mm256_loadu_si256((const __m256i*) input); + __m256i vacc1 = _mm256_loadu_si256((const __m256i*) input + 1); + __m256i vacc2 = _mm256_loadu_si256((const __m256i*) input + 2); + __m256i vacc3 = _mm256_loadu_si256((const __m256i*) input + 3); + input += 128; + + vacc0 = _mm256_max_epi8(vacc0, voutput_min); + vacc1 = _mm256_max_epi8(vacc1, voutput_min); + vacc2 = _mm256_max_epi8(vacc2, voutput_min); + vacc3 = _mm256_max_epi8(vacc3, voutput_min); + + vacc0 = _mm256_min_epi8(vacc0, voutput_max); + vacc1 = _mm256_min_epi8(vacc1, voutput_max); + vacc2 = _mm256_min_epi8(vacc2, voutput_max); + vacc3 = _mm256_min_epi8(vacc3, voutput_max); + + _mm256_storeu_si256((__m256i*) output, vacc0); + _mm256_storeu_si256((__m256i*) output + 1, vacc1); + _mm256_storeu_si256((__m256i*) output + 2, vacc2); + _mm256_storeu_si256((__m256i*) output + 3, vacc3); + output += 128; + } + for (; batch >= 32; batch -= 32) { + __m256i vacc = _mm256_loadu_si256((const __m256i*) input); + input += 32; + + vacc = _mm256_min_epi8(vacc, voutput_max); + vacc = _mm256_max_epi8(vacc, voutput_min); + + _mm256_storeu_si256((__m256i*) output, vacc); + output += 32; + } + if 
(batch >= 16) { + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + input += 16; + + vacc = _mm_min_epi8(vacc, _mm256_castsi256_si128(voutput_max)); + vacc = _mm_max_epi8(vacc, _mm256_castsi256_si128(voutput_min)); + + _mm_storeu_si128((__m128i*) output, vacc); + output += 16; + batch -= 16; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 15); + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + vacc = _mm_min_epi8(vacc, _mm256_castsi256_si128(voutput_max)); + vacc = _mm_max_epi8(vacc, _mm256_castsi256_si128(voutput_min)); + + if (batch & 8) { + _mm_storel_epi64((__m128i*) output, vacc); + output += 8; + vacc = _mm_unpackhi_epi64(vacc, vacc); + } + if (batch & 4) { + unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vacc)); + output += 4; + vacc = _mm_srli_epi64(vacc, 32); + } + if (batch & 2) { + unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vacc)); + output += 2; + vacc = _mm_srli_epi32(vacc, 16); + } + if (batch & 1) { + *output = (int8_t) _mm_cvtsi128_si32(vacc); + } + } +} diff --git a/src/s8-vclamp/s8-vclamp-avx512skx-u256.c b/src/s8-vclamp/s8-vclamp-avx512skx-u256.c new file mode 100644 index 00000000000..abea2644840 --- /dev/null +++ b/src/s8-vclamp/s8-vclamp-avx512skx-u256.c @@ -0,0 +1,76 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__avx512skx_u256( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m512i voutput_min = _mm512_set1_epi8(params->scalar.min); + const __m512i voutput_max = _mm512_set1_epi8(params->scalar.max); + + for (; batch >= 256; batch -= 256) { + __m512i vacc0 = _mm512_loadu_si512((const __m512i*) input); + __m512i vacc1 = _mm512_loadu_si512((const __m512i*) input + 1); + __m512i vacc2 = _mm512_loadu_si512((const __m512i*) input + 2); + __m512i vacc3 = _mm512_loadu_si512((const __m512i*) input + 3); + input += 256; + + vacc0 = _mm512_max_epi8(vacc0, voutput_min); + vacc1 = _mm512_max_epi8(vacc1, voutput_min); + vacc2 = _mm512_max_epi8(vacc2, voutput_min); + vacc3 = _mm512_max_epi8(vacc3, voutput_min); + + vacc0 = _mm512_min_epi8(vacc0, voutput_max); + vacc1 = _mm512_min_epi8(vacc1, voutput_max); + vacc2 = _mm512_min_epi8(vacc2, voutput_max); + vacc3 = _mm512_min_epi8(vacc3, voutput_max); + + _mm512_storeu_si512((__m512i*) output, vacc0); + _mm512_storeu_si512((__m512i*) output + 1, vacc1); + _mm512_storeu_si512((__m512i*) output + 2, vacc2); + _mm512_storeu_si512((__m512i*) output + 3, vacc3); + output += 256; + } + for (; batch >= 64; batch -= 64) { + __m512i vacc = _mm512_loadu_si512((const __m512i*) input); + input += 64; + + vacc = _mm512_min_epi8(vacc, voutput_max); + vacc = _mm512_max_epi8(vacc, voutput_min); + + _mm512_storeu_si512((__m512i*) output, vacc); + output += 64; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 63); + const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << batch) - UINT64_C(1))); + __m512i vacc = _mm512_maskz_loadu_epi8(vmask, input); + + vacc = _mm512_min_epi8(vacc, voutput_max); + vacc = _mm512_max_epi8(vacc, 
voutput_min); + + _mm512_mask_storeu_epi8(output, vmask, vacc); + } +} diff --git a/src/s8-vclamp/s8-vclamp.h b/src/s8-vclamp/s8-vclamp.h index d4b601b627b..a7aa50b642f 100644 --- a/src/s8-vclamp/s8-vclamp.h +++ b/src/s8-vclamp/s8-vclamp.h @@ -23,8 +23,20 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_s8_vclamp_ukernel__neon_u64, 64, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__sse2_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_s8_vclamp_ukernel__sse41_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s8_vclamp_ukernel__avx2_u128, 128, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_s8_vclamp_ukernel__avx512skx_u256, 256, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__wasmsimd_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/subgraph.c b/src/subgraph.c index 1c6769b3bbf..cc26fbf4d0c 100644 --- a/src/subgraph.c +++ b/src/subgraph.c @@ -515,15 +515,6 @@ uint32_t xnn_check_nchw_compatibility(xnn_subgraph_t subgraph, struct xnn_node* case xnn_node_type_floor: case xnn_node_type_hardswish: case xnn_node_type_leaky_relu: - case xnn_node_type_static_mean: - case xnn_node_type_static_sum: - if (subgraph->values[node->inputs[0]].shape.num_dims == 4) { - return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW; - } else { - xnn_log_info("Node %s inputs shape is incompatible with sparse inference", - xnn_node_type_to_string(node->type)); - return 0; - } case xnn_node_type_negate: case xnn_node_type_sigmoid: case xnn_node_type_square: @@ -536,6 +527,15 @@ uint32_t xnn_check_nchw_compatibility(xnn_subgraph_t subgraph, struct xnn_node* xnn_node_type_to_string(node->type)); return 0; } + case xnn_node_type_static_mean: + case xnn_node_type_static_sum: + if (subgraph->values[node->inputs[0]].shape.num_dims == 4) { + return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC; + } else { + xnn_log_info("Node %s inputs shape is incompatible with sparse inference", + xnn_node_type_to_string(node->type)); + return 0; + } default: return false; } @@ -1433,6 +1433,8 @@ enum xnn_node_type xnn_binary_operator_to_node_type(enum 
xnn_binary_operator typ return xnn_node_type_copysign; case xnn_binary_squared_difference: return xnn_node_type_squared_difference; + case xnn_binary_prelu: + return xnn_node_type_prelu; case xnn_binary_minimum: return xnn_node_type_minimum2; case xnn_binary_maximum: @@ -1457,6 +1459,8 @@ enum xnn_binary_operator xnn_node_type_to_binary_operator(enum xnn_node_type typ return xnn_binary_copysign; case xnn_node_type_squared_difference: return xnn_binary_squared_difference; + case xnn_node_type_prelu: + return xnn_binary_prelu; case xnn_node_type_minimum2: return xnn_binary_minimum; case xnn_node_type_maximum2: diff --git a/src/subgraph/convert.c b/src/subgraph/convert.c index 16ccb7bd015..b722dee5e4b 100644 --- a/src/subgraph/convert.c +++ b/src/subgraph/convert.c @@ -64,7 +64,6 @@ static enum xnn_status create_convert_operator( status = xnn_create_convert_nc_f32_qs8( output_value->quantization.scale, (int8_t) output_value->quantization.zero_point, - INT8_MIN, INT8_MAX, node->flags, &opdata->operator_objects[0]); break; @@ -72,7 +71,6 @@ static enum xnn_status create_convert_operator( status = xnn_create_convert_nc_f32_qu8( output_value->quantization.scale, (uint8_t) output_value->quantization.zero_point, - 0, UINT8_MAX, node->flags, &opdata->operator_objects[0]); break; diff --git a/src/subgraph/deprecated.c b/src/subgraph/deprecated.c index 9c086313ddc..b5165d691cd 100644 --- a/src/subgraph/deprecated.c +++ b/src/subgraph/deprecated.c @@ -77,6 +77,13 @@ enum xnn_status xnn_define_copysign(xnn_subgraph_t subgraph, uint32_t input1_id, input2_id, output_id, flags); } +enum xnn_status xnn_define_prelu(xnn_subgraph_t subgraph, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + return xnn_define_binary(subgraph, xnn_binary_prelu, NULL, + input1_id, input2_id, output_id, flags); +} + enum xnn_status xnn_define_static_mean(xnn_subgraph_t subgraph, size_t num_reduction_axes, const size_t* reduction_axes, diff --git a/src/subgraph/prelu.c b/src/subgraph/prelu.c deleted file mode 100644 index d4688735f5b..00000000000 --- a/src/subgraph/prelu.c +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_prelu_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - const uint32_t input_id = node->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t slope_id = node->inputs[1]; - assert(slope_id != XNN_INVALID_VALUE_ID); - assert(slope_id < num_values); - - const void* slope_data = values[slope_id].fp32_data != NULL ? values[slope_id].fp32_data : values[slope_id].data; - assert(slope_data != NULL); - - assert(node->num_outputs == 1); - - const size_t num_slope_dims = values[slope_id].shape.num_dims; - const size_t slope_channels = num_slope_dims == 0 ? 
1 : values[slope_id].shape.dim[num_slope_dims - 1]; - - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t input_channels = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - - const uint32_t input1_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input1_value = &values[input1_id]; - enum xnn_status status; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_prelu_nc_f16( - input_channels, - slope_channels, - /*input_stride=*/input_channels, - /*output_stride=*/input_channels, - /*negative_slope=*/slope_data, - node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, - code_cache, - weights_cache, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_prelu_nc_f32( - input_channels, - slope_channels, - /*input_stride=*/input_channels, - /*output_stride=*/input_channels, - /*negative_slope=*/slope_data, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_prelu_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const struct xnn_value* input_value = values + input_id; - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&input_value->shape); - - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_prelu_nc_f16: - status = xnn_reshape_prelu_nc_f16( - opdata->operator_objects[0], - batch_size, - threadpool); - break; - case xnn_operator_type_prelu_nc_f32: - status = xnn_reshape_prelu_nc_f32( - opdata->operator_objects[0], - batch_size, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - struct xnn_value* output_value = values + output_id; - - memcpy(output_value->shape.dim, input_value->shape.dim, input_value->shape.num_dims * sizeof(size_t)); - const size_t new_size = xnn_tensor_get_size(output_value); - if (new_size > output_value->size || opdata->workspace_size > old_workspace_size) { - output_value->size = new_size; - return xnn_status_reallocation_required; - } - - return xnn_status_success; - -} - -static enum xnn_status setup_prelu_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_prelu_nc_f16: - return xnn_setup_prelu_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_prelu_nc_f32: - return xnn_setup_prelu_nc_f32( - opdata->operator_objects[0], - input_data, - 
output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_prelu( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t slope_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_prelu)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_prelu, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_prelu, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_prelu), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - if (slope_id >= subgraph->num_values) { - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": invalid Value ID", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id); - return xnn_status_invalid_parameter; - } - - const struct xnn_value* slope_value = &subgraph->values[slope_id]; - if (slope_value->type != xnn_value_type_dense_tensor) { - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": unsupported Value type %d (expected dense tensor)", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id, slope_value->type); - return xnn_status_invalid_parameter; - } - - if (slope_value->data == NULL) { - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": non-static Value", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id); - return xnn_status_invalid_parameter; - } - - switch (slope_value->datatype) { - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id, - xnn_datatype_to_string(slope_value->datatype), slope_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_prelu, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_prelu, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_prelu), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_prelu; - node->compute_type = compute_type; - node->num_inputs = 2; - node->inputs[0] = input_id; - 
node->inputs[1] = slope_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_prelu_operator; - node->reshape = reshape_prelu_operator; - node->setup = setup_prelu_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/rope.c b/src/subgraph/rope.c index 8d424b79981..3f6e3b64121 100644 --- a/src/subgraph/rope.c +++ b/src/subgraph/rope.c @@ -36,13 +36,11 @@ static enum xnn_status create_rope_operator( switch (input_value->datatype) { case xnn_datatype_fp16: status = xnn_create_rope_nthc_f16( - node->params.rope.max_tokens, /*flags=*/0, &opdata->operator_objects[0]); break; case xnn_datatype_fp32: status = xnn_create_rope_nthc_f32( - node->params.rope.max_tokens, /*flags=*/0, &opdata->operator_objects[0]); break; @@ -170,13 +168,6 @@ enum xnn_status xnn_define_rope( return status; } - if (max_tokens == 0) { - xnn_log_error( - "failed to define %s operator with %zu max tokens: maximum number of tokens must be non-zero", - xnn_node_type_to_string(xnn_node_type_rope), max_tokens); - return xnn_status_invalid_parameter; - } - status = xnn_subgraph_check_input_node_id(xnn_node_type_rope, input_id, subgraph->num_values); if (status != xnn_status_success) { return status; @@ -262,7 +253,6 @@ enum xnn_status xnn_define_rope( node->type = xnn_node_type_rope; node->compute_type = compute_type; - node->params.rope.max_tokens = max_tokens; node->num_inputs = 2; node->inputs[0] = input_id; node->inputs[1] = weights_id; diff --git a/src/subgraph/static-reduce.c b/src/subgraph/static-reduce.c index 0de434f5465..230b97297c4 100644 --- a/src/subgraph/static-reduce.c +++ b/src/subgraph/static-reduce.c @@ -326,15 +326,6 @@ enum xnn_status xnn_define_static_reduce( return xnn_status_invalid_parameter; } - for (size_t i = 0; i < num_reduction_axes; i++) { - if (reduction_axes[i] > input_value->shape.num_dims) { - xnn_log_error( - "failed to define %s operator with #%zu reduction axis of %zu: the index is out of bounds for a %zuD input shape", - xnn_node_type_to_string(node_type), i, reduction_axes[i], input_value->shape.num_dims); - return xnn_status_invalid_parameter; - } - } - for (size_t i = 1; i < num_reduction_axes; i++) { if (reduction_axes[i] <= reduction_axes[i - 1]) { xnn_log_error( diff --git a/src/u32-f32-vcvt/u32-f32-vcvt.h b/src/u32-f32-vcvt/u32-f32-vcvt.h index 1d9267fa4a2..2bb599a4377 100644 --- a/src/u32-f32-vcvt/u32-f32-vcvt.h +++ b/src/u32-f32-vcvt/u32-f32-vcvt.h @@ -31,6 +31,9 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_u32_f32_vcvt_ukernel__avx2_u3 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_u32_f32_vcvt_ukernel__avx512f_u16, 16, false, uint32_t, float, struct xnn_u32_f32_cvt_params, xnn_init_u32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_u32_f32_vcvt_ukernel__avx512f_u32, 32, false, uint32_t, float, struct xnn_u32_f32_cvt_params, xnn_init_u32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_u32_f32_vcvt_ukernel__avx512f_u48, 48, false, uint32_t, float, struct xnn_u32_f32_cvt_params, xnn_init_u32_f32_cvt_scalar_params) diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c new file mode 100644 index 00000000000..2d291a78f40 --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c @@ -0,0 +1,44 @@ +// Auto-generated file. 
Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u1v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m1(batch); + vuint8m1_t vacc = __riscv_vle8_v_u8m1(input, n); + vacc = __riscv_vmaxu_vx_u8m1(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m1(vacc, vmax, n); + __riscv_vse8_v_u8m1(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c new file mode 100644 index 00000000000..9a91840c067 --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u2v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m2(batch); + vuint8m2_t vacc = __riscv_vle8_v_u8m2(input, n); + vacc = __riscv_vmaxu_vx_u8m2(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m2(vacc, vmax, n); + __riscv_vse8_v_u8m2(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c new file mode 100644 index 00000000000..011146ec219 --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u4v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m4(batch); + vuint8m4_t vacc = __riscv_vle8_v_u8m4(input, n); + vacc = __riscv_vmaxu_vx_u8m4(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m4(vacc, vmax, n); + __riscv_vse8_v_u8m4(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c new file mode 100644 index 00000000000..5c782d823ff --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u8v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m8(batch); + vuint8m8_t vacc = __riscv_vle8_v_u8m8(input, n); + vacc = __riscv_vmaxu_vx_u8m8(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m8(vacc, vmax, n); + __riscv_vse8_v_u8m8(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/u8-vclamp-avx2-u128.c b/src/u8-vclamp/u8-vclamp-avx2-u128.c new file mode 100644 index 00000000000..0448807359a --- /dev/null +++ b/src/u8-vclamp/u8-vclamp-avx2-u128.c @@ -0,0 +1,104 @@ +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
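+//
+// uint8 clamp micro-kernel for AVX2: a 128-byte unrolled main loop, then
+// 32-byte and 16-byte loops, and a 1..15-byte tail that loads a full 16-byte
+// vector (hence XNN_OOB_READS) and stores it back in 8/4/2/1-byte pieces.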
+ +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__avx2_u128( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m256i voutput_min = _mm256_set1_epi8(params->scalar.min); + const __m256i voutput_max = _mm256_set1_epi8(params->scalar.max); + XNN_FORCE_REALIZATION(voutput_min); + XNN_FORCE_REALIZATION(voutput_max); + + for (; batch >= 128; batch -= 128) { + __m256i vacc0 = _mm256_loadu_si256((const __m256i*) input); + __m256i vacc1 = _mm256_loadu_si256((const __m256i*) input + 1); + __m256i vacc2 = _mm256_loadu_si256((const __m256i*) input + 2); + __m256i vacc3 = _mm256_loadu_si256((const __m256i*) input + 3); + input += 128; + + vacc0 = _mm256_max_epu8(vacc0, voutput_min); + vacc1 = _mm256_max_epu8(vacc1, voutput_min); + vacc2 = _mm256_max_epu8(vacc2, voutput_min); + vacc3 = _mm256_max_epu8(vacc3, voutput_min); + + vacc0 = _mm256_min_epu8(vacc0, voutput_max); + vacc1 = _mm256_min_epu8(vacc1, voutput_max); + vacc2 = _mm256_min_epu8(vacc2, voutput_max); + vacc3 = _mm256_min_epu8(vacc3, voutput_max); + + _mm256_storeu_si256((__m256i*) output, vacc0); + _mm256_storeu_si256((__m256i*) output + 1, vacc1); + _mm256_storeu_si256((__m256i*) output + 2, vacc2); + _mm256_storeu_si256((__m256i*) output + 3, vacc3); + output += 128; + } + for (; batch >= 32; batch -= 32) { + __m256i vacc = _mm256_loadu_si256((const __m256i*) input); + input += 32; + + vacc = _mm256_max_epu8(vacc, voutput_min); + vacc = _mm256_min_epu8(vacc, voutput_max); + + _mm256_storeu_si256((__m256i*) output, vacc); + output += 32; + } + if (batch >= 16) { + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + input += 16; + + vacc = _mm_max_epu8(vacc, _mm256_castsi256_si128(voutput_min)); + vacc = _mm_min_epu8(vacc, _mm256_castsi256_si128(voutput_max)); + + _mm_storeu_si128((__m128i*) output, vacc); + output += 16; + batch -= 16; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 15); + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + vacc = _mm_max_epu8(vacc, _mm256_castsi256_si128(voutput_min)); + vacc = _mm_min_epu8(vacc, _mm256_castsi256_si128(voutput_max)); + + if (batch & 8) { + _mm_storel_epi64((__m128i*) output, vacc); + output += 8; + vacc = _mm_unpackhi_epi64(vacc, vacc); + } + if (batch & 4) { + unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vacc)); + output += 4; + vacc = _mm_srli_epi64(vacc, 32); + } + if (batch & 2) { + unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vacc)); + output += 2; + vacc = _mm_srli_epi32(vacc, 16); + } + if (batch & 1) { + *output = (uint8_t) _mm_cvtsi128_si32(vacc); + } + } +} diff --git a/src/u8-vclamp/u8-vclamp-avx512skx-u256.c b/src/u8-vclamp/u8-vclamp-avx512skx-u256.c new file mode 100644 index 00000000000..fcd9c0afeb6 --- /dev/null +++ b/src/u8-vclamp/u8-vclamp-avx512skx-u256.c @@ -0,0 +1,76 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
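+//
+// uint8 clamp micro-kernel for AVX512SKX: 256-byte and 64-byte vector loops,
+// with the 1..63-byte tail handled by a single masked load/store whose lane
+// mask is built from the remaining batch size via _cvtu64_mask64().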
+ +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__avx512skx_u256( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m512i voutput_min = _mm512_set1_epi8(params->scalar.min); + const __m512i voutput_max = _mm512_set1_epi8(params->scalar.max); + + for (; batch >= 256; batch -= 256) { + __m512i vacc0 = _mm512_loadu_si512((const __m512i*) input); + __m512i vacc1 = _mm512_loadu_si512((const __m512i*) input + 1); + __m512i vacc2 = _mm512_loadu_si512((const __m512i*) input + 2); + __m512i vacc3 = _mm512_loadu_si512((const __m512i*) input + 3); + input += 256; + + vacc0 = _mm512_max_epu8(vacc0, voutput_min); + vacc1 = _mm512_max_epu8(vacc1, voutput_min); + vacc2 = _mm512_max_epu8(vacc2, voutput_min); + vacc3 = _mm512_max_epu8(vacc3, voutput_min); + + vacc0 = _mm512_min_epu8(vacc0, voutput_max); + vacc1 = _mm512_min_epu8(vacc1, voutput_max); + vacc2 = _mm512_min_epu8(vacc2, voutput_max); + vacc3 = _mm512_min_epu8(vacc3, voutput_max); + + _mm512_storeu_si512((__m512i*) output, vacc0); + _mm512_storeu_si512((__m512i*) output + 1, vacc1); + _mm512_storeu_si512((__m512i*) output + 2, vacc2); + _mm512_storeu_si512((__m512i*) output + 3, vacc3); + output += 256; + } + for (; batch >= 64; batch -= 64) { + __m512i vacc = _mm512_loadu_si512((const __m512i*) input); + input += 64; + + vacc = _mm512_min_epu8(vacc, voutput_max); + vacc = _mm512_max_epu8(vacc, voutput_min); + + _mm512_storeu_si512((__m512i*) output, vacc); + output += 64; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 63); + const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << batch) - UINT64_C(1))); + __m512i vacc = _mm512_maskz_loadu_epi8(vmask, input); + + vacc = _mm512_min_epu8(vacc, voutput_max); + vacc = _mm512_max_epu8(vacc, voutput_min); + + _mm512_mask_storeu_epi8(output, vmask, vacc); + } +} diff --git a/src/u8-vclamp/u8-vclamp.h b/src/u8-vclamp/u8-vclamp.h index 0aceaebd69a..90b0cf43be3 100644 --- a/src/u8-vclamp/u8-vclamp.h +++ b/src/u8-vclamp/u8-vclamp.h @@ -22,8 +22,20 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_u8_vclamp_ukernel__neon_u64, 64, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__sse2_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_u8_vclamp_ukernel__avx2_u128, 128, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_u8_vclamp_ukernel__avx512skx_u256, 256, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, 
xnn_u8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__wasmsimd_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/x32-packw/x32-packw.h b/src/x32-packw/x32-packw.h index 5517782d3d1..2a962d5f5d9 100644 --- a/src/x32-packw/x32-packw.h +++ b/src/x32-packw/x32-packw.h @@ -65,6 +65,9 @@ XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4, 16, 1, XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4_prfm, 16, 1, 1, 4, 1) XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16s4__avx_u4, 16, 1, 4, 4, 1) XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16s4__avx_u4_prfm, 16, 1, 4, 4, 1) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4, 16, 1, 1, 4, 1) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4_prfm, 16, 1, 1, 4, 1) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/x8-packw/kr-avxvnni.c.in b/src/x8-packw/kr-avxvnni.c.in index 7300efd09e8..685a9cd5c7e 100644 --- a/src/x8-packw/kr-avxvnni.c.in +++ b/src/x8-packw/kr-avxvnni.c.in @@ -36,7 +36,7 @@ void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${K const void* scale, ${WTYPE}* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -107,69 +107,18 @@ void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${K // KC remainder of 1..${KR-1} if (k != 0) { assert(k >= 1 && k <= ${KR-1}); - $for N in range(0, NR, 4): - __m256i v${N} = _mm256_setzero_si256(); - - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N}, 0); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+1}, 2); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+2}, 4); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+3}, 6); - $for N in range(NR): - w${N} += 4; - } - if (k & 2) { - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 2); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 6); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 10); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 14); - } else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 0); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 4); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 8); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 12); - } - $for N in range(NR): - w${N} += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 6); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 14); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 22); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 30); - } 
- else if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 4); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 12); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 20); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 28); - } - else if (k & 2) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 2); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 10); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 18); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 26); - } - else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 0); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 8); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 16); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 24); - } + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); - $for N in range(NR): - w${N} += 1; - } + $for N in range(0, NR, 4): + __m256i v${N} = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N})); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+1})), 0x0C); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+2})), 0x30); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+3})), 0xC0); + v${N} = _mm256_and_si256(v${N}, vmask); + + $for N in range(NR): + w${N} += k; $for N in range(0, NR, 4): vacc${N} = ${_MM256_DPBUSD_EPI32}(vacc${N}, vone, v${N}); @@ -259,69 +208,18 @@ void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${K // KC remainder of 1..${KR-1} if (k != 0) { assert(k >= 1 && k <= ${KR-1}); - $for N in range(0, NR, 4): - __m256i v${N} = _mm256_setzero_si256(); - - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N}, 0); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+1}, 2); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+2}, 4); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+3}, 6); - $for N in range(NR): - w${N} += 4; - } - if (k & 2) { - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 2); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 6); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 10); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 14); - } else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 0); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 4); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 8); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 12); - } - $for N in range(NR): - w${N} += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 6); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 14); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 22); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 30); - } - else if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 4); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 12); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 20); - v${N} = 
_mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 28); - } - else if (k & 2) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 2); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 10); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 18); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 26); - } - else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 0); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 8); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 16); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 24); - } + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); - $for N in range(NR): - w${N} += 1; - } + $for N in range(0, NR, 4): + __m256i v${N} = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N})); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+1})), 0x0C); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+2})), 0x30); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+3})), 0xC0); + v${N} = _mm256_and_si256(v${N}, vmask); + + $for N in range(NR): + w${N} += k; $for N in range(0, NR, 4): vacc${N} = ${_MM256_DPBUSD_EPI32}(vacc${N}, vone, v${N}); diff --git a/src/x8-packw/kr-wasmdot.c.in b/src/x8-packw/kr-wasmdot.c.in new file mode 100644 index 00000000000..4b5bd727730 --- /dev/null +++ b/src/x8-packw/kr-wasmdot.c.in @@ -0,0 +1,247 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert NR == 8 +$assert KR == 8 +$assert TYPE in ["int8_t"] +$assert IZP in [0, 128] + +#include + +#include + +#include "xnnpack/packw.h" + + +$ABC = "012345678" +$BTYPE = {"int8_t": "uint32_t"}[TYPE] +$WTYPE = {"int8_t": "int8_t"}[TYPE] +void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${KR}__wasmrelaxedsimd( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + const ${WTYPE}* weights, + const int32_t* bias, + const void* scale, + ${WTYPE}* packed_weights, + size_t extra_bytes, + const void* params) XNN_OOB_READS +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == ${NR}); + assert(kr == ${KR}); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + const v128_t vone = wasm_i8x16_splat(1); + const v128_t vzero = wasm_i32x4_splat(0); + XNN_FORCE_REALIZATION(vone); + XNN_FORCE_REALIZATION(vzero); + ${TYPE}* out = (${TYPE}*) packed_weights; + const ${BTYPE}* b = (const ${BTYPE}*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + ${IZP}): ${IZP}); + v128_t vzeropoint = wasm_i32x4_splat((int32_t) izp); + + do { + // NC main loop multiple of ${NR} + const ${TYPE}* w0 = (const ${TYPE}*) weights; + size_t n = nc; + for (;n >= ${NR}; n -= ${NR}) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + $for N in range(0, NR, 4): + const v128_t vb${N>>2} = wasm_v128_load(b + ${N}); + wasm_v128_store(out + ${N * 4}, vb${N>>2}); + b += ${NR}; + } else { + $for N in range(0, NR, 4): + wasm_v128_store(out + ${N * 4}, vzero); + } + out += ${NR} * sizeof(${BTYPE}); + + $for N in range(1, NR): + const ${TYPE}* w${N} = w${N-1} + kc; + + $for N in range(0, NR, 2): + v128_t vacc${ABC[N:N+2]} = wasm_i32x4_splat(0); + + // KC main loop multiple of ${NR}x${KR} + size_t k = kc; + for (; k >= ${2 * KR}; k -= ${2 * KR}) { + $for N in range(NR): + v128_t v${N}_01 = wasm_v128_load(w${N}); + + $for N in range(0, NR, 2): + v128_t v${ABC[N:N+2]}_0 = wasm_i64x2_shuffle(v${N}_01, v${N+1}_01, 0, 2); + v128_t v${ABC[N:N+2]}_1 = wasm_i64x2_shuffle(v${N}_01, v${N+1}_01, 1, 3); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}_0, vone, vacc${ABC[N:N+2]}); + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}_1, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}_0); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${(N + 8) * KR}, v${ABC[N:N+2]}_1); + + $for N in range(NR): + w${N} += ${2 * KR}; + out += ${2*NR*KR}; + } + + for (; k >= ${KR}; k -= ${KR}) { + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + const v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + $for N in range(NR): + w${N} += ${KR}; + out += ${NR*KR}; + } + + // KC remainder 1..KR-1 + if (k != 0) { + assert(k >= 1 && k <= ${KR-1}); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (${KR} - k) * sizeof(${WTYPE}) * 8); + + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + v${ABC[N:N+2]} = wasm_v128_and(v${ABC[N:N+2]}, vmask); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + $for N in range(NR): + w${N} += k; + out += ${NR*KR}; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (${TYPE}*) ((uintptr_t) out + extra_bytes); + w0 = w${NR-1}; + } 
+ + // NC remainder (1..${NR-1}) + if XNN_UNLIKELY(n != 0) { + assert(n >= 1 && n <= ${NR-1}); + + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((${BTYPE}*) out) = *b++; + out += sizeof(${BTYPE}); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((${BTYPE}*) out) = 0; + out += sizeof(${BTYPE}); + } while (--nb != 0); + } + out += (${NR} - n) * sizeof(${BTYPE}); + + $for N in range(1, NR): + const ${TYPE}* w${N} = w${N-1} + kc; + $if N % 2 == 0: + if XNN_UNPREDICTABLE(n <= ${N}) { + w${N} = w${N-1}; + } + $else: + if XNN_UNPREDICTABLE(n < ${N+1}) { + w${N} = w${N-1}; + } + + $for N in range(0, NR, 2): + v128_t vacc${ABC[N:N+2]} = wasm_i32x4_splat(0); + + // KC main loop multiple of ${NR}x${KR} + size_t k = kc; + for (; k >= ${KR}; k -= ${KR}) { + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + const v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + $for N in range(NR): + w${N} += ${KR}; + out += ${NR*KR}; + } + + // KC remainder of 1..${KR-1} + if (k != 0) { + assert(k >= 1 && k <= ${KR-1}); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (${KR} - k) * sizeof(${WTYPE}) * 8); + + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + v${ABC[N:N+2]} = wasm_v128_and(v${ABC[N:N+2]}, vmask); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + out += ${NR*KR}; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (${TYPE}*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h index 55508bec2d9..68a8c005a4a 100644 --- a/src/xnnpack/avgpool.h +++ b/src/xnnpack/avgpool.h @@ -46,7 +46,7 @@ extern "C" { size_t output_increment, \ const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-avgpool/f16-avgpool-minmax.h" +#include "f16-avgpool/f16-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -78,7 +78,7 @@ extern "C" { size_t output_increment, \ const struct xnn_f32_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-avgpool/f32-avgpool-minmax.h" +#include "f32-avgpool/f32-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -111,7 +111,7 @@ extern "C" { size_t output_increment, \ const union 
xnn_qu8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-avgpool/qu8-avgpool-minmax.h" +#include "qu8-avgpool/qu8-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index 18eb152e72c..7a257df9034 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1464,23 +1464,6 @@ struct reduce_context { size_t output2_block_size); #endif -struct prelu_context { - size_t n; - const void* x; - size_t x_stride; - const void* w; - void* y; - size_t y_stride; - xnn_prelu_ukernel_fn ukernel; -}; - -#ifndef __cplusplus - XNN_PRIVATE void xnn_compute_prelu( - const struct prelu_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_start, - size_t batch_range); -#endif - struct vmulcaddc_context { size_t n; const void* x; diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index 7abec0bccb4..3d9e5f9b309 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -236,6 +236,8 @@ struct xnn_dwconv_config { uint8_t last_tile; }; +// Bilinear interpolation (2D). + struct xnn_ibilinear_config { xnn_ibilinear_ukernel_fn ukernel; // Number of output pixels in a tile. @@ -243,7 +245,7 @@ struct xnn_ibilinear_config { uint8_t pixel_tile; }; -// Bilinear interpolation (2D). +// Bilinear interpolation (2D) in CHW layout. struct xnn_ibilinear_chw_config { xnn_ibilinear_chw_ukernel_fn ukernel; @@ -252,18 +254,6 @@ struct xnn_ibilinear_chw_config { uint8_t channel_tile; }; -// Bilinear interpolation (2D) in CHW layout. - -struct xnn_prelu_config { - xnn_prelu_ukernel_fn ukernel; - // Number of rows in a tile. - // For best efficiency, micro-kernel must process a multiple of this number of rows in each call. - uint16_t row_tile; - // Number of channels in a tile. - // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. 
- uint16_t channel_tile; -}; - struct xnn_gemm_config { struct gemm_fused_ukernels minmax; struct gemm_fused_ukernels relu; diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 59eaf94ced5..67c7ee16bc5 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -30,6 +30,7 @@ XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vdiv_confi XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vmax_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vmin_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vprelu_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vsub_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vsqrdiff_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vadd_config(); @@ -39,6 +40,7 @@ XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vdiv_confi XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmax_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmin_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vprelu_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vsub_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vsqrdiff_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qs8_vadd_config(); @@ -155,9 +157,6 @@ XNN_INTERNAL const struct xnn_ibilinear_config* xnn_init_u8_ibilinear_config(); XNN_INTERNAL const struct xnn_ibilinear_chw_config* xnn_init_f16_ibilinear_chw_config(); XNN_INTERNAL const struct xnn_ibilinear_chw_config* xnn_init_f32_ibilinear_chw_config(); -XNN_INTERNAL const struct xnn_prelu_config* xnn_init_f16_prelu_config(); -XNN_INTERNAL const struct xnn_prelu_config* xnn_init_f32_prelu_config(); - static inline struct xnn_hmp_dqgemm_ukernel xnn_init_hmp_dqgemm_ukernel( xnn_dqgemm_ukernel_fn function) { struct xnn_hmp_dqgemm_ukernel ukernel = {{ function }}; diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h index a4c23e345cb..5ecaaec7d69 100644 --- a/src/xnnpack/dwconv.h +++ b/src/xnnpack/dwconv.h @@ -31,14 +31,14 @@ extern "C" { size_t input_offset, \ const datatype* zero, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-dwconv/f16-dwconv-minmax-unipass.h" -#include "src/f32-dwconv/f32-dwconv-minmax-unipass.h" -#include "src/f32-dwconv/f32-dwconv-unipass.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" +#include "f16-dwconv/f16-dwconv-minmax-unipass.h" +#include "f32-dwconv/f32-dwconv-minmax-unipass.h" +#include "f32-dwconv/f32-dwconv-unipass.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" #undef XNN_DWCONV_UNIPASS @@ -56,14 +56,14 @@ extern "C" { size_t kernel_size, \ 
buffer_type* buffer, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-dwconv/f16-dwconv-minmax-multipass.h" -#include "src/f32-dwconv/f32-dwconv-minmax-multipass.h" -#include "src/f32-dwconv/f32-dwconv-multipass.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" +#include "f16-dwconv/f16-dwconv-minmax-multipass.h" +#include "f32-dwconv/f32-dwconv-minmax-multipass.h" +#include "f32-dwconv/f32-dwconv-multipass.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" #undef XNN_DWCONV_MULTIPASS #define DECLARE_F32_DWCONV2D_CHW_MINMAX_UKERNEL_FUNCTION(fn_name) \ diff --git a/src/xnnpack/fill.h b/src/xnnpack/fill.h index 2c807407866..2698d0c671e 100644 --- a/src/xnnpack/fill.h +++ b/src/xnnpack/fill.h @@ -22,7 +22,7 @@ extern "C" { void* output, \ size_t output_stride, \ const uint32_t fill_pattern); -#include "src/xx-fill/xx-fill.h" +#include "xx-fill/xx-fill.h" #undef XNN_FILL_UKERNEL #ifdef __cplusplus diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h index 12ed000d3de..e5e71f852bb 100644 --- a/src/xnnpack/indirection.h +++ b/src/xnnpack/indirection.h @@ -13,6 +13,7 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" #ifdef __cplusplus diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h index d803fc50cbc..38100203cfa 100644 --- a/src/xnnpack/maxpool.h +++ b/src/xnnpack/maxpool.h @@ -31,7 +31,7 @@ extern "C" { size_t output_increment, \ const union xnn_f16_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-maxpool/f16-maxpool-minmax.h" +#include "f16-maxpool/f16-maxpool-minmax.h" #undef XNN_UKERNEL @@ -47,7 +47,7 @@ extern "C" { size_t output_increment, \ const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-maxpool/f32-maxpool-minmax.h" +#include "f32-maxpool/f32-maxpool-minmax.h" #undef XNN_UKERNEL @@ -64,7 +64,7 @@ extern "C" { const struct xnn_u8_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/u8-maxpool/u8-maxpool-minmax.h" +#include "u8-maxpool/u8-maxpool-minmax.h" #undef XNN_UKERNEL @@ -81,7 +81,7 @@ extern "C" { const struct xnn_s8_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/s8-maxpool/s8-maxpool-minmax.h" +#include "s8-maxpool/s8-maxpool-minmax.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index aacb40ff5ef..1705904ee9c 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -11,9 +11,9 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" - /****************** Microkernel pointers for dense inference *****************/ // CONV-HWC: direct CONVolution in HWC layout @@ -790,35 +790,6 @@ typedef void (*xnn_f32_vmulcaddc_ukernel_fn)( size_t output_stride, const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -// PRELU: Parametric RELU - -typedef void (*xnn_prelu_ukernel_fn)( - size_t batch, - size_t 
channels, - const void* input, - size_t input_stride, - const void* weights, - void* output, - size_t output_stride); - -typedef void (*xnn_f16_prelu_ukernel_fn)( - size_t batch, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* weights, - xnn_float16* output, - size_t output_stride); - -typedef void (*xnn_f32_prelu_ukernel_fn)( - size_t batch, - size_t channels, - const float* input, - size_t input_stride, - const float* weights, - float* output, - size_t output_stride); - // IBILINEAR: Indirect BILINEAR interpolation typedef void (*xnn_ibilinear_ukernel_fn)( @@ -2528,16 +2499,12 @@ typedef size_t (*xnn_init_binary_params_fn)( typedef size_t (*xnn_init_f16_qs8_cvt_params_fn)( struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max); + int8_t output_zero_point); typedef size_t (*xnn_init_f32_qs8_cvt_params_fn)( struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max); + int8_t output_zero_point); typedef size_t (*xnn_init_qs8_reduce_minmax_params_fn)( struct xnn_qs8_reduce_minmax_params params[XNN_MIN_ELEMENTS(1)], @@ -2556,9 +2523,7 @@ typedef size_t (*xnn_init_qu8_reduce_minmax_params_fn)( typedef size_t (*xnn_init_f32_qu8_cvt_params_fn)( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max); + uint8_t output_zero_point); typedef size_t (*xnn_init_s32_f32_cvt_params_fn)( struct xnn_s32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index ea04ccde182..7363f4064ce 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -10,9 +10,9 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" - #ifdef __cplusplus extern "C" { #endif @@ -478,23 +478,17 @@ DECLARE_INIT_QS8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qs8_mul_minmax_scalar_param XNN_INTERNAL size_t xnn_init_f16_qs8_cvt_scalar_params( struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, - int8_t zero_point, - int8_t output_min, - int8_t output_max); + int8_t zero_point); XNN_INTERNAL size_t xnn_init_f32_qs8_cvt_scalar_params( struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - int8_t zero_point, - int8_t output_min, - int8_t output_max); + int8_t zero_point); XNN_INTERNAL size_t xnn_init_f32_qu8_cvt_scalar_params( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - uint8_t zero_point, - uint8_t output_min, - uint8_t output_max); + uint8_t zero_point); XNN_INTERNAL size_t xnn_init_s32_f32_cvt_scalar_params( struct xnn_s32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index d939f36694c..7ff41529af6 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -590,8 +590,6 @@ struct xnn_f16_qs8_cvt_params { struct { xnn_float16 scale; int16_t output_zero_point; - int8_t output_min; - int8_t output_max; } scalar; }; @@ -599,8 +597,6 @@ struct xnn_f32_qs8_cvt_params { struct { float scale; int16_t output_zero_point; - int8_t output_min; - int8_t output_max; } scalar; }; @@ -608,8 +604,6 @@ struct xnn_f32_qu8_cvt_params { struct { float scale; int16_t output_zero_point; - uint8_t output_min; - uint8_t output_max; } scalar; }; diff --git 
a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 813cc03a8f7..0fbb6c36670 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -150,7 +150,6 @@ struct xnn_operator { size_t group_input_channels; size_t group_output_channels; size_t channels; - size_t max_tokens; uint32_t pad_value; @@ -333,7 +332,6 @@ struct xnn_operator { }; }; // For softmax operator. const struct xnn_maxpool_config* maxpool_config; - const struct xnn_prelu_config* prelu_config; const struct xnn_unpool_config* unpool_config; const struct xnn_zip_config* zip_config; struct { @@ -392,7 +390,6 @@ struct xnn_operator { struct max_pooling_context max_pooling; struct pad_context pad; struct pixelwise_average_pooling_context pixelwise_average_pooling; - struct prelu_context prelu; struct reduce_context reduce; struct { struct resize_bilinear_context resize_bilinear; diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h index 5d50d12cb79..6f8f4e9277d 100644 --- a/src/xnnpack/pack.h +++ b/src/xnnpack/pack.h @@ -13,10 +13,10 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" - #ifdef __cplusplus extern "C" { #endif @@ -1358,31 +1358,6 @@ XNN_INTERNAL void xnn_pack_f32_to_f16_vmulcaddc_w( const void* params); -// Pack functions for prelu weights. -typedef void (*xnn_pack_prelu_w_fn)( - size_t input_channels, - size_t slope_channels, - const void* slope_data, - void* packed_weights); - -XNN_INTERNAL void xnn_pack_f32_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* slope_data, - float* packed_weights); - -XNN_INTERNAL void xnn_pack_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const uint16_t* slope_data, - uint16_t* packed_weights); - -XNN_INTERNAL void xnn_pack_f32_to_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* slope_data, - xnn_float16* packed_weights); - // Sparse packing functions. 
struct xnn_spmm_packing_params { diff --git a/src/xnnpack/packb.h b/src/xnnpack/packb.h index 88a2d2896f7..b0b662f384a 100644 --- a/src/xnnpack/packb.h +++ b/src/xnnpack/packb.h @@ -25,7 +25,7 @@ extern "C" { size_t channel_subtile_stride, \ const struct xnn_x32_packb_params params [XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); \ -#include "src/x32-packb/x32-packb.h" +#include "x32-packb/x32-packb.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/packq.h b/src/xnnpack/packq.h index abc93399eb6..07af36c7b1c 100644 --- a/src/xnnpack/packq.h +++ b/src/xnnpack/packq.h @@ -148,7 +148,7 @@ XNN_INLINE static float xnn_x8_packq_f32qp8_get_dequantized( const float* XNN_RESTRICT lhs, size_t lhs_stride, \ void* XNN_RESTRICT lhs_packed); -#include "src/x8-packq/x8-packq.h" +#include "x8-packq/x8-packq.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/packw.h b/src/xnnpack/packw.h index 013295f593a..cf62f60c640 100644 --- a/src/xnnpack/packw.h +++ b/src/xnnpack/packw.h @@ -31,7 +31,7 @@ extern "C" { size_t extra_bytes, \ const void* params); -#include "src/x8-packw/x8-packw.h" +#include "x8-packw/x8-packw.h" #undef XNN_UKERNEL @@ -50,7 +50,7 @@ extern "C" { size_t extra_bytes, \ const void* params); -#include "src/qs8-packw/qs8-packw.h" +#include "qs8-packw/qs8-packw.h" #undef XNN_QS8_UKERNEL @@ -69,7 +69,7 @@ extern "C" { size_t extra_bytes, \ const void* params); \ -#include "src/x16-packw/x16-packw.h" +#include "x16-packw/x16-packw.h" #undef XNN_UKERNEL @@ -88,7 +88,7 @@ extern "C" { size_t extra_bytes, \ const void* params); \ -#include "src/x32-packw/x32-packw.h" +#include "x32-packw/x32-packw.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/packx.h b/src/xnnpack/packx.h index c08428f4e1d..c6a24e09106 100644 --- a/src/xnnpack/packx.h +++ b/src/xnnpack/packx.h @@ -23,7 +23,7 @@ extern "C" { size_t x_stride, \ uint32_t* y); -#include "src/x32-packx/x32-packx.h" +#include "x32-packx/x32-packx.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/pad.h b/src/xnnpack/pad.h index db0098ecedb..227c4ca75de 100644 --- a/src/xnnpack/pad.h +++ b/src/xnnpack/pad.h @@ -26,7 +26,7 @@ extern "C" { void* output, \ size_t output_stride, \ const uint32_t fill_pattern); -#include "src/xx-pad/xx-pad.h" +#include "xx-pad/xx-pad.h" #undef XNN_PAD_UKERNEL diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h index e45595de9dd..c1433914ec5 100644 --- a/src/xnnpack/pavgpool.h +++ b/src/xnnpack/pavgpool.h @@ -45,7 +45,7 @@ extern "C" { size_t output_increment, \ const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-pavgpool/f16-pavgpool-minmax.h" +#include "f16-pavgpool/f16-pavgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -80,7 +80,7 @@ extern "C" { size_t output_increment, \ const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-pavgpool/f32-pavgpool-minmax.h" +#include "f32-pavgpool/f32-pavgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h deleted file mode 100644 index 2057d4a2b1d..00000000000 --- a/src/xnnpack/prelu.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_F16_PRELU_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const xnn_float16* input, \ - size_t input_stride, \ - const xnn_float16* weights, \ - xnn_float16* output, \ - size_t output_stride); - -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__neonfp16arith_2x8) -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__neonfp16arith_2x16) - -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__f16c_2x8) -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__f16c_2x16) - - -#define DECLARE_F32_PRELU_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const float* input, \ - size_t input_stride, \ - const float* weights, \ - float* output, \ - size_t output_stride); - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse_2x8) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse2_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse2_2x8) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse41_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse41_2x8) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx_2x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx512f_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx512f_2x32) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16) 
-DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasm_2x1) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasm_2x4) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x1) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x4) - - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h index 02a47cc3ec6..3a23770f067 100644 --- a/src/xnnpack/requantization-stubs.h +++ b/src/xnnpack/requantization-stubs.h @@ -48,14 +48,6 @@ DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__sse41) DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__ssse3) DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__wasmsimd) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__neon) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__scalar_signed64) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__scalar_unsigned32) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__scalar_unsigned64) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__sse2) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__sse41) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__ssse3) - typedef void (*xnn_qs8_requantization_fn)( size_t n, @@ -90,14 +82,6 @@ DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_gemmlowp__sse41) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_gemmlowp__ssse3) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_gemmlowp__wasmsimd) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__neon) 
-DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__scalar_signed64) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__scalar_unsigned32) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__scalar_unsigned64) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__sse2) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__sse41) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__ssse3) - DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndnu__neon_mull) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndnu__neon_qdmulh) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndnu__scalar) diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h index 1cc63dbe652..cc2af0ec0bf 100644 --- a/src/xnnpack/requantization.h +++ b/src/xnnpack/requantization.h @@ -74,82 +74,6 @@ static inline uint8_t xnn_qu8_requantize_fp32( return (uint8_t) output; } -static inline int8_t xnn_qs8_requantize_rndna( - int32_t input, - float scale, - int8_t zero_point, - int8_t min, - int8_t max) -{ - assert(scale >= 1.0f / 4294967296.0f /* 0x1.0p-32f */); - assert(scale < 256.0f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 16); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t min_less_zero_point = (int32_t) min - (int32_t) zero_point; - const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point; - - uint32_t abs_input = (uint32_t) input; - if (input < 0) { - abs_input = -abs_input; - } - - const uint64_t abs_prescaled_input = (uint64_t) abs_input * (uint64_t) multiplier; - const uint32_t abs_scaled_input = (uint32_t) ((abs_prescaled_input + rounding) >> shift); - - int32_t output = (int32_t) abs_scaled_input; - if (input < 0) { - output = -output; - } - - output = math_max_s32(output, min_less_zero_point); - output = math_min_s32(output, max_less_zero_point); - return (int8_t) (output + (int32_t) zero_point); -} - -static inline uint8_t xnn_qu8_requantize_rndna( - int32_t input, - float scale, - uint8_t zero_point, - uint8_t min, - uint8_t max) -{ - assert(scale >= 1.0f / 4294967296.0f /* 0x1.0p-32f */); - assert(scale < 256.0f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 16); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t min_less_zero_point = (int32_t) min - (int32_t) zero_point; - const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point; - - uint32_t abs_input = (uint32_t) input; - if (input < 0) { - abs_input = -abs_input; - } - - const uint64_t abs_prescaled_input = (uint64_t) abs_input * (uint64_t) multiplier; - const uint32_t abs_scaled_input = (uint32_t) ((abs_prescaled_input + rounding) >> shift); - - int32_t output = (int32_t) abs_scaled_input; - if (input < 0) { - output = -output; - } - - output = math_max_s32(output, min_less_zero_point); - output = math_min_s32(output, max_less_zero_point); - return (uint8_t) (output + (int32_t) zero_point); -} - static inline int8_t xnn_qs8_requantize_rndnu( int32_t input, float scale, diff --git a/src/xnnpack/subgraph.h b/src/xnnpack/subgraph.h index e8000c085b8..d60e34bad28 100644 --- 
a/src/xnnpack/subgraph.h +++ b/src/xnnpack/subgraph.h @@ -319,9 +319,6 @@ struct xnn_node { size_t new_height; size_t new_width; } static_resize; - struct { - size_t max_tokens; - } rope; struct { size_t num_dims; size_t offsets[XNN_MAX_TENSOR_DIMS]; diff --git a/src/xnnpack/transpose.h b/src/xnnpack/transpose.h index eebabcc4b50..a10c4b94ebc 100644 --- a/src/xnnpack/transpose.h +++ b/src/xnnpack/transpose.h @@ -25,7 +25,7 @@ extern "C" { size_t element_size, \ size_t block_width, \ size_t block_height); -#include "src/xx-transposev/xx-transposev.h" +#include "xx-transposev/xx-transposev.h" #undef XNN_TRANSPOSE_UKERNEL #define XNN_TRANSPOSE_UKERNEL(arch_flags, fn_name, datasize, datatype, ...) \ @@ -36,11 +36,11 @@ extern "C" { size_t output_stride, \ size_t block_width, \ size_t block_height); -#include "src/x8-transposec/x8-transposec.h" -#include "src/x16-transposec/x16-transposec.h" -#include "src/x24-transposec/x24-transposec.h" -#include "src/x32-transposec/x32-transposec.h" -#include "src/x64-transposec/x64-transposec.h" +#include "x8-transposec/x8-transposec.h" +#include "x16-transposec/x16-transposec.h" +#include "x24-transposec/x24-transposec.h" +#include "x32-transposec/x32-transposec.h" +#include "x64-transposec/x64-transposec.h" #undef XNN_TRANSPOSE_UKERNEL #ifdef __cplusplus diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h index 2479e2e182c..8a9bdf065e8 100644 --- a/src/xnnpack/vbinary.h +++ b/src/xnnpack/vbinary.h @@ -24,26 +24,26 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const xnn_float16* a, const xnn_float16* b, xnn_float16* y, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-vbinary/f16-vadd.h" -#include "src/f16-vbinary/f16-vaddc.h" -#include "src/f16-vbinary/f16-vcmul.h" -#include "src/f16-vbinary/f16-vdiv.h" -#include "src/f16-vbinary/f16-vdivc.h" -#include "src/f16-vbinary/f16-vmax.h" -#include "src/f16-vbinary/f16-vmaxc.h" -#include "src/f16-vbinary/f16-vmin.h" -#include "src/f16-vbinary/f16-vminc.h" -#include "src/f16-vbinary/f16-vmul.h" -#include "src/f16-vbinary/f16-vmulc.h" -#include "src/f16-vbinary/f16-vprelu.h" -#include "src/f16-vbinary/f16-vpreluc.h" -#include "src/f16-vbinary/f16-vrpreluc.h" -#include "src/f16-vbinary/f16-vrdivc.h" -#include "src/f16-vbinary/f16-vrsubc.h" -#include "src/f16-vbinary/f16-vsqrdiff.h" -#include "src/f16-vbinary/f16-vsqrdiffc.h" -#include "src/f16-vbinary/f16-vsub.h" -#include "src/f16-vbinary/f16-vsubc.h" +#include "f16-vbinary/f16-vadd.h" +#include "f16-vbinary/f16-vaddc.h" +#include "f16-vbinary/f16-vcmul.h" +#include "f16-vbinary/f16-vdiv.h" +#include "f16-vbinary/f16-vdivc.h" +#include "f16-vbinary/f16-vmax.h" +#include "f16-vbinary/f16-vmaxc.h" +#include "f16-vbinary/f16-vmin.h" +#include "f16-vbinary/f16-vminc.h" +#include "f16-vbinary/f16-vmul.h" +#include "f16-vbinary/f16-vmulc.h" +#include "f16-vbinary/f16-vprelu.h" +#include "f16-vbinary/f16-vpreluc.h" +#include "f16-vbinary/f16-vrpreluc.h" +#include "f16-vbinary/f16-vrdivc.h" +#include "f16-vbinary/f16-vrsubc.h" +#include "f16-vbinary/f16-vsqrdiff.h" +#include "f16-vbinary/f16-vsqrdiffc.h" +#include "f16-vbinary/f16-vsub.h" +#include "f16-vbinary/f16-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -51,29 +51,29 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const float* a, const float* b, float* y, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-vbinary/f32-vadd.h" -#include 
"src/f32-vbinary/f32-vaddc.h" -#include "src/f32-vbinary/f32-vcopysign.h" -#include "src/f32-vbinary/f32-vcopysignc.h" -#include "src/f32-vbinary/f32-vcmul.h" -#include "src/f32-vbinary/f32-vdiv.h" -#include "src/f32-vbinary/f32-vdivc.h" -#include "src/f32-vbinary/f32-vmax.h" -#include "src/f32-vbinary/f32-vmaxc.h" -#include "src/f32-vbinary/f32-vmin.h" -#include "src/f32-vbinary/f32-vminc.h" -#include "src/f32-vbinary/f32-vmul.h" -#include "src/f32-vbinary/f32-vmulc.h" -#include "src/f32-vbinary/f32-vprelu.h" -#include "src/f32-vbinary/f32-vpreluc.h" -#include "src/f32-vbinary/f32-vrpreluc.h" -#include "src/f32-vbinary/f32-vrcopysignc.h" -#include "src/f32-vbinary/f32-vrdivc.h" -#include "src/f32-vbinary/f32-vrsubc.h" -#include "src/f32-vbinary/f32-vsqrdiff.h" -#include "src/f32-vbinary/f32-vsqrdiffc.h" -#include "src/f32-vbinary/f32-vsub.h" -#include "src/f32-vbinary/f32-vsubc.h" +#include "f32-vbinary/f32-vadd.h" +#include "f32-vbinary/f32-vaddc.h" +#include "f32-vbinary/f32-vcopysign.h" +#include "f32-vbinary/f32-vcopysignc.h" +#include "f32-vbinary/f32-vcmul.h" +#include "f32-vbinary/f32-vdiv.h" +#include "f32-vbinary/f32-vdivc.h" +#include "f32-vbinary/f32-vmax.h" +#include "f32-vbinary/f32-vmaxc.h" +#include "f32-vbinary/f32-vmin.h" +#include "f32-vbinary/f32-vminc.h" +#include "f32-vbinary/f32-vmul.h" +#include "f32-vbinary/f32-vmulc.h" +#include "f32-vbinary/f32-vprelu.h" +#include "f32-vbinary/f32-vpreluc.h" +#include "f32-vbinary/f32-vrpreluc.h" +#include "f32-vbinary/f32-vrcopysignc.h" +#include "f32-vbinary/f32-vrdivc.h" +#include "f32-vbinary/f32-vrsubc.h" +#include "f32-vbinary/f32-vsqrdiff.h" +#include "f32-vbinary/f32-vsqrdiffc.h" +#include "f32-vbinary/f32-vsub.h" +#include "f32-vbinary/f32-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -82,12 +82,12 @@ extern "C" { size_t n, const uint8_t* input_a, const uint8_t* input_b, \ uint8_t* output, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-vadd/qu8-vadd-minmax.h" -#include "src/qu8-vaddc/qu8-vaddc-minmax.h" -#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" -#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "qu8-vadd/qu8-vadd-minmax.h" +#include "qu8-vaddc/qu8-vaddc-minmax.h" +#include "qu8-vmul/qu8-vmul-minmax-fp32.h" +#include "qu8-vmul/qu8-vmul-minmax-rndnu.h" +#include "qu8-vmulc/qu8-vmulc-minmax-fp32.h" +#include "qu8-vmulc/qu8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -95,12 +95,12 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const int8_t* input_a, const int8_t* input_b, int8_t* output, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qs8-vadd/qs8-vadd-minmax.h" -#include "src/qs8-vaddc/qs8-vaddc-minmax.h" -#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" -#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "qs8-vadd/qs8-vadd-minmax.h" +#include "qs8-vaddc/qs8-vaddc-minmax.h" +#include "qs8-vmul/qs8-vmul-minmax-fp32.h" +#include "qs8-vmul/qs8-vmul-minmax-rndnu.h" +#include "qs8-vmulc/qs8-vmulc-minmax-fp32.h" +#include "qs8-vmulc/qs8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ 
-109,8 +109,8 @@ extern "C" { size_t n, const int32_t* input_a, const int32_t* input_b, \ int32_t* output, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/s32-vmul/s32-vmul.h" -#include "src/s32-vmul/s32-vmulc.h" +#include "s32-vmul/s32-vmul.h" +#include "s32-vmul/s32-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS #ifdef __cplusplus diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h index fab15282e61..3bd78dc57e3 100644 --- a/src/xnnpack/vcvt.h +++ b/src/xnnpack/vcvt.h @@ -17,19 +17,19 @@ extern "C" { #define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, type_in, type_out, params_type, init_params) \ XNN_INTERNAL void ukernel(size_t n, const type_in* input, type_out* output, const params_type* params); -#include "src/f16-f32-vcvt/f16-f32-vcvt.h" -#include "src/f16-qs8-vcvt/f16-qs8-vcvt.h" -#include "src/f32-f16-vcvt/f32-f16-vcvt.h" -#include "src/f32-qs8-vcvt/f32-qs8-vcvt.h" -#include "src/f32-qu8-vcvt/f32-qu8-vcvt.h" -#include "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h" -#include "src/qs8-f16-vcvt/qs8-f16-vcvt.h" -#include "src/qs8-f32-vcvt/qs8-f32-vcvt.h" -#include "src/qu8-f32-vcvt/qu8-f32-vcvt.h" -#include "src/s32-f32-vcvt/s32-f32-vcvt.h" -#include "src/u32-f32-vcvt/u32-f32-vcvt.h" -#include "src/qs8-vcvt/qs8-vcvt.h" -#include "src/qu8-vcvt/qu8-vcvt.h" +#include "f16-f32-vcvt/f16-f32-vcvt.h" +#include "f16-qs8-vcvt/f16-qs8-vcvt.h" +#include "f32-f16-vcvt/f32-f16-vcvt.h" +#include "f32-qs8-vcvt/f32-qs8-vcvt.h" +#include "f32-qu8-vcvt/f32-qu8-vcvt.h" +#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" +#include "qs8-f16-vcvt/qs8-f16-vcvt.h" +#include "qs8-f32-vcvt/qs8-f32-vcvt.h" +#include "qu8-f32-vcvt/qu8-f32-vcvt.h" +#include "s32-f32-vcvt/s32-f32-vcvt.h" +#include "u32-f32-vcvt/u32-f32-vcvt.h" +#include "qs8-vcvt/qs8-vcvt.h" +#include "qu8-vcvt/qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS #ifdef __cplusplus diff --git a/src/xnnpack/vhswish.h b/src/xnnpack/vhswish.h deleted file mode 100644 index cc38437f079..00000000000 --- a/src/xnnpack/vhswish.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/microparams.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ - datatype, params_type, init_params) \ - XNN_INTERNAL void fn_name( \ - size_t n, \ - const int8_t* input, \ - int8_t* output, \ - const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qs8-vhswish/qs8-vhswish.h" -#undef XNN_UKERNEL -#undef XNN_UKERNEL_WITH_PARAMS - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ - datatype, params_type, init_params) \ - XNN_INTERNAL void fn_name( \ - size_t n, \ - const uint8_t* input, \ - uint8_t* output, \ - const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-vhswish/qu8-vhswish.h" -#undef XNN_UKERNEL -#undef XNN_UKERNEL_WITH_PARAMS - - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vlog.h b/src/xnnpack/vlog.h deleted file mode 100644 index 3788af83709..00000000000 --- a/src/xnnpack/vlog.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_U32_VLOG_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t batch_size, \ - const uint32_t* input, \ - uint32_t input_lshift, \ - uint32_t output_scale, \ - uint16_t* output); - - -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x1) -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x2) -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x3) -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x4) - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vlrelu.h b/src/xnnpack/vlrelu.h deleted file mode 100644 index 8ec23a938b9..00000000000 --- a/src/xnnpack/vlrelu.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/microparams.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ - XNN_INTERNAL void fn_name(size_t n, const int8_t* input, int8_t* output, \ - const struct xnn_qs8_lrelu_params \ - params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qs8-vlrelu/qs8-vlrelu.h" -#undef XNN_UKERNEL - -#define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ - XNN_INTERNAL void fn_name(size_t n, const uint8_t* input, uint8_t* output, \ - const struct xnn_qu8_lrelu_params \ - params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-vlrelu/qu8-vlrelu.h" -#undef XNN_UKERNEL - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h index 5aefeceb748..2f3536fc687 100644 --- a/src/xnnpack/vunary.h +++ b/src/xnnpack/vunary.h @@ -41,14 +41,14 @@ extern "C" { XNN_INTERNAL void fn_name(size_t n, const int8_t* x, int8_t* y, \ const struct xnn_s8_minmax_params \ params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/s8-vclamp/s8-vclamp.h" +#include "s8-vclamp/s8-vclamp.h" #undef XNN_UKERNEL #define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ XNN_INTERNAL void fn_name(size_t n, const uint8_t* x, uint8_t* y, \ const struct xnn_u8_minmax_params \ params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/u8-vclamp/u8-vclamp.h" +#include "u8-vclamp/u8-vclamp.h" #undef XNN_UKERNEL #define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ @@ -56,7 +56,7 @@ extern "C" { #define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ datatype, params_type, init_params) \ DECLARE_BF16_UKERNEL_FUNCTION(fn_name, params_type); -#include "src/bf16-vabs/bf16-vabs.h" +#include "bf16-vabs/bf16-vabs.h" #undef XNN_UKERNEL #undef XNN_UKERNEL_WITH_PARAMS @@ -65,21 +65,21 @@ extern "C" { #define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ datatype, params_type, init_params) \ DECLARE_F16_UKERNEL_FUNCTION(fn_name, params_type); -#include "src/f16-vabs/f16-vabs.h" -#include "src/f16-vclamp/f16-vclamp.h" -#include "src/f16-velu/f16-velu.h" -#include "src/f16-vhswish/f16-vhswish.h" -#include "src/f16-vlrelu/f16-vlrelu.h" -#include "src/f16-vneg/f16-vneg.h" -#include "src/f16-vrnd/f16-vrndd.h" -#include "src/f16-vrnd/f16-vrndne.h" -#include "src/f16-vrnd/f16-vrndu.h" -#include "src/f16-vrnd/f16-vrndz.h" -#include "src/f16-vrsqrt/f16-vrsqrt.h" -#include 
"src/f16-vsigmoid/f16-vsigmoid.h" -#include "src/f16-vsqr/f16-vsqr.h" -#include "src/f16-vsqrt/f16-vsqrt.h" -#include "src/f16-vtanh/f16-vtanh.h" +#include "f16-vabs/f16-vabs.h" +#include "f16-vclamp/f16-vclamp.h" +#include "f16-velu/f16-velu.h" +#include "f16-vhswish/f16-vhswish.h" +#include "f16-vlrelu/f16-vlrelu.h" +#include "f16-vneg/f16-vneg.h" +#include "f16-vrnd/f16-vrndd.h" +#include "f16-vrnd/f16-vrndne.h" +#include "f16-vrnd/f16-vrndu.h" +#include "f16-vrnd/f16-vrndz.h" +#include "f16-vrsqrt/f16-vrsqrt.h" +#include "f16-vsigmoid/f16-vsigmoid.h" +#include "f16-vsqr/f16-vsqr.h" +#include "f16-vsqrt/f16-vsqrt.h" +#include "f16-vtanh/f16-vtanh.h" #undef XNN_UKERNEL #undef XNN_UKERNEL_WITH_PARAMS @@ -88,28 +88,45 @@ extern "C" { #define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ datatype, params_type, init_params) \ DECLARE_F32_UKERNEL_FUNCTION(fn_name, params_type); -#include "src/f32-vabs/f32-vabs.h" -#include "src/f32-vclamp/f32-vclamp.h" -#include "src/f32-velu/f32-velu.h" -#include "src/f32-vexp/f32-vexp.h" -#include "src/f32-vgelu/f32-vgelu.h" -#include "src/f32-vhswish/f32-vhswish.h" -#include "src/f32-vlog/f32-vlog.h" -#include "src/f32-vlrelu/f32-vlrelu.h" -#include "src/f32-vneg/f32-vneg.h" -#include "src/f32-vrelu/f32-vrelu.h" -#include "src/f32-vrnd/f32-vrndd.h" -#include "src/f32-vrnd/f32-vrndne.h" -#include "src/f32-vrnd/f32-vrndu.h" -#include "src/f32-vrnd/f32-vrndz.h" -#include "src/f32-vrsqrt/f32-vrsqrt.h" -#include "src/f32-vsigmoid/f32-vsigmoid.h" -#include "src/f32-vsqr/f32-vsqr.h" -#include "src/f32-vsqrt/f32-vsqrt.h" -#include "src/f32-vtanh/f32-vtanh.h" +#include "f32-vabs/f32-vabs.h" +#include "f32-vclamp/f32-vclamp.h" +#include "f32-velu/f32-velu.h" +#include "f32-vexp/f32-vexp.h" +#include "f32-vgelu/f32-vgelu.h" +#include "f32-vhswish/f32-vhswish.h" +#include "f32-vlog/f32-vlog.h" +#include "f32-vlrelu/f32-vlrelu.h" +#include "f32-vneg/f32-vneg.h" +#include "f32-vrelu/f32-vrelu.h" +#include "f32-vrnd/f32-vrndd.h" +#include "f32-vrnd/f32-vrndne.h" +#include "f32-vrnd/f32-vrndu.h" +#include "f32-vrnd/f32-vrndz.h" +#include "f32-vrsqrt/f32-vrsqrt.h" +#include "f32-vsigmoid/f32-vsigmoid.h" +#include "f32-vsqr/f32-vsqr.h" +#include "f32-vsqrt/f32-vsqrt.h" +#include "f32-vtanh/f32-vtanh.h" #undef XNN_UKERNEL #undef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + XNN_INTERNAL void fn_name( \ + size_t n, const int8_t* input, int8_t* output, \ + const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +#include "qs8-vhswish/qs8-vhswish.h" +#include "qs8-vlrelu/qs8-vlrelu.h" +#undef XNN_UKERNEL_WITH_PARAMS + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + XNN_INTERNAL void fn_name( \ + size_t n, const uint8_t* input, uint8_t* output, \ + const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +#include "qu8-vhswish/qu8-vhswish.h" +#include "qu8-vlrelu/qu8-vlrelu.h" +#undef XNN_UKERNEL_WITH_PARAMS #define DECLARE_XX_VUNARY_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/src/xnnpack/zerob.h b/src/xnnpack/zerob.h index 0ffef479264..f37395ff750 100644 --- a/src/xnnpack/zerob.h +++ b/src/xnnpack/zerob.h @@ -24,7 +24,7 @@ extern "C" { size_t channel_subtile_stride, \ const struct xnn_x32_packb_params* params); \ -#include "src/x32-zerob/x32-zerob.h" +#include "x32-zerob/x32-zerob.h" #undef XNN_UKERNEL diff --git a/test/BUILD.bazel 
b/test/BUILD.bazel index bcbedbc084d..a9e9f4f6965 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -91,6 +91,13 @@ xnnpack_cxx_library( ], ) +xnnpack_cxx_library( + name = "tanh_operator_tester", + testonly = True, + hdrs = ["tanh-operator-tester.h"], + deps = OPERATOR_TEST_DEPS + xnnpack_test_deps_for_library(), +) + xnnpack_cxx_library( name = "unary_operator_tester", testonly = True, @@ -554,15 +561,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "f16_prelu_test", - srcs = [ - "f16-prelu.cc", - "prelu-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "f16_spmm_minmax_test", srcs = [ @@ -799,15 +797,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "f32_prelu_test", - srcs = [ - "f32-prelu.cc", - "prelu-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "f32_raddexpminusmax_test", srcs = [ @@ -1318,12 +1307,6 @@ xnnpack_unit_test( ########################## Size tests for the library ######################### -xnnpack_binary( - name = "operator_size_test", - srcs = ["operator-size.c"], - deps = ["//:XNNPACK"], -) - xnnpack_binary( name = "subgraph_size_test", srcs = ["subgraph-size.c"], @@ -1621,15 +1604,6 @@ xnnpack_unit_test( deps = OPERATOR_TEST_DEPS, ) -xnnpack_unit_test( - name = "prelu_nc_test", - srcs = [ - "prelu-nc.cc", - "prelu-operator-tester.h", - ], - deps = OPERATOR_TEST_DEPS, -) - xnnpack_unit_test( name = "resize_bilinear_nhwc_test", srcs = [ @@ -1778,6 +1752,7 @@ xnnpack_cxx_library( deps = [ ":replicable_random_device", ":subgraph_unary_tester", + ":tanh_operator_tester", "//:XNNPACK", "//:math", "//:node_type", @@ -1897,6 +1872,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -1926,6 +1902,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -1941,6 +1918,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -1963,6 +1941,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -1982,6 +1961,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -1999,6 +1979,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2016,6 +1997,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -2033,6 +2015,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2075,6 +2058,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:requantization", @@ -2092,6 +2076,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:requantization", @@ -2109,6 +2094,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2125,6 +2111,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2140,6 +2127,7 @@ xnnpack_unit_test( 
":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -2148,21 +2136,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "prelu_test", - srcs = [ - "prelu.cc", - ], - deps = [ - ":replicable_random_device", - "//:XNNPACK", - "//:buffer", - "//:node_type", - "//:operators", - "//:subgraph", - ], -) - xnnpack_unit_test( name = "rope_test", srcs = [ @@ -2189,6 +2162,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:subgraph", ], @@ -2234,6 +2208,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2360,6 +2335,7 @@ xnnpack_unit_test( "//:allocation_type", "//:allocator", "//:buffer", + "//:math", "//:node_type", "//:params", "//:subgraph", diff --git a/test/abs.cc b/test/abs.cc index 8a037d79d61..1ce5c5ab860 100644 --- a/test/abs.cc +++ b/test/abs.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/average-pooling-2d.cc b/test/average-pooling-2d.cc index 6ad7c9ba4e7..9a7ef13ca89 100644 --- a/test/average-pooling-2d.cc +++ b/test/average-pooling-2d.cc @@ -14,12 +14,13 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template < diff --git a/test/avgpool-microkernel-tester.h b/test/avgpool-microkernel-tester.h index 101cd837852..3ac8b570286 100644 --- a/test/avgpool-microkernel-tester.h +++ b/test/avgpool-microkernel-tester.h @@ -21,10 +21,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "next_prime.h" #include "replicable_random_device.h" diff --git a/test/avgpool-minmax.cc b/test/avgpool-minmax.cc index cf2f484d250..fe78497d95b 100644 --- a/test/avgpool-minmax.cc +++ b/test/avgpool-minmax.cc @@ -43,10 +43,10 @@ const XnnTestParam xnn_test_params[] = { #define XNN_UKERNEL_UNIPASS(arch_flags, ukernel, channel_tile, channel_scaled_tile, primary_tile, incremental_tile, init_params) \ { #ukernel, AvgPoolMicrokernelTester::Kernel{ukernel, init_params}, arch_flags, channel_tile, channel_scaled_tile, primary_tile, incremental_tile }, -#include "src/f16-avgpool/f16-avgpool-minmax.h" -#include "src/f16-pavgpool/f16-pavgpool-minmax.h" -#include "src/f32-avgpool/f32-avgpool-minmax.h" -#include "src/f32-pavgpool/f32-pavgpool-minmax.h" +#include "f16-avgpool/f16-avgpool-minmax.h" +#include "f16-pavgpool/f16-pavgpool-minmax.h" +#include "f32-avgpool/f32-avgpool-minmax.h" +#include "f32-pavgpool/f32-pavgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -57,7 +57,7 @@ const XnnTestParam xnn_test_params[] = { #define XNN_UKERNEL_UNIPASS(arch_flags, ukernel, requantize, channel_tile, channel_scaled_tile, primary_tile, incremental_tile, init_params) \ { #ukernel, AvgPoolMicrokernelTester::Kernel{ukernel, init_params, requantize}, arch_flags, channel_tile, channel_scaled_tile, primary_tile, incremental_tile }, -#include "src/qu8-avgpool/qu8-avgpool-minmax.h" +#include 
"qu8-avgpool/qu8-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS diff --git a/test/bankers-rounding.cc b/test/bankers-rounding.cc index 00cde38e0a3..77f508a8f44 100644 --- a/test/bankers-rounding.cc +++ b/test/bankers-rounding.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/batch-matrix-multiply.cc b/test/batch-matrix-multiply.cc index 39a2d49065b..c3f1dec9ad3 100644 --- a/test/batch-matrix-multiply.cc +++ b/test/batch-matrix-multiply.cc @@ -21,11 +21,12 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template diff --git a/test/bf16-vabs.cc b/test/bf16-vabs.cc index 09e65eb412c..9d925bb1332 100644 --- a/test/bf16-vabs.cc +++ b/test/bf16-vabs.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); -#include "src/bf16-vabs/bf16-vabs.h" +#include "bf16-vabs/bf16-vabs.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/binary-elementwise-nd.cc b/test/binary-elementwise-nd.cc index 874c80e9b7e..e4f26e82a26 100644 --- a/test/binary-elementwise-nd.cc +++ b/test/binary-elementwise-nd.cc @@ -77,6 +77,8 @@ class BinaryElementwiseOperatorTester { return "Minimum"; case xnn_binary_multiply: return "Multiply"; + case xnn_binary_prelu: + return "Prelu"; case xnn_binary_subtract: return "Subtract"; case xnn_binary_squared_difference: diff --git a/test/binary.cc b/test/binary.cc index 0a5da6b753d..7280dfd90f4 100644 --- a/test/binary.cc +++ b/test/binary.cc @@ -165,6 +165,8 @@ static const char* binary_operator_to_string( return "Minimum"; case xnn_binary_multiply: return "Multiply"; + case xnn_binary_prelu: + return "Prelu"; case xnn_binary_subtract: return "Subtract"; case xnn_binary_squared_difference: @@ -913,7 +915,8 @@ INSTANTIATE_TEST_SUITE_P(test, BinaryTestF16, testing::Values(xnn_binary_add, xnn_binary_subtract, xnn_binary_multiply, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, - xnn_binary_squared_difference), + xnn_binary_squared_difference, + xnn_binary_prelu), [](const auto& info) { return ToString(info.param); }); #endif INSTANTIATE_TEST_SUITE_P(test, BinaryTestF32, @@ -921,7 +924,8 @@ INSTANTIATE_TEST_SUITE_P(test, BinaryTestF32, xnn_binary_multiply, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, xnn_binary_copysign, - xnn_binary_squared_difference), + xnn_binary_squared_difference, + xnn_binary_prelu), [](const auto& info) { return ToString(info.param); }); INSTANTIATE_TEST_SUITE_P(test, BinaryTestS32, testing::Values(xnn_binary_multiply), diff --git a/test/ceiling.cc b/test/ceiling.cc index ee493fc69a5..13682a681b2 100644 --- a/test/ceiling.cc +++ b/test/ceiling.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/clamp.cc b/test/clamp.cc index df4d225920b..aa03eda65a7 100644 --- a/test/clamp.cc +++ b/test/clamp.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include 
"xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/concatenate2.cc b/test/concatenate2.cc index ff3ad0066d0..2a078aa1d69 100644 --- a/test/concatenate2.cc +++ b/test/concatenate2.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate2Test : public ::testing::Test { diff --git a/test/concatenate3.cc b/test/concatenate3.cc index 3743e663bd9..5706b5df330 100644 --- a/test/concatenate3.cc +++ b/test/concatenate3.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate3Test : public ::testing::Test { diff --git a/test/concatenate4.cc b/test/concatenate4.cc index d999a20d887..d37320daf9d 100644 --- a/test/concatenate4.cc +++ b/test/concatenate4.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate4Test : public ::testing::Test { diff --git a/test/concatenate5.cc b/test/concatenate5.cc index 8546da9fb7d..e06af5b7887 100644 --- a/test/concatenate5.cc +++ b/test/concatenate5.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate5Test : public ::testing::Test { diff --git a/test/conv-hwc2chw-microkernel-tester.h b/test/conv-hwc2chw-microkernel-tester.h index 3f32e698821..f7f852f658f 100644 --- a/test/conv-hwc2chw-microkernel-tester.h +++ b/test/conv-hwc2chw-microkernel-tester.h @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/pack.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class ConvHWC2CHWMicrokernelTester { diff --git a/test/convert-nc-eager.cc b/test/convert-nc-eager.cc index a09464790ea..af668afec9c 100644 --- a/test/convert-nc-eager.cc +++ b/test/convert-nc-eager.cc @@ -124,8 +124,6 @@ TEST(CONVERT_NC_F32_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -136,8 +134,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -149,8 +145,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -162,8 +156,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - 
.qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -176,8 +168,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -190,9 +180,7 @@ TEST(CONVERT_NC_F32_QS8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestRunF32toQS8(); } } @@ -208,9 +196,7 @@ TEST(CONVERT_NC_F32_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestRunF32toQS8(); } } @@ -304,8 +290,6 @@ TEST(CONVERT_NC_QS16_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -316,8 +300,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -329,8 +311,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -342,8 +322,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -356,8 +334,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -370,9 +346,7 @@ TEST(CONVERT_NC_QS16_QS8, input_scale) { .batch_size(3) .channels(channels) .input_scale(input_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestRunQS16toQS8(); } } @@ -388,8 +362,6 @@ TEST(CONVERT_NC_QS16_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -401,8 +373,6 @@ TEST(CONVERT_NC_F32_QU8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -413,8 +383,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -426,8 +394,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -439,8 +405,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -453,8 +417,6 @@ TEST(CONVERT_NC_F32_QU8, 
small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -467,8 +429,6 @@ TEST(CONVERT_NC_F32_QU8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -485,8 +445,6 @@ TEST(CONVERT_NC_F32_QU8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } diff --git a/test/convert-nc.cc b/test/convert-nc.cc index 9d1102a42b0..33eeee1a0ca 100644 --- a/test/convert-nc.cc +++ b/test/convert-nc.cc @@ -124,8 +124,6 @@ TEST(CONVERT_NC_F16_QD8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -136,8 +134,6 @@ TEST(CONVERT_NC_F16_QD8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -149,8 +145,6 @@ TEST(CONVERT_NC_F16_QD8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -162,8 +156,6 @@ TEST(CONVERT_NC_F16_QD8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -176,53 +168,16 @@ TEST(CONVERT_NC_F16_QD8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } } -TEST(CONVERT_NC_F16_QD8, output_min) { - for (int16_t qmin = std::numeric_limits::min(); - qmin < std::numeric_limits::max(); - qmin += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(qmin) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF16toQD8(); - } - } -} - -TEST(CONVERT_NC_F16_QD8, output_max) { - for (int16_t qmax = std::numeric_limits::min() + 1; - qmax <= std::numeric_limits::max(); - qmax += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(qmax) - .iterations(3) - .TestF16toQD8(); - } - } -} TEST(CONVERT_NC_F32_QD8, unit_batch) { for (size_t channels = 1; channels < 100; channels++) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -233,8 +188,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -246,8 +199,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -259,8 +210,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch_with_output_stride) { 
.batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -273,8 +222,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -285,8 +232,6 @@ TEST(CONVERT_NC_F32_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -297,8 +242,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -310,8 +253,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -323,8 +264,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -337,8 +276,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -351,9 +288,7 @@ TEST(CONVERT_NC_F32_QS8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestF32toQS8(); } } @@ -369,43 +304,7 @@ TEST(CONVERT_NC_F32_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQS8(); - } - } -} - -TEST(CONVERT_NC_F32_QS8, output_min) { - for (int16_t qmin = std::numeric_limits::min(); - qmin < std::numeric_limits::max(); - qmin += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(qmin) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQS8(); - } - } -} - -TEST(CONVERT_NC_F32_QS8, output_max) { - for (int16_t qmax = std::numeric_limits::min() + 1; - qmax <= std::numeric_limits::max(); - qmax += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(qmax) - .iterations(3) + .iterations(3) .TestF32toQS8(); } } @@ -416,8 +315,6 @@ TEST(CONVERT_NC_F32_QU8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -428,8 +325,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -441,8 +336,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -454,8 +347,6 @@ 
TEST(CONVERT_NC_F32_QU8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -468,8 +359,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -482,8 +371,6 @@ TEST(CONVERT_NC_F32_QU8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -500,42 +387,6 @@ TEST(CONVERT_NC_F32_QU8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQU8(); - } - } -} - -TEST(CONVERT_NC_F32_QU8, output_min) { - for (int16_t qmin = std::numeric_limits::min(); - qmin < std::numeric_limits::max(); - qmin += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(qmin) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQU8(); - } - } -} - -TEST(CONVERT_NC_F32_QU8, output_max) { - for (int16_t qmax = std::numeric_limits::min() + 1; - qmax <= std::numeric_limits::max(); - qmax += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(qmax) .iterations(3) .TestF32toQU8(); } @@ -713,8 +564,6 @@ TEST(CONVERT_NC_QS16_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -725,8 +574,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -738,8 +585,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -751,8 +596,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -765,8 +608,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -779,9 +620,7 @@ TEST(CONVERT_NC_QS16_QS8, input_scale) { .batch_size(3) .channels(channels) .input_scale(input_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestQS16toQS8(); } } @@ -797,9 +636,7 @@ TEST(CONVERT_NC_QS16_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestQS16toQS8(); } } @@ -893,8 +730,6 @@ TEST(CONVERT_NC_F32_QP8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - 
.qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -905,8 +740,6 @@ TEST(CONVERT_NC_F32_QP8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -918,8 +751,6 @@ TEST(CONVERT_NC_F32_QP8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } diff --git a/test/convert-operator-tester.h b/test/convert-operator-tester.h index 2858ef3dc3c..e88eea376bf 100644 --- a/test/convert-operator-tester.h +++ b/test/convert-operator-tester.h @@ -109,24 +109,6 @@ class ConvertOperatorTester { return this->zero_point_; } - ConvertOperatorTester& qmin(int16_t qmin) { - this->qmin_ = qmin; - return *this; - } - - int16_t qmin() const { - return this->qmin_; - } - - ConvertOperatorTester& qmax(int16_t qmax) { - this->qmax_ = qmax; - return *this; - } - - int16_t qmax() const { - return this->qmax_; - } - ConvertOperatorTester& iterations(size_t iterations) { this->iterations_ = iterations; return *this; @@ -412,10 +394,6 @@ class ConvertOperatorTester { } void TestF32toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -434,8 +412,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = int8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -446,7 +424,7 @@ class ConvertOperatorTester { ASSERT_EQ(xnn_status_success, xnn_create_convert_nc_f32_qs8( - output_scale(), int8_t(zero_point()), int8_t(qmin()), int8_t(qmax()), + output_scale(), int8_t(zero_point()), 0, &convert_op)); ASSERT_NE(nullptr, convert_op); @@ -469,10 +447,6 @@ class ConvertOperatorTester { } void TestF32toQU8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -491,8 +465,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = uint8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -503,7 +477,7 @@ class ConvertOperatorTester { ASSERT_EQ(xnn_status_success, xnn_create_convert_nc_f32_qu8( - output_scale(), uint8_t(zero_point()), uint8_t(qmin()), uint8_t(qmax()), + output_scale(), uint8_t(zero_point()), 
0, &convert_op)); ASSERT_NE(nullptr, convert_op); @@ -631,10 +605,6 @@ class ConvertOperatorTester { } void TestQS16toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -819,10 +789,6 @@ class ConvertOperatorTester { } void TestRunF32toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -841,8 +807,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = int8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -910,10 +876,6 @@ class ConvertOperatorTester { } void TestRunQS16toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -960,10 +922,6 @@ class ConvertOperatorTester { } void TestRunF32toQU8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -982,8 +940,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = uint8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -1057,7 +1015,5 @@ class ConvertOperatorTester { float input_scale_{150.0f}; float output_scale_{3.0f}; int16_t zero_point_{1}; - int16_t qmin_{std::numeric_limits::min()}; - int16_t qmax_{std::numeric_limits::max()}; size_t iterations_{15}; }; diff --git a/test/convert.cc b/test/convert.cc index ca2fe7e0df3..931f18d5f51 100644 --- a/test/convert.cc +++ b/test/convert.cc @@ -13,6 +13,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" @@ -484,7 +485,7 @@ TEST_F(ConvertTestF32ToQS8, matches_operator_api) // Call operator API. xnn_operator_t op = nullptr; const xnn_status status = xnn_create_convert_nc_f32_qs8( - scale, signed_zero_point, INT8_MIN, INT8_MAX, /*flags=*/0, &op); + scale, signed_zero_point, /*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } @@ -593,7 +594,7 @@ TEST_F(ConvertTestF32ToQU8, matches_operator_api) // Call operator API. 
xnn_operator_t op = nullptr; const xnn_status status = xnn_create_convert_nc_f32_qu8( - scale, unsigned_zero_point, 0, UINT8_MAX, /*flags=*/0, &op); + scale, unsigned_zero_point, /*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } diff --git a/test/convolution-2d.cc b/test/convolution-2d.cc index 43b98339f13..c0d52531580 100644 --- a/test/convolution-2d.cc +++ b/test/convolution-2d.cc @@ -15,13 +15,14 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "convolution-test-helpers.h" #include "replicable_random_device.h" diff --git a/test/convolution-operator-tester.h b/test/convolution-operator-tester.h index 298f687e187..3e2b1c3b9e0 100644 --- a/test/convolution-operator-tester.h +++ b/test/convolution-operator-tester.h @@ -22,10 +22,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/cache.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "convolution-test-helpers.h" #include "replicable_random_device.h" #include "pthreadpool.h" diff --git a/test/copy.cc b/test/copy.cc index 693981d4924..ceafa6564a9 100644 --- a/test/copy.cc +++ b/test/copy.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/deconvolution-2d.cc b/test/deconvolution-2d.cc index de1975b5272..01c80f733bd 100644 --- a/test/deconvolution-2d.cc +++ b/test/deconvolution-2d.cc @@ -15,12 +15,13 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class DeconvolutionTestBase : public ::testing::Test { diff --git a/test/deconvolution-operator-tester.h b/test/deconvolution-operator-tester.h index e83d4c0b968..831d57b7494 100644 --- a/test/deconvolution-operator-tester.h +++ b/test/deconvolution-operator-tester.h @@ -24,10 +24,11 @@ #include #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/cache.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class DeconvolutionOperatorTester { diff --git a/test/depth-to-space-2d.cc b/test/depth-to-space-2d.cc index 23c9808e6cc..6a341fbdeb1 100644 --- a/test/depth-to-space-2d.cc +++ b/test/depth-to-space-2d.cc @@ -18,10 +18,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class DepthToSpaceTest : public ::testing::Test { diff --git a/test/depthwise-convolution-2d.cc b/test/depthwise-convolution-2d.cc index 91fbab1d2b5..44d51f6f71a 100644 --- a/test/depthwise-convolution-2d.cc +++ b/test/depthwise-convolution-2d.cc @@ -16,13 +16,14 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include 
"xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "convolution-test-helpers.h" #include "replicable_random_device.h" diff --git a/test/dwconv2d-microkernel-tester.h b/test/dwconv2d-microkernel-tester.h index eb99ab1d3f7..1a25dade488 100644 --- a/test/dwconv2d-microkernel-tester.h +++ b/test/dwconv2d-microkernel-tester.h @@ -19,9 +19,10 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class DWConv2DMicrokernelTester { diff --git a/test/elu.cc b/test/elu.cc index 1694a195ae4..39866bfa7da 100644 --- a/test/elu.cc +++ b/test/elu.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/even-split2.cc b/test/even-split2.cc index bcd6b004a30..45bd14d48b5 100644 --- a/test/even-split2.cc +++ b/test/even-split2.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class EvenSplit2Test : public ::testing::Test { diff --git a/test/even-split3.cc b/test/even-split3.cc index c72bacc8d9d..28418947e69 100644 --- a/test/even-split3.cc +++ b/test/even-split3.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class EvenSplit3Test : public ::testing::Test { diff --git a/test/even-split4.cc b/test/even-split4.cc index b327c868f48..16f968feb72 100644 --- a/test/even-split4.cc +++ b/test/even-split4.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class EvenSplit4Test : public ::testing::Test { diff --git a/test/f16-dwconv-minmax-multipass.cc b/test/f16-dwconv-minmax-multipass.cc index 994598c292f..d87f21b8244 100644 --- a/test/f16-dwconv-minmax-multipass.cc +++ b/test/f16-dwconv-minmax-multipass.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f16-dwconv/f16-dwconv-minmax-multipass.h" +#include "f16-dwconv/f16-dwconv-minmax-multipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-dwconv-minmax-unipass.cc b/test/f16-dwconv-minmax-unipass.cc index fbbce5862f8..624e20f0ba2 100644 --- a/test/f16-dwconv-minmax-unipass.cc +++ b/test/f16-dwconv-minmax-unipass.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f16-dwconv/f16-dwconv-minmax-unipass.h" +#include "f16-dwconv/f16-dwconv-minmax-unipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-f32-vcvt.cc b/test/f16-f32-vcvt.cc index bd583c3e2df..d723077bf52 100644 --- a/test/f16-f32-vcvt.cc +++ b/test/f16-f32-vcvt.cc @@ -18,5 +18,5 @@ XNN_TEST_CVT_BATCH_EQ(ukernel, 
arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f16-f32-vcvt/f16-f32-vcvt.h" +#include "f16-f32-vcvt/f16-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f16-prelu.cc b/test/f16-prelu.cc deleted file mode 100644 index 36c4e7ef28e..00000000000 --- a/test/f16-prelu.cc +++ /dev/null @@ -1,491 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: test/f16-prelu.yaml -// Generator: tools/generate-prelu-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/prelu.h" -#include "prelu-microkernel-tester.h" - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, rows_lt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, rows_div_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, rows_gt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, output_stride) { 
- TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, inplace) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, rows_lt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, rows_div_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, rows_gt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, output_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - 
TEST(F16_PRELU__NEONFP16ARITH_2X16, inplace) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_PRELU__F16C_2X8, channels_eq_8) { - TEST_REQUIRES_X86_F16C; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - - TEST(F16_PRELU__F16C_2X8, channels_div_8) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - - TEST(F16_PRELU__F16C_2X8, channels_lt_8) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - - TEST(F16_PRELU__F16C_2X8, channels_gt_8) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - - TEST(F16_PRELU__F16C_2X8, rows_lt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, rows_div_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, rows_gt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, output_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, inplace) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_PRELU__F16C_2X16, channels_eq_16) { - TEST_REQUIRES_X86_F16C; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - - TEST(F16_PRELU__F16C_2X16, channels_div_16) { - 
TEST_REQUIRES_X86_F16C; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - - TEST(F16_PRELU__F16C_2X16, channels_lt_16) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - - TEST(F16_PRELU__F16C_2X16, channels_gt_16) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - - TEST(F16_PRELU__F16C_2X16, rows_lt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, rows_div_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, rows_gt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, output_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, inplace) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f16-prelu.yaml b/test/f16-prelu.yaml deleted file mode 100644 index 22a92cd0b51..00000000000 --- a/test/f16-prelu.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2020 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON+FP16ARITH -- name: xnn_f16_prelu_ukernel__neonfp16arith_2x8 -- name: xnn_f16_prelu_ukernel__neonfp16arith_2x16 - -# x86 F16C -- name: xnn_f16_prelu_ukernel__f16c_2x8 -- name: xnn_f16_prelu_ukernel__f16c_2x16 diff --git a/test/f16-qs8-vcvt.cc b/test/f16-qs8-vcvt.cc index 6d42943dfb4..89e963b8d47 100644 --- a/test/f16-qs8-vcvt.cc +++ b/test/f16-qs8-vcvt.cc @@ -23,5 +23,5 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u \ \ XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f16-qs8-vcvt/f16-qs8-vcvt.h" +#include "f16-qs8-vcvt/f16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f16-vabs.cc b/test/f16-vabs.cc index 61f82ac0ed3..bd9953a00dd 100644 --- a/test/f16-vabs.cc +++ b/test/f16-vabs.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); -#include "src/f16-vabs/f16-vabs.h" +#include "f16-vabs/f16-vabs.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vadd.cc b/test/f16-vadd.cc index 6bf82168024..d9568a6e34d 100644 --- a/test/f16-vadd.cc +++ b/test/f16-vadd.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f16-vbinary/f16-vadd.h" +#include "f16-vbinary/f16-vadd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vaddc.cc b/test/f16-vaddc.cc index f1717a8d5c7..68a7361cbcc 100644 --- a/test/f16-vaddc.cc +++ b/test/f16-vaddc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f16-vbinary/f16-vaddc.h" +#include "f16-vbinary/f16-vaddc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vclamp.cc b/test/f16-vclamp.cc index 3a3b7511687..db5fd14048a 100644 --- a/test/f16-vclamp.cc +++ b/test/f16-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vclamp/f16-vclamp.h" +#include "f16-vclamp/f16-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vcmul.cc b/test/f16-vcmul.cc index f0492ca5190..9f551f262fa 100644 --- a/test/f16-vcmul.cc +++ b/test/f16-vcmul.cc @@ -21,5 +21,5 @@ 
XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/f16-vbinary/f16-vcmul.h" +#include "f16-vbinary/f16-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vdiv.cc b/test/f16-vdiv.cc index 6f6b91f6665..f1a1e72c1cd 100644 --- a/test/f16-vdiv.cc +++ b/test/f16-vdiv.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f16-vbinary/f16-vdiv.h" +#include "f16-vbinary/f16-vdiv.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vdivc.cc b/test/f16-vdivc.cc index b30b4789328..522d50d613b 100644 --- a/test/f16-vdivc.cc +++ b/test/f16-vdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f16-vbinary/f16-vdivc.h" +#include "f16-vbinary/f16-vdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-velu.cc b/test/f16-velu.cc index b5ab981aff4..d1ddb4f7ce1 100644 --- a/test/f16-velu.cc +++ b/test/f16-velu.cc @@ -76,5 +76,5 @@ TEST(ukernel, beta) { } \ } \ } -#include "src/f16-velu/f16-velu.h" +#include "f16-velu/f16-velu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vhswish.cc b/test/f16-vhswish.cc index 39bf2bdb63a..f8b4a3b5433 100644 --- a/test/f16-vhswish.cc +++ b/test/f16-vhswish.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vhswish/f16-vhswish.h" +#include "f16-vhswish/f16-vhswish.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vlrelu.cc b/test/f16-vlrelu.cc index 2f931a115b0..eea1518a1e6 100644 --- a/test/f16-vlrelu.cc +++ b/test/f16-vlrelu.cc @@ -46,5 +46,5 @@ TEST(ukernel, slope) { } \ } \ } -#include "src/f16-vlrelu/f16-vlrelu.h" +#include "f16-vlrelu/f16-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmax.cc b/test/f16-vmax.cc index 598de281d27..24509ce02bd 100644 --- a/test/f16-vmax.cc +++ b/test/f16-vmax.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, 
arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f16-vbinary/f16-vmax.h" +#include "f16-vbinary/f16-vmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmaxc.cc b/test/f16-vmaxc.cc index a27e02cd6b6..daaafe58ba0 100644 --- a/test/f16-vmaxc.cc +++ b/test/f16-vmaxc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f16-vbinary/f16-vmaxc.h" +#include "f16-vbinary/f16-vmaxc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmin.cc b/test/f16-vmin.cc index aca129bf78b..4547f969cb3 100644 --- a/test/f16-vmin.cc +++ b/test/f16-vmin.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f16-vbinary/f16-vmin.h" +#include "f16-vbinary/f16-vmin.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vminc.cc b/test/f16-vminc.cc index da6c5930139..c186bb4f87b 100644 --- a/test/f16-vminc.cc +++ b/test/f16-vminc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f16-vbinary/f16-vminc.h" +#include "f16-vbinary/f16-vminc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmul.cc b/test/f16-vmul.cc index cc169da9dbb..23ea43be68a 100644 --- a/test/f16-vmul.cc +++ b/test/f16-vmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f16-vbinary/f16-vmul.h" +#include "f16-vbinary/f16-vmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmulc.cc b/test/f16-vmulc.cc index 9ff93b385b0..b41f8e486ce 100644 --- a/test/f16-vmulc.cc +++ b/test/f16-vmulc.cc @@ -21,5 
+21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f16-vbinary/f16-vmulc.h" +#include "f16-vbinary/f16-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vneg.cc b/test/f16-vneg.cc index add84b6d095..5cf6e86eda8 100644 --- a/test/f16-vneg.cc +++ b/test/f16-vneg.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); -#include "src/f16-vneg/f16-vneg.h" +#include "f16-vneg/f16-vneg.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vprelu.cc b/test/f16-vprelu.cc index 65ab0cce57b..4aca438164e 100644 --- a/test/f16-vprelu.cc +++ b/test/f16-vprelu.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f16-vbinary/f16-vprelu.h" +#include "f16-vbinary/f16-vprelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vpreluc.cc b/test/f16-vpreluc.cc index 55ee99e7415..9ac5d00a873 100644 --- a/test/f16-vpreluc.cc +++ b/test/f16-vpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f16-vbinary/f16-vpreluc.h" +#include "f16-vbinary/f16-vpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrdivc.cc b/test/f16-vrdivc.cc index fc73faa51bf..55ff00068ae 100644 --- a/test/f16-vrdivc.cc +++ b/test/f16-vrdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); -#include "src/f16-vbinary/f16-vrdivc.h" +#include "f16-vbinary/f16-vrdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndd.cc b/test/f16-vrndd.cc index 
18e84573163..8690fffb912 100644 --- a/test/f16-vrndd.cc +++ b/test/f16-vrndd.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); -#include "src/f16-vrnd/f16-vrndd.h" +#include "f16-vrnd/f16-vrndd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndne.cc b/test/f16-vrndne.cc index 103045e68c5..d11342514cb 100644 --- a/test/f16-vrndne.cc +++ b/test/f16-vrndne.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); -#include "src/f16-vrnd/f16-vrndne.h" +#include "f16-vrnd/f16-vrndne.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndu.cc b/test/f16-vrndu.cc index f265fcd4173..44f229fff03 100644 --- a/test/f16-vrndu.cc +++ b/test/f16-vrndu.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); -#include "src/f16-vrnd/f16-vrndu.h" +#include "f16-vrnd/f16-vrndu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndz.cc b/test/f16-vrndz.cc index db57af8077c..5cc01810577 100644 --- a/test/f16-vrndz.cc +++ b/test/f16-vrndz.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); -#include "src/f16-vrnd/f16-vrndz.h" +#include "f16-vrnd/f16-vrndz.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrpreluc.cc b/test/f16-vrpreluc.cc index b769c7763b0..65ca49c6c0b 100644 --- a/test/f16-vrpreluc.cc +++ b/test/f16-vrpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); -#include "src/f16-vbinary/f16-vrpreluc.h" +#include "f16-vbinary/f16-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrsqrt.cc b/test/f16-vrsqrt.cc index 777148fbc27..ffe76b20cff 100644 --- a/test/f16-vrsqrt.cc +++ b/test/f16-vrsqrt.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ 
XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vrsqrt/f16-vrsqrt.h" +#include "f16-vrsqrt/f16-vrsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrsubc.cc b/test/f16-vrsubc.cc index 74cb4632e4e..873aba2531f 100644 --- a/test/f16-vrsubc.cc +++ b/test/f16-vrsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); -#include "src/f16-vbinary/f16-vrsubc.h" +#include "f16-vbinary/f16-vrsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsigmoid.cc b/test/f16-vsigmoid.cc index c9fa13f1f1b..661f486e033 100644 --- a/test/f16-vsigmoid.cc +++ b/test/f16-vsigmoid.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vsigmoid/f16-vsigmoid.h" +#include "f16-vsigmoid/f16-vsigmoid.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqr.cc b/test/f16-vsqr.cc index af906dca2b9..97dedacf945 100644 --- a/test/f16-vsqr.cc +++ b/test/f16-vsqr.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); -#include "src/f16-vsqr/f16-vsqr.h" +#include "f16-vsqr/f16-vsqr.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqrdiff.cc b/test/f16-vsqrdiff.cc index dd60bda0a1c..a621c3608f6 100644 --- a/test/f16-vsqrdiff.cc +++ b/test/f16-vsqrdiff.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include "src/f16-vbinary/f16-vsqrdiff.h" +#include "f16-vbinary/f16-vsqrdiff.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqrdiffc.cc b/test/f16-vsqrdiffc.cc index 3682b324ac5..9a631468af4 100644 --- a/test/f16-vsqrdiffc.cc +++ b/test/f16-vsqrdiffc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include 
"src/f16-vbinary/f16-vsqrdiffc.h" +#include "f16-vbinary/f16-vsqrdiffc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqrt.cc b/test/f16-vsqrt.cc index 47a110bf40f..66d396ad7fe 100644 --- a/test/f16-vsqrt.cc +++ b/test/f16-vsqrt.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vsqrt/f16-vsqrt.h" +#include "f16-vsqrt/f16-vsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsub.cc b/test/f16-vsub.cc index aba129113a1..96e41b52dc3 100644 --- a/test/f16-vsub.cc +++ b/test/f16-vsub.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f16-vbinary/f16-vsub.h" +#include "f16-vbinary/f16-vsub.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsubc.cc b/test/f16-vsubc.cc index ea52fcd5567..6b66e4a5938 100644 --- a/test/f16-vsubc.cc +++ b/test/f16-vsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f16-vbinary/f16-vsubc.h" +#include "f16-vbinary/f16-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vtanh.cc b/test/f16-vtanh.cc index 157e956a26e..924acb8d415 100644 --- a/test/f16-vtanh.cc +++ b/test/f16-vtanh.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vtanh/f16-vtanh.h" +#include "f16-vtanh/f16-vtanh.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-minmax-multipass.cc b/test/f32-dwconv-minmax-multipass.cc index 4b1ae109c14..a032524a298 100644 --- a/test/f32-dwconv-minmax-multipass.cc +++ b/test/f32-dwconv-minmax-multipass.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-minmax-multipass.h" +#include "f32-dwconv/f32-dwconv-minmax-multipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-minmax-unipass.cc b/test/f32-dwconv-minmax-unipass.cc index 00cf20ccf08..3e13c8ad80b 100644 --- a/test/f32-dwconv-minmax-unipass.cc +++ b/test/f32-dwconv-minmax-unipass.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-minmax-unipass.h" +#include 
"f32-dwconv/f32-dwconv-minmax-unipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-multipass.cc b/test/f32-dwconv-multipass.cc index 279dfc94b4c..e1528fec836 100644 --- a/test/f32-dwconv-multipass.cc +++ b/test/f32-dwconv-multipass.cc @@ -273,5 +273,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-multipass.h" +#include "f32-dwconv/f32-dwconv-multipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-unipass.cc b/test/f32-dwconv-unipass.cc index db24377ba3b..f847307c615 100644 --- a/test/f32-dwconv-unipass.cc +++ b/test/f32-dwconv-unipass.cc @@ -151,5 +151,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-unipass.h" +#include "f32-dwconv/f32-dwconv-unipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-f16-vcvt.cc b/test/f32-f16-vcvt.cc index 4688b03bfa0..6ca121fd452 100644 --- a/test/f32-f16-vcvt.cc +++ b/test/f32-f16-vcvt.cc @@ -18,5 +18,5 @@ XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f32-f16-vcvt/f32-f16-vcvt.h" +#include "f32-f16-vcvt/f32-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f32-prelu.cc b/test/f32-prelu.cc deleted file mode 100644 index 58fdf245375..00000000000 --- a/test/f32-prelu.cc +++ /dev/null @@ -1,6259 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/f32-prelu.yaml -// Generator: tools/generate-prelu-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/prelu.h" -#include "prelu-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_1X4, channels_eq_4) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - - TEST(F32_PRELU__NEON_1X4, channels_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - - TEST(F32_PRELU__NEON_1X4, channels_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - - TEST(F32_PRELU__NEON_1X4, channels_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - - TEST(F32_PRELU__NEON_1X4, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } - - TEST(F32_PRELU__NEON_1X4, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } - - TEST(F32_PRELU__NEON_1X4, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } - - TEST(F32_PRELU__NEON_1X4, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_1X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - - TEST(F32_PRELU__NEON_1X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - - TEST(F32_PRELU__NEON_1X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - - TEST(F32_PRELU__NEON_1X8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - - TEST(F32_PRELU__NEON_1X8, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t 
channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } - - TEST(F32_PRELU__NEON_1X8, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } - - TEST(F32_PRELU__NEON_1X8, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } - - TEST(F32_PRELU__NEON_1X8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_1X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - - TEST(F32_PRELU__NEON_1X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - - TEST(F32_PRELU__NEON_1X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - - TEST(F32_PRELU__NEON_1X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - - TEST(F32_PRELU__NEON_1X16, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } - - TEST(F32_PRELU__NEON_1X16, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } - - TEST(F32_PRELU__NEON_1X16, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } - - TEST(F32_PRELU__NEON_1X16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - 
TEST(F32_PRELU__NEON_2X4, channels_eq_4) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - - TEST(F32_PRELU__NEON_2X4, channels_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - - TEST(F32_PRELU__NEON_2X4, channels_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - - TEST(F32_PRELU__NEON_2X4, channels_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - - TEST(F32_PRELU__NEON_2X4, rows_lt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, rows_div_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, rows_gt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_2X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - - TEST(F32_PRELU__NEON_2X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - - TEST(F32_PRELU__NEON_2X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - - TEST(F32_PRELU__NEON_2X8, channels_gt_8) { - 
TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - - TEST(F32_PRELU__NEON_2X8, rows_lt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, rows_div_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, rows_gt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_2X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - - TEST(F32_PRELU__NEON_2X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - - TEST(F32_PRELU__NEON_2X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - - TEST(F32_PRELU__NEON_2X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - - TEST(F32_PRELU__NEON_2X16, rows_lt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, rows_div_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, rows_gt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_4X4, channels_eq_4) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - - TEST(F32_PRELU__NEON_4X4, channels_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - - TEST(F32_PRELU__NEON_4X4, channels_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - - TEST(F32_PRELU__NEON_4X4, channels_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - - TEST(F32_PRELU__NEON_4X4, rows_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, rows_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, rows_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_4X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - - TEST(F32_PRELU__NEON_4X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - - TEST(F32_PRELU__NEON_4X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - - TEST(F32_PRELU__NEON_4X8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - - TEST(F32_PRELU__NEON_4X8, rows_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, rows_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, rows_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_4X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - - TEST(F32_PRELU__NEON_4X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - - TEST(F32_PRELU__NEON_4X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - - TEST(F32_PRELU__NEON_4X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - - TEST(F32_PRELU__NEON_4X16, rows_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, rows_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, rows_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE_2X4, channels_eq_4) { - TEST_REQUIRES_X86_SSE; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - - TEST(F32_PRELU__SSE_2X4, channels_div_4) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - - TEST(F32_PRELU__SSE_2X4, channels_lt_4) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; 
channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - - TEST(F32_PRELU__SSE_2X4, channels_gt_4) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - - TEST(F32_PRELU__SSE_2X4, rows_lt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, rows_div_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, rows_gt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, output_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, inplace) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE_2X8, channels_eq_8) { - TEST_REQUIRES_X86_SSE; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - - TEST(F32_PRELU__SSE_2X8, channels_div_8) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - - TEST(F32_PRELU__SSE_2X8, channels_lt_8) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - - TEST(F32_PRELU__SSE_2X8, channels_gt_8) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - - TEST(F32_PRELU__SSE_2X8, rows_lt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, rows_div_2) { - TEST_REQUIRES_X86_SSE; - for (size_t 
rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, rows_gt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, output_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, inplace) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE2_2X4, channels_eq_4) { - TEST_REQUIRES_X86_SSE2; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - - TEST(F32_PRELU__SSE2_2X4, channels_div_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - - TEST(F32_PRELU__SSE2_2X4, channels_lt_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - - TEST(F32_PRELU__SSE2_2X4, channels_gt_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - - TEST(F32_PRELU__SSE2_2X4, rows_lt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, rows_div_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, rows_gt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - 
.rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, output_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, inplace) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE2_2X8, channels_eq_8) { - TEST_REQUIRES_X86_SSE2; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - - TEST(F32_PRELU__SSE2_2X8, channels_div_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - - TEST(F32_PRELU__SSE2_2X8, channels_lt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - - TEST(F32_PRELU__SSE2_2X8, channels_gt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - - TEST(F32_PRELU__SSE2_2X8, rows_lt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, rows_div_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, rows_gt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, output_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, inplace) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - 
.channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE41_2X4, channels_eq_4) { - TEST_REQUIRES_X86_SSE41; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - - TEST(F32_PRELU__SSE41_2X4, channels_div_4) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - - TEST(F32_PRELU__SSE41_2X4, channels_lt_4) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - - TEST(F32_PRELU__SSE41_2X4, channels_gt_4) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - - TEST(F32_PRELU__SSE41_2X4, rows_lt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, rows_div_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, rows_gt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, output_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, inplace) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE41_2X8, channels_eq_8) { - TEST_REQUIRES_X86_SSE41; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - - TEST(F32_PRELU__SSE41_2X8, channels_div_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - - TEST(F32_PRELU__SSE41_2X8, channels_lt_8) { - 
TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - - TEST(F32_PRELU__SSE41_2X8, channels_gt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - - TEST(F32_PRELU__SSE41_2X8, rows_lt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, rows_div_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, rows_gt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, output_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, inplace) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__AVX_2X8, channels_eq_8) { - TEST_REQUIRES_X86_AVX; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - - TEST(F32_PRELU__AVX_2X8, channels_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - - TEST(F32_PRELU__AVX_2X8, channels_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - - TEST(F32_PRELU__AVX_2X8, channels_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - - TEST(F32_PRELU__AVX_2X8, rows_lt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, rows_div_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, rows_gt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, input_stride) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, output_stride) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, inplace) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__AVX_2X16, channels_eq_16) { - TEST_REQUIRES_X86_AVX; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - - TEST(F32_PRELU__AVX_2X16, channels_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - - TEST(F32_PRELU__AVX_2X16, channels_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - - TEST(F32_PRELU__AVX_2X16, channels_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - - TEST(F32_PRELU__AVX_2X16, rows_lt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, rows_div_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, rows_gt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, input_stride) { - TEST_REQUIRES_X86_AVX; 
- for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, output_stride) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, inplace) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - TEST(F32_PRELU__AVX512F_2X16, channels_eq_16) { - TEST_REQUIRES_X86_AVX512F; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - - TEST(F32_PRELU__AVX512F_2X16, channels_div_16) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - - TEST(F32_PRELU__AVX512F_2X16, channels_lt_16) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - - TEST(F32_PRELU__AVX512F_2X16, channels_gt_16) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - - TEST(F32_PRELU__AVX512F_2X16, rows_lt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, rows_div_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, rows_gt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, input_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, output_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, inplace) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } -#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - - -#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - TEST(F32_PRELU__AVX512F_2X32, channels_eq_32) { - TEST_REQUIRES_X86_AVX512F; - PReLUMicrokernelTester() - .rows(2) - .channels(32) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - - TEST(F32_PRELU__AVX512F_2X32, channels_div_32) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 64; channels < 320; channels += 32) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - - TEST(F32_PRELU__AVX512F_2X32, channels_lt_32) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 1; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - - TEST(F32_PRELU__AVX512F_2X32, channels_gt_32) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 33; channels < 64; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - - TEST(F32_PRELU__AVX512F_2X32, rows_lt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, rows_div_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, rows_gt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, input_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(163) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, output_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(163) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, inplace) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } -#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - 
TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) 
{ - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_div_4) { - for (size_t 
channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, 
rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; 
channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_eq_16) { - 
PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, 
channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, inplace) 
{ - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - 
PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - 
.channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - 
.channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; 
channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() 
- .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - - 
TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) 
- .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - - 
TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, inplace) { - for 
(size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, 
channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_gt_4) { - for (size_t 
channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - 
.rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, rows_gt_2) { - for (size_t rows = 
3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, inplace) { - for (size_t rows = 1; rows <= 12; 
rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASM_2X1, channels_eq_1) { - PReLUMicrokernelTester() - .rows(2) - .channels(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); 
- } - - TEST(F32_PRELU__WASM_2X1, channels_gt_1) { - for (size_t channels = 2; channels < 10; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - - TEST(F32_PRELU__WASM_2X1, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASM_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - - TEST(F32_PRELU__WASM_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - - TEST(F32_PRELU__WASM_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - - TEST(F32_PRELU__WASM_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - - TEST(F32_PRELU__WASM_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(F32_PRELU__SCALAR_2X1, channels_eq_1) { - PReLUMicrokernelTester() - .rows(2) - .channels(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); -} - -TEST(F32_PRELU__SCALAR_2X1, channels_gt_1) { - for (size_t channels = 2; channels < 10; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } -} - -TEST(F32_PRELU__SCALAR_2X1, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); -} - -TEST(F32_PRELU__SCALAR_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__scalar_2x4); - } -} - -TEST(F32_PRELU__SCALAR_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } -} - -TEST(F32_PRELU__SCALAR_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } -} - -TEST(F32_PRELU__SCALAR_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} \ No newline at end of file diff --git a/test/f32-prelu.yaml b/test/f32-prelu.yaml deleted file mode 100644 index fcb18835002..00000000000 --- a/test/f32-prelu.yaml +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2019 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_f32_prelu_ukernel__neon_1x4 -- name: xnn_f32_prelu_ukernel__neon_1x8 -- name: xnn_f32_prelu_ukernel__neon_1x16 -- name: xnn_f32_prelu_ukernel__neon_2x4 -- name: xnn_f32_prelu_ukernel__neon_2x8 -- name: xnn_f32_prelu_ukernel__neon_2x16 -- name: xnn_f32_prelu_ukernel__neon_4x4 -- name: xnn_f32_prelu_ukernel__neon_4x8 -- name: xnn_f32_prelu_ukernel__neon_4x16 -# x86 SSE -- name: xnn_f32_prelu_ukernel__sse_2x4 -- name: xnn_f32_prelu_ukernel__sse_2x8 -- name: xnn_f32_prelu_ukernel__sse2_2x4 -- name: xnn_f32_prelu_ukernel__sse2_2x8 -- name: xnn_f32_prelu_ukernel__sse41_2x4 -- name: xnn_f32_prelu_ukernel__sse41_2x8 -# x86 AVX -- name: xnn_f32_prelu_ukernel__avx_2x8 -- name: xnn_f32_prelu_ukernel__avx_2x16 -# x86 AVX512 -- name: xnn_f32_prelu_ukernel__avx512f_2x16 -- name: xnn_f32_prelu_ukernel__avx512f_2x32 -# WAsm SIMD -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16 -# WAsm Relaxed SIMD -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16 -# WAsm -- name: xnn_f32_prelu_ukernel__wasm_2x1 -- name: xnn_f32_prelu_ukernel__wasm_2x4 -# Scalar -- name: xnn_f32_prelu_ukernel__scalar_2x1 -- name: xnn_f32_prelu_ukernel__scalar_2x4 diff --git a/test/f32-qs8-vcvt.cc b/test/f32-qs8-vcvt.cc index c2e15f43a2d..ca31e0fa0f4 100644 --- a/test/f32-qs8-vcvt.cc +++ b/test/f32-qs8-vcvt.cc @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f32-qs8-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" @@ -25,9 +21,6 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ \ XNN_TEST_CVT_SATURATION(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_QMIN(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_QMAX(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f32-qs8-vcvt/f32-qs8-vcvt.h" +XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#include "f32-qs8-vcvt/f32-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f32-qu8-vcvt.cc b/test/f32-qu8-vcvt.cc index 7c1cc17c11f..0189a95982b 100644 --- a/test/f32-qu8-vcvt.cc +++ b/test/f32-qu8-vcvt.cc @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Microkernel: f32-qu8-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" @@ -25,10 +21,6 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ \ XNN_TEST_CVT_SATURATION(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ - \ -XNN_TEST_CVT_QMIN(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_QMAX(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f32-qu8-vcvt/f32-qu8-vcvt.h" +XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#include "f32-qu8-vcvt/f32-qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f32-vabs.cc b/test/f32-vabs.cc index b5988fd3a29..11b95f45cc7 100644 --- a/test/f32-vabs.cc +++ b/test/f32-vabs.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); -#include "src/f32-vabs/f32-vabs.h" +#include "f32-vabs/f32-vabs.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vadd.cc b/test/f32-vadd.cc index caeabdbe654..84cbb1bab29 100644 --- a/test/f32-vadd.cc +++ b/test/f32-vadd.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f32-vbinary/f32-vadd.h" +#include 
"f32-vbinary/f32-vadd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vaddc.cc b/test/f32-vaddc.cc index b8c8721c6ba..071d06de40e 100644 --- a/test/f32-vaddc.cc +++ b/test/f32-vaddc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f32-vbinary/f32-vaddc.h" +#include "f32-vbinary/f32-vaddc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vclamp.cc b/test/f32-vclamp.cc index 0ad6af1863f..ed0f97a218d 100644 --- a/test/f32-vclamp.cc +++ b/test/f32-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vclamp/f32-vclamp.h" +#include "f32-vclamp/f32-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vcmul.cc b/test/f32-vcmul.cc index f2c41e71f50..dedd5e10d7e 100644 --- a/test/f32-vcmul.cc +++ b/test/f32-vcmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/f32-vbinary/f32-vcmul.h" +#include "f32-vbinary/f32-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vcopysign.cc b/test/f32-vcopysign.cc index 13163219c10..99d5b2d9d63 100644 --- a/test/f32-vcopysign.cc +++ b/test/f32-vcopysign.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); -#include "src/f32-vbinary/f32-vcopysign.h" +#include "f32-vbinary/f32-vcopysign.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vcopysignc.cc b/test/f32-vcopysignc.cc index 0b37182adb8..63ac43a13a8 100644 --- a/test/f32-vcopysignc.cc +++ b/test/f32-vcopysignc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, 
VBinaryMicrokernelTester::OpType::CopySign, init_params); -#include "src/f32-vbinary/f32-vcopysignc.h" +#include "f32-vbinary/f32-vcopysignc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vdiv.cc b/test/f32-vdiv.cc index 79cc695e474..1a3e349df1e 100644 --- a/test/f32-vdiv.cc +++ b/test/f32-vdiv.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f32-vbinary/f32-vdiv.h" +#include "f32-vbinary/f32-vdiv.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vdivc.cc b/test/f32-vdivc.cc index 88d55ff6930..42b20dd5227 100644 --- a/test/f32-vdivc.cc +++ b/test/f32-vdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f32-vbinary/f32-vdivc.h" +#include "f32-vbinary/f32-vdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-velu.cc b/test/f32-velu.cc index 6ecab89b8ea..4db508d03de 100644 --- a/test/f32-velu.cc +++ b/test/f32-velu.cc @@ -76,5 +76,5 @@ TEST(ukernel, beta) { } \ } \ } -#include "src/f32-velu/f32-velu.h" +#include "f32-velu/f32-velu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vexp.cc b/test/f32-vexp.cc index ed50b5efa6e..577a50951a1 100644 --- a/test/f32-vexp.cc +++ b/test/f32-vexp.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vexp/f32-vexp.h" +#include "f32-vexp/f32-vexp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vgelu.cc b/test/f32-vgelu.cc index 30beaa307de..720a986c13f 100644 --- a/test/f32-vgelu.cc +++ b/test/f32-vgelu.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vgelu/f32-vgelu.h" +#include "f32-vgelu/f32-vgelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vhswish.cc b/test/f32-vhswish.cc index fb2892df96b..813441dc197 100644 --- a/test/f32-vhswish.cc +++ b/test/f32-vhswish.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vhswish/f32-vhswish.h" +#include "f32-vhswish/f32-vhswish.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vlog.cc b/test/f32-vlog.cc index 005b6142ff6..917c8d36b2c 100644 --- a/test/f32-vlog.cc +++ b/test/f32-vlog.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vlog/f32-vlog.h" +#include "f32-vlog/f32-vlog.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vlrelu.cc b/test/f32-vlrelu.cc index 16d55758aa2..d7478e85dab 100644 --- a/test/f32-vlrelu.cc +++ b/test/f32-vlrelu.cc @@ -46,5 
+46,5 @@ TEST(ukernel, slope) { } \ } \ } -#include "src/f32-vlrelu/f32-vlrelu.h" +#include "f32-vlrelu/f32-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmax.cc b/test/f32-vmax.cc index c6ab216e1e4..af07b27fcae 100644 --- a/test/f32-vmax.cc +++ b/test/f32-vmax.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f32-vbinary/f32-vmax.h" +#include "f32-vbinary/f32-vmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmaxc.cc b/test/f32-vmaxc.cc index 96a870ff865..6746142ae80 100644 --- a/test/f32-vmaxc.cc +++ b/test/f32-vmaxc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f32-vbinary/f32-vmaxc.h" +#include "f32-vbinary/f32-vmaxc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmin.cc b/test/f32-vmin.cc index 01c5b1c2f94..10e8b311c36 100644 --- a/test/f32-vmin.cc +++ b/test/f32-vmin.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f32-vbinary/f32-vmin.h" +#include "f32-vbinary/f32-vmin.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vminc.cc b/test/f32-vminc.cc index 6e5e1ad3d2a..913c8092976 100644 --- a/test/f32-vminc.cc +++ b/test/f32-vminc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f32-vbinary/f32-vminc.h" +#include "f32-vbinary/f32-vminc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmul.cc b/test/f32-vmul.cc index 93bf736e41b..097ab533d14 100644 --- a/test/f32-vmul.cc +++ b/test/f32-vmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, 
init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f32-vbinary/f32-vmul.h" +#include "f32-vbinary/f32-vmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmulc.cc b/test/f32-vmulc.cc index eb58031cb52..ecd2f6c285c 100644 --- a/test/f32-vmulc.cc +++ b/test/f32-vmulc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f32-vbinary/f32-vmulc.h" +#include "f32-vbinary/f32-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vneg.cc b/test/f32-vneg.cc index c6dbc078fb1..bbbe8e0660a 100644 --- a/test/f32-vneg.cc +++ b/test/f32-vneg.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); -#include "src/f32-vneg/f32-vneg.h" +#include "f32-vneg/f32-vneg.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vprelu.cc b/test/f32-vprelu.cc index bbd587e18c0..291bc181977 100644 --- a/test/f32-vprelu.cc +++ b/test/f32-vprelu.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f32-vbinary/f32-vprelu.h" +#include "f32-vbinary/f32-vprelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vpreluc.cc b/test/f32-vpreluc.cc index 31725565dd5..b0290e8deae 100644 --- a/test/f32-vpreluc.cc +++ b/test/f32-vpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f32-vbinary/f32-vpreluc.h" +#include "f32-vbinary/f32-vpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrcopysignc.cc b/test/f32-vrcopysignc.cc index bde00d406af..fd8ed7f769f 100644 --- a/test/f32-vrcopysignc.cc +++ b/test/f32-vrcopysignc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, 
arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RCopySign, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RCopySign, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RCopySign, init_params); -#include "src/f32-vbinary/f32-vrcopysignc.h" +#include "f32-vbinary/f32-vrcopysignc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrdivc.cc b/test/f32-vrdivc.cc index 510b7f52901..31185edf74c 100644 --- a/test/f32-vrdivc.cc +++ b/test/f32-vrdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); -#include "src/f32-vbinary/f32-vrdivc.h" +#include "f32-vbinary/f32-vrdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrelu.cc b/test/f32-vrelu.cc index 8732db4b431..0d6d7c4e338 100644 --- a/test/f32-vrelu.cc +++ b/test/f32-vrelu.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vrelu/f32-vrelu.h" +#include "f32-vrelu/f32-vrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndd.cc b/test/f32-vrndd.cc index 64f9c1a910a..cc824ea91d9 100644 --- a/test/f32-vrndd.cc +++ b/test/f32-vrndd.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); -#include "src/f32-vrnd/f32-vrndd.h" +#include "f32-vrnd/f32-vrndd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndne.cc b/test/f32-vrndne.cc index e4739467ad2..c1106131b6b 100644 --- a/test/f32-vrndne.cc +++ b/test/f32-vrndne.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); -#include "src/f32-vrnd/f32-vrndne.h" +#include "f32-vrnd/f32-vrndne.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndu.cc b/test/f32-vrndu.cc index b0d313b4dcf..247b2cb1ced 100644 --- a/test/f32-vrndu.cc +++ b/test/f32-vrndu.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, 
init_params); -#include "src/f32-vrnd/f32-vrndu.h" +#include "f32-vrnd/f32-vrndu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndz.cc b/test/f32-vrndz.cc index 7efd335417e..0808e048173 100644 --- a/test/f32-vrndz.cc +++ b/test/f32-vrndz.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); -#include "src/f32-vrnd/f32-vrndz.h" +#include "f32-vrnd/f32-vrndz.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrpreluc.cc b/test/f32-vrpreluc.cc index 8ea52b7789c..df02958d935 100644 --- a/test/f32-vrpreluc.cc +++ b/test/f32-vrpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); -#include "src/f32-vbinary/f32-vrpreluc.h" +#include "f32-vbinary/f32-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrsqrt.cc b/test/f32-vrsqrt.cc index 8050d56e362..06b1eaccd64 100644 --- a/test/f32-vrsqrt.cc +++ b/test/f32-vrsqrt.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vrsqrt/f32-vrsqrt.h" +#include "f32-vrsqrt/f32-vrsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrsubc.cc b/test/f32-vrsubc.cc index 14c71fac4fb..de2f4321b9c 100644 --- a/test/f32-vrsubc.cc +++ b/test/f32-vrsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); -#include "src/f32-vbinary/f32-vrsubc.h" +#include "f32-vbinary/f32-vrsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsigmoid.cc b/test/f32-vsigmoid.cc index 0988fc19978..e60273dc896 100644 --- a/test/f32-vsigmoid.cc +++ b/test/f32-vsigmoid.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vsigmoid/f32-vsigmoid.h" +#include "f32-vsigmoid/f32-vsigmoid.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqr.cc b/test/f32-vsqr.cc index e5965be3420..a2f4987c3bc 100644 --- a/test/f32-vsqr.cc +++ b/test/f32-vsqr.cc @@ -32,5 +32,5 @@ 
XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); -#include "src/f32-vsqr/f32-vsqr.h" +#include "f32-vsqr/f32-vsqr.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqrdiff.cc b/test/f32-vsqrdiff.cc index 8dfdac139ff..365838c4438 100644 --- a/test/f32-vsqrdiff.cc +++ b/test/f32-vsqrdiff.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include "src/f32-vbinary/f32-vsqrdiff.h" +#include "f32-vbinary/f32-vsqrdiff.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqrdiffc.cc b/test/f32-vsqrdiffc.cc index 8434a403dc8..3d0276a901a 100644 --- a/test/f32-vsqrdiffc.cc +++ b/test/f32-vsqrdiffc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include "src/f32-vbinary/f32-vsqrdiffc.h" +#include "f32-vbinary/f32-vsqrdiffc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqrt.cc b/test/f32-vsqrt.cc index 829dc4b466c..ef629a2987f 100644 --- a/test/f32-vsqrt.cc +++ b/test/f32-vsqrt.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vsqrt/f32-vsqrt.h" +#include "f32-vsqrt/f32-vsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsub.cc b/test/f32-vsub.cc index 03c9f5cc02c..15c86a300c7 100644 --- a/test/f32-vsub.cc +++ b/test/f32-vsub.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f32-vbinary/f32-vsub.h" +#include "f32-vbinary/f32-vsub.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsubc.cc b/test/f32-vsubc.cc index 69cfe91caa0..eac7baa565f 100644 --- a/test/f32-vsubc.cc +++ b/test/f32-vsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ 
XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f32-vbinary/f32-vsubc.h" +#include "f32-vbinary/f32-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vtanh.cc b/test/f32-vtanh.cc index 591612e13d8..5d2c6723393 100644 --- a/test/f32-vtanh.cc +++ b/test/f32-vtanh.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vtanh/f32-vtanh.h" +#include "f32-vtanh/f32-vtanh.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/floor.cc b/test/floor.cc index d3412cc3148..bea533cabaf 100644 --- a/test/floor.cc +++ b/test/floor.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/fully-connected.cc b/test/fully-connected.cc index 3bc434e827e..f5948f2c89f 100644 --- a/test/fully-connected.cc +++ b/test/fully-connected.cc @@ -519,6 +519,7 @@ TEST_F(FullyConnectedTestQP8F32QC4W, matches_operator_api_with_reshape) { // unwritten portions of these buffers are matching. std::fill(convert_input.begin(), convert_input.end(), 0.0f); std::fill(subgraph_output.begin(), subgraph_output.end(), 0.0f); + std::fill(operator_output.begin(), operator_output.end(), 0.0f); // Adjust number of kernel elements for QC4W. input_channels should be padded // to byte boundary, hence even. diff --git a/test/gavgpool-cw-microkernel-tester.h b/test/gavgpool-cw-microkernel-tester.h index fae2cb8e9d2..d9450a0f671 100644 --- a/test/gavgpool-cw-microkernel-tester.h +++ b/test/gavgpool-cw-microkernel-tester.h @@ -17,6 +17,7 @@ #include #include "xnnpack.h" #include "xnnpack/fp16.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "replicable_random_device.h" diff --git a/test/gavgpool-microkernel-tester.h b/test/gavgpool-microkernel-tester.h index b0b1d5f0d94..92a3ba72aef 100644 --- a/test/gavgpool-microkernel-tester.h +++ b/test/gavgpool-microkernel-tester.h @@ -21,6 +21,7 @@ #include #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" diff --git a/test/gemm-microkernel-tester.cc b/test/gemm-microkernel-tester.cc index cb92bf8bc3b..2f2aebe6409 100644 --- a/test/gemm-microkernel-tester.cc +++ b/test/gemm-microkernel-tester.cc @@ -1734,7 +1734,7 @@ void GemmMicrokernelTester::Test( input_qp8.data()); std::generate(b.begin(), b.end(), std::ref(w8rng)); - // std::generate(bias.begin(), bias.end(), std::ref(f32rng)); + std::generate(bias.begin(), bias.end(), std::ref(f32rng)); std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng)); std::fill(packed_w.begin(), packed_w.end(), 0); @@ -1747,8 +1747,8 @@ void GemmMicrokernelTester::Test( /*accumulator_init=*/nullptr, /*weights=*/b.data(), /*int_extra_data0_fn=*/nullptr, - /*extra_data0=*/nullptr, - /*extra_data0_size=*/0, + /*extra_data0=*/bias.data(), + /*extra_data0_size=*/sizeof(float), /*init_extra_data1_fn=*/ nullptr, /*extra_data1=*/kernel_scale.data(), diff --git a/test/global-average-pooling-1d.cc b/test/global-average-pooling-1d.cc index 4a45f86851d..ec5ab308cb7 100644 --- a/test/global-average-pooling-1d.cc +++ b/test/global-average-pooling-1d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include 
"xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" diff --git a/test/global-average-pooling-2d.cc b/test/global-average-pooling-2d.cc index 786fcf47a33..c489b6f9a4f 100644 --- a/test/global-average-pooling-2d.cc +++ b/test/global-average-pooling-2d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" diff --git a/test/global-sum-pooling-1d.cc b/test/global-sum-pooling-1d.cc index d3bd490ac53..bd7c813c762 100644 --- a/test/global-sum-pooling-1d.cc +++ b/test/global-sum-pooling-1d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/global-sum-pooling-2d.cc b/test/global-sum-pooling-2d.cc index e3c6fec6943..d23a0eff24d 100644 --- a/test/global-sum-pooling-2d.cc +++ b/test/global-sum-pooling-2d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/hardswish.cc b/test/hardswish.cc index df394a3d0d1..d026b2d7fa8 100644 --- a/test/hardswish.cc +++ b/test/hardswish.cc @@ -14,6 +14,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/leaky-relu.cc b/test/leaky-relu.cc index 5907f5f2a91..9ebdc0de3e9 100644 --- a/test/leaky-relu.cc +++ b/test/leaky-relu.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/max-pooling-2d.cc b/test/max-pooling-2d.cc index b6e4b8ecaa8..c7ed6cd91c3 100644 --- a/test/max-pooling-2d.cc +++ b/test/max-pooling-2d.cc @@ -15,12 +15,13 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class MaxPooling2DTestBase : public ::testing::Test { diff --git a/test/maxpool-microkernel-tester.h b/test/maxpool-microkernel-tester.h index 755ace470a0..f8447292bd0 100644 --- a/test/maxpool-microkernel-tester.h +++ b/test/maxpool-microkernel-tester.h @@ -21,10 +21,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "next_prime.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class MaxPoolMicrokernelTester { diff --git a/test/maxpool-minmax.cc b/test/maxpool-minmax.cc index e7c92e96e0c..d5cc5669871 100644 --- a/test/maxpool-minmax.cc +++ b/test/maxpool-minmax.cc @@ -39,10 +39,10 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, MaxPoolMicrokernelTester::Kernel{ukernel, init_params}, arch_flags, channel_tile, channel_scaled_tile, primary_tile, incremental_tile, qmin, qmax }, const XnnTestParam xnn_test_params[] = { -#include "src/f16-maxpool/f16-maxpool-minmax.h" -#include "src/f32-maxpool/f32-maxpool-minmax.h" -#include 
"src/s8-maxpool/s8-maxpool-minmax.h" -#include "src/u8-maxpool/u8-maxpool-minmax.h" +#include "f16-maxpool/f16-maxpool-minmax.h" +#include "f32-maxpool/f32-maxpool-minmax.h" +#include "s8-maxpool/s8-maxpool-minmax.h" +#include "u8-maxpool/u8-maxpool-minmax.h" }; #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/negate.cc b/test/negate.cc index f0aa5af3c1a..f31ff9ac25e 100644 --- a/test/negate.cc +++ b/test/negate.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/operator-size.c b/test/operator-size.c deleted file mode 100644 index 7c4c490489b..00000000000 --- a/test/operator-size.c +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack.h" - - -// A dummy program that calls every Operators API function in XNNPACK, for size estimation. -int main(int argc, char** argv) { - int function_idx = 0; - if (argc >= 2) { - function_idx = atoi(argv[1]); - } - - xnn_initialize(NULL /* allocator */); - - xnn_operator_t op = NULL; - switch (function_idx) { - case -1: - xnn_delete_operator(op); - break; - case 0: - xnn_run_operator(op, NULL); - break; - case 1: - xnn_create_binary_elementwise_nd( - xnn_binary_add, xnn_datatype_fp32, NULL, NULL, NULL, 0, &op); - break; - case 2: - xnn_setup_binary_elementwise_nd( - op, NULL, NULL, NULL); - break; - case 3: - xnn_create_argmax_pooling2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, &op); - break; - case 4: - xnn_setup_argmax_pooling2d_nhwc_f32( - op, NULL, NULL, NULL, NULL); - break; - case 5: - xnn_create_average_pooling2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, 0, - 0.0f, 0.0f, - 0, &op); - break; - case 6: - xnn_setup_average_pooling2d_nhwc_f32( - op, NULL, NULL, NULL); - break; - case 7: - xnn_create_clamp_nc_f32( - 0.0f, 0.0f, - 0, &op); - xnn_reshape_clamp_nc_f32( - op, 0, 0, 0, 0, NULL); - break; - case 8: - xnn_setup_clamp_nc_f32( - op, NULL, NULL); - break; - case 9: - xnn_create_convolution2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, 0, - 0, 0, - 0, 0, 0, 0, 0, - NULL, NULL, - 0.0f, 0.0f, - 0, NULL, NULL, &op); - break; - case 10: - xnn_setup_convolution2d_nhwc_f32( - op, NULL, NULL, NULL); - break; - case 11: - xnn_create_deconvolution2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, 0, - 0, 0, - 0, 0, 0, 0, 0, - NULL, NULL, - 0.0f, 0.0f, - 0, NULL, NULL, &op); - break; - case 12: - xnn_setup_deconvolution2d_nhwc_f32( - op, NULL, NULL); - break; - case 15: - xnn_create_fully_connected_nc_f32( - 0, 0, 0, 0, - NULL, NULL, - 0.0f, 0.0f, - 0, NULL, NULL, &op); - break; - case 16: - xnn_setup_fully_connected_nc_f32( - op, NULL, NULL); - break; - case 17: - xnn_create_global_average_pooling_nwc_f32( - 0.0f, 0.0f, - 0, &op); - break; - case 18: - xnn_setup_global_average_pooling_nwc_f32( - op, NULL, NULL, NULL); - break; - case 19: - xnn_create_hardswish_nc_f32( - 0, &op); - xnn_reshape_hardswish_nc_f32( - op, 0, 0, 0, 0, NULL); - break; - case 20: - xnn_setup_hardswish_nc_f32( - op, NULL, NULL); - break; - case 21: - xnn_create_max_pooling2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, 0, - 0, 0, 0, - 0.0f, 0.0f, - 0, &op); - break; - case 22: - xnn_setup_max_pooling2d_nhwc_f32( - op, NULL, NULL); - break; - case 29: - xnn_create_prelu_nc_f32( - 0, 0, 0, 0, - NULL, 0, NULL, NULL, &op); - break; - case 30: - xnn_setup_prelu_nc_f32( - op, - NULL, NULL); - break; - case 31: - 
xnn_create_resize_bilinear2d_nhwc_f32( - 0, 0, 0, &op); - break; - case 32: - xnn_setup_resize_bilinear2d_nhwc_f32( - op, NULL, NULL, NULL); - break; - case 33: - xnn_create_sigmoid_nc_f32( - 0, &op); - xnn_reshape_sigmoid_nc_f32( - op, 0, 0, 0, 0, NULL); - break; - case 34: - xnn_setup_sigmoid_nc_f32( - op, NULL, NULL); - break; - case 35: - xnn_create_softmax_nc_f32( - 0, &op); - break; - case 36: - xnn_setup_softmax_nc_f32( - op, NULL, NULL); - break; - case 39: - xnn_create_channel_shuffle_nc_x32( - 0, 0, 0, 0, - 0, &op); - break; - case 40: - xnn_setup_channel_shuffle_nc_x32( - op, NULL, NULL); - break; - case 41: - xnn_create_unpooling2d_nhwc_x32( - 0, 0, 0, 0, - 0, 0, - 0, 0, 0, - 0, &op); - break; - case 42: - xnn_setup_unpooling2d_nhwc_x32( - op, NULL, NULL, NULL); - break; - } - - xnn_deinitialize(); -} diff --git a/test/prelu-microkernel-tester.h b/test/prelu-microkernel-tester.h deleted file mode 100644 index 30edcda22ac..00000000000 --- a/test/prelu-microkernel-tester.h +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -class PReLUMicrokernelTester { - public: - PReLUMicrokernelTester& rows(size_t rows) { - assert(rows != 0); - this->rows_ = rows; - return *this; - } - - size_t rows() const { - return this->rows_; - } - - PReLUMicrokernelTester& channels(size_t channels) { - assert(channels != 0); - this->channels_ = channels; - return *this; - } - - size_t channels() const { - return this->channels_; - } - - PReLUMicrokernelTester& input_stride(size_t input_stride) { - assert(input_stride != 0); - this->input_stride_ = input_stride; - return *this; - } - - size_t input_stride() const { - if (this->input_stride_ == 0) { - return channels(); - } else { - assert(this->input_stride_ >= channels()); - return this->input_stride_; - } - } - - PReLUMicrokernelTester& output_stride(size_t output_stride) { - assert(output_stride != 0); - this->output_stride_ = output_stride; - return *this; - } - - size_t output_stride() const { - if (this->output_stride_ == 0) { - return channels(); - } else { - assert(this->output_stride_ >= channels()); - return this->output_stride_; - } - } - - PReLUMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { - return this->inplace_; - } - - PReLUMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_f16_prelu_ukernel_fn prelu) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::uniform_real_distribution w32dist(0.25f, 0.75f); - - xnnpack::Buffer x(channels() + (rows() - 1) * input_stride() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer w( - channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer y(channels() + (rows() - 1) * output_stride() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer y_ref(channels() * rows()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); }); - std::generate(w.begin(), w.end(), 
[&]() { return w32dist(rng); }); - if (inplace()) { - std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); }); - } - const xnn_float16* x_data = inplace() ? y.data() : x.data(); - - // Compute reference results, without clamping. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - const float x_value = x_data[n * input_stride() + c]; - y_ref[n * channels() + c] = std::signbit(x_value) ? - float(xnn_float16(x_value * w[c])) : x_value; // What is going on here? - } - } - - // Call optimized micro-kernel. - prelu(rows(), channels() * sizeof(xnn_float16), - x_data, input_stride() * sizeof(xnn_float16), - w.data(), - y.data(), output_stride() * sizeof(xnn_float16)); - - // Verify results. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(y[n * output_stride() + c], y_ref[n * channels() + c]) - << "at row " << n << " / " << rows() - << ", channel " << c << " / " << channels(); - } - } - } - } - - void Test(xnn_f32_prelu_ukernel_fn prelu) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::uniform_real_distribution w32dist(0.25f, 0.75f); - - xnnpack::Buffer x(channels() + (rows() - 1) * input_stride() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer w(channels() + - XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer y(channels() + (rows() - 1) * output_stride() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer y_ref(channels() * rows()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); }); - std::generate(w.begin(), w.end(), [&]() { return w32dist(rng); }); - if (inplace()) { - std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); }); - } - const float* x_data = inplace() ? y.data() : x.data(); - - // Compute reference results, without clamping. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - const float x_value = x_data[n * input_stride() + c]; - y_ref[n * channels() + c] = std::signbit(x_value) ? x_value * w[c] : x_value; - } - } - - // Call optimized micro-kernel. - prelu(rows(), channels() * sizeof(float), - x_data, input_stride() * sizeof(float), - w.data(), - y.data(), output_stride() * sizeof(float)); - - // Verify results. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(y[n * output_stride() + c], y_ref[n * channels() + c]) - << "at row " << n << " / " << rows() - << ", channel " << c << " / " << channels(); - } - } - } - } - - private: - size_t rows_{1}; - size_t channels_{1}; - size_t input_stride_{0}; - size_t output_stride_{0}; - bool inplace_{false}; - size_t iterations_{15}; -}; diff --git a/test/prelu-nc.cc b/test/prelu-nc.cc deleted file mode 100644 index 32068f52d27..00000000000 --- a/test/prelu-nc.cc +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include -#include "xnnpack/config.h" -#include "prelu-operator-tester.h" - -#ifndef XNN_EXCLUDE_F16_TESTS -TEST(PRELU_NC_F16, unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. 
- } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_broadcasted_slope) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .slope_channels(1) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .y_stride(347) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. 
- } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .y_stride(347) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, fp32_weights) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .weights_type(PReLUOperatorTester::WeightsType::FP32) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, weights_cache_unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .use_weights_cache(true) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, weights_cache_fp32_weights) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. 
- } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(345) - .y_stride(347) - .weights_type(PReLUOperatorTester::WeightsType::FP32) - .use_weights_cache(true) - .iterations(1) - .TestF16(); - } -} -#endif // XNN_EXCLUDE_F16_TESTS - - -TEST(PRELU_NC_F32, unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_broadcasted_slope) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .slope_channels(1) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .y_stride(347) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - 
.input_channels(input_channels) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .y_stride(347) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, weights_cache_unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .use_weights_cache(true) - .iterations(3) - .TestF32(); - } -} diff --git a/test/prelu-operator-tester.h b/test/prelu-operator-tester.h deleted file mode 100644 index cc0dd7311d0..00000000000 --- a/test/prelu-operator-tester.h +++ /dev/null @@ -1,394 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/cache.h" -#include "xnnpack/math.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -class PReLUOperatorTester { - public: - enum class WeightsType { - Default, - FP32, - }; - - PReLUOperatorTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - this->batch_size_ = batch_size; - return *this; - } - - size_t batch_size() const { - return this->batch_size_; - } - - PReLUOperatorTester& input_channels(size_t input_channels) { - assert(input_channels != 0); - this->input_channels_ = input_channels; - return *this; - } - - size_t input_channels() const { - return this->input_channels_; - } - - PReLUOperatorTester& slope_channels(size_t slope_channels) { - assert(slope_channels != 0); - this->slope_channels_ = slope_channels; - return *this; - } - - size_t slope_channels() const { - if (this->slope_channels_ == 0) { - return this->input_channels_; - } else { - return this->slope_channels_; - } - } - - PReLUOperatorTester& x_stride(size_t x_stride) { - assert(x_stride != 0); - this->x_stride_ = x_stride; - return *this; - } - - size_t x_stride() const { - if (this->x_stride_ == 0) { - return this->input_channels_; - } else { - assert(this->x_stride_ >= this->input_channels_); - return this->x_stride_; - } - } - - PReLUOperatorTester& y_stride(size_t y_stride) { - assert(y_stride != 0); - this->y_stride_ = y_stride; - return *this; - } - - size_t y_stride() const { - if (this->y_stride_ == 0) { - return this->input_channels_; - } else { - assert(this->y_stride_ >= this->input_channels_); - return this->y_stride_; - } - } - - PReLUOperatorTester& weights_type(WeightsType weights_type) { - this->weights_type_ = weights_type; - return *this; - } - - WeightsType weights_type() const { - return this->weights_type_; - } - - PReLUOperatorTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - PReLUOperatorTester& use_weights_cache(bool use_weights_cache) { - this->use_weights_cache_ = use_weights_cache; - return *this; - } - - bool use_weights_cache() const { - return this->use_weights_cache_; - } - - void TestF16() const { - switch (weights_type()) { - case WeightsType::Default: - break; - case WeightsType::FP32: - break; - default: - GTEST_FAIL() << "unexpected weights type"; - } - - xnnpack::ReplicableRandomDevice rng; - auto f32irng = std::uniform_real_distribution(-1.0f, 1.0f); - auto f32wrng = std::uniform_real_distribution(0.25f, 0.75f); - - xnnpack::Buffer x((batch_size() - 1) * x_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer w(input_channels()); - xnnpack::Buffer w_as_float(input_channels()); - xnnpack::Buffer y((batch_size() - 1) * y_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer y_ref(batch_size() * input_channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&] { return f32irng(rng); }); - if (slope_channels() == 1) { - std::fill(w.begin(), w.end(), f32wrng(rng)); - } else { - std::generate(w.begin(), w.end(), [&] { return f32wrng(rng); }); - } - std::copy(w.cbegin(), w.cend(), w_as_float.begin()); - - // Compute reference results, without clamping. 
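// (Editorial note, not part of the original file: the loop that follows encodes the
//  standard PReLU definition, y[i][c] = x >= 0 ? x : x * slope[c], evaluated per
//  channel without any clamping. Using std::signbit rather than `x < 0` routes
//  -0.0f through the slope branch as well; the product is still a zero, so the
//  verified results are unaffected.)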
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - const float x_value = x[i * x_stride() + c]; - const float w_value = w_as_float[c]; - y_ref[i * input_channels() + c] = std::signbit(x_value) ? x_value * w_value : x_value; - } - } - - // Create, setup, run, and destroy PReLU operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t prelu_op = nullptr; - - struct xnn_internal_weights_cache* internal_weights_cache = nullptr; - std::unique_ptr auto_weights_cache( - nullptr, xnn_delete_weights_cache); - if (use_weights_cache()) { - xnn_weights_cache_t weights_cache = nullptr; - xnn_create_weights_cache(&weights_cache); - auto_weights_cache.reset(weights_cache); - if (weights_cache) { - internal_weights_cache = (struct xnn_internal_weights_cache*) weights_cache->context; - } - } - - const void* negative_slope_data = w.data(); - if (weights_type() == WeightsType::FP32) { - negative_slope_data = w_as_float.data(); - } - uint32_t flags = 0; - if (weights_type() == WeightsType::FP32) { - flags |= XNN_FLAG_FP32_STATIC_WEIGHTS; - } - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f16( - input_channels(), slope_channels(), x_stride(), y_stride(), - negative_slope_data, - flags, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op)); - ASSERT_NE(nullptr, prelu_op); - if (use_weights_cache()) { - ASSERT_EQ(xnn_status_success, - xnn_finalize_weights_cache(auto_weights_cache.get(), xnn_weights_cache_finalization_kind_soft)); - } - - // Smart pointer to automatically delete prelu_op. - std::unique_ptr auto_prelu_op(prelu_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f16( - prelu_op, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f16( - prelu_op, - x.data(), y.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op, /*threadpool=*/nullptr)); - - VerifyF16(y, y_ref); - - if (use_weights_cache()) { - xnn_operator_t prelu_op2 = nullptr; - const size_t old_weights_cache_size = internal_weights_cache->cache.weights.size; - - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f16( - input_channels(), slope_channels(), x_stride(), y_stride(), - negative_slope_data, - flags, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op2)); - ASSERT_NE(nullptr, prelu_op2); - - // Smart pointer to automatically delete prelu_op2. 
- std::unique_ptr auto_prelu_op(prelu_op2, xnn_delete_operator); - - xnnpack::Buffer y2(y.size(), std::nanf("")); - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f16( - prelu_op2, - batch_size(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f16( - prelu_op2, - x.data(), y2.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op2, /*threadpool=*/nullptr)); - - VerifyF16(y2, y_ref); - VerifyWeightsCache(*internal_weights_cache, old_weights_cache_size); - } - } - } - - void VerifyF16(const xnnpack::Buffer& y, const xnnpack::Buffer& y_ref) const { - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - ASSERT_NEAR( - y[i * y_stride() + c], - y_ref[i * input_channels() + c], - std::max(1.0e-4f, std::abs(y_ref[i * input_channels() + c]) * 1.0e-3f)) - << "at position " << i << " / " << batch_size() << ", channel " << c << " / " << input_channels(); - } - } - } - - void TestF32() const { - ASSERT_EQ(weights_type(), WeightsType::Default); - - xnnpack::ReplicableRandomDevice rng; - auto f32irng = std::uniform_real_distribution(-1.0f, 1.0f); - auto f32wrng = std::uniform_real_distribution(0.25f, 0.75f); - - xnnpack::Buffer x((batch_size() - 1) * x_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer w(input_channels()); - xnnpack::Buffer y((batch_size() - 1) * y_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer y_ref(batch_size() * input_channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&] { return f32irng(rng);} ); - if (slope_channels() == 1) { - std::fill(w.begin(), w.end(), f32wrng(rng)); - } else { - std::generate(w.begin(), w.end(), [&] { return f32wrng(rng);} ); - } - - // Compute reference results, without clamping. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - y_ref[i * input_channels() + c] = std::signbit(x[i * x_stride() + c]) ? x[i * x_stride() + c] * w[c] : x[i * x_stride() + c]; - } - } - - // Create, setup, run, and destroy PReLU operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t prelu_op = nullptr; - - struct xnn_internal_weights_cache* internal_weights_cache = nullptr; - std::unique_ptr auto_weights_cache( - nullptr, xnn_delete_weights_cache); - if (use_weights_cache()) { - xnn_weights_cache_t weights_cache = nullptr; - xnn_create_weights_cache(&weights_cache); - auto_weights_cache.reset(weights_cache); - if (weights_cache) { - internal_weights_cache = (struct xnn_internal_weights_cache*) weights_cache->context; - } - } - - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f32( - input_channels(), slope_channels(), x_stride(), y_stride(), - w.data(), - 0, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op)); - ASSERT_NE(nullptr, prelu_op); - if (use_weights_cache()) { - ASSERT_EQ(xnn_status_success, - xnn_finalize_weights_cache(auto_weights_cache.get(), xnn_weights_cache_finalization_kind_soft)); - } - - // Smart pointer to automatically delete prelu_op. 
- std::unique_ptr auto_prelu_op(prelu_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f32( - prelu_op, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f32( - prelu_op, - x.data(), y.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op, /*threadpool=*/nullptr)); - - VerifyF32(y, y_ref); - - if (use_weights_cache()) { - xnn_operator_t prelu_op2 = nullptr; - const size_t old_weights_cache_size = internal_weights_cache->cache.weights.size; - - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f32( - input_channels(), slope_channels(), x_stride(), y_stride(), - w.data(), - 0, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op2)); - ASSERT_NE(nullptr, prelu_op2); - - // Smart pointer to automatically delete prelu_op2. - std::unique_ptr auto_prelu_op(prelu_op2, xnn_delete_operator); - xnnpack::Buffer y2(y.size(), nanf("")); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f32( - prelu_op2, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f32( - prelu_op2, - x.data(), y2.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op2, /*threadpool=*/nullptr)); - - VerifyF32(y, y_ref); - VerifyWeightsCache(*internal_weights_cache, old_weights_cache_size); - } - } - } - - void VerifyF32(const xnnpack::Buffer& y, const xnnpack::Buffer& y_ref) const { - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - ASSERT_NEAR( - y[i * y_stride() + c], - y_ref[i * input_channels() + c], - std::max(1.0e-6f, std::abs(y_ref[i * input_channels() + c]) * 1.0e-6f)) - << "at position " << i << " / " << batch_size() << ", channel " << c << " / " << input_channels(); - } - } - } - - void VerifyWeightsCache(const xnn_internal_weights_cache& weights_cache, size_t old_size) const { - ASSERT_EQ(weights_cache.cache.hits, 1); - // Ensure that we did not write more weights to the cache because it was a cache hit. - ASSERT_EQ(old_size, weights_cache.cache.weights.size); - }; - - private: - size_t batch_size_{1}; - size_t input_channels_{1}; - size_t slope_channels_{0}; - size_t x_stride_{0}; - size_t y_stride_{0}; - WeightsType weights_type_{WeightsType::Default}; - bool use_weights_cache_{false}; - size_t iterations_{15}; -}; diff --git a/test/prelu.cc b/test/prelu.cc deleted file mode 100644 index 8571c89f7f6..00000000000 --- a/test/prelu.cc +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -template < - typename InputType, - typename WeightType = InputType, - typename OutputType = InputType> -class PreluTest : public ::testing::Test { - protected: - void SetUp() override { - dim_dist = std::uniform_int_distribution(1, 9); - input_dims = RandomShape(4); - output_dims = input_dims; - batch_size = input_dims[0] * input_dims[1] * input_dims[2]; - input_channels = input_dims[3]; - slope_channels = input_dims[3]; - // Randomly broadcast slope. 
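// (Editorial note, not part of the original file: with roughly a 2-in-9 chance the
//  check below collapses the slope tensor to a single channel, exercising the
//  broadcast case in which one slope value is applied across every input channel.)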
- if (dim_dist(rng) < 3) { - slope_channels = 1; - } - slope_dims = {slope_channels}; - input = xnnpack::Buffer(XNN_EXTRA_BYTES / sizeof(InputType) + NumElements(input_dims)); - slope = xnnpack::Buffer(slope_channels); - operator_output = xnnpack::Buffer(NumElements(output_dims)); - subgraph_output = xnnpack::Buffer(operator_output.size()); - } - - std::vector RandomShape(size_t num_dims) - { - std::vector dims(num_dims); - std::generate(dims.begin(), dims.end(), [&] { return dim_dist(rng); }); - return dims; - } - - size_t NumElements(std::vector& dims) - { - return std::accumulate(dims.begin(), dims.end(), size_t(1), std::multiplies()); - } - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution dim_dist; - - std::vector output_dims; - std::vector input_dims; - std::vector slope_dims; - xnnpack::Buffer input; - xnnpack::Buffer slope; - xnnpack::Buffer operator_output; - xnnpack::Buffer subgraph_output; - size_t input_channels; - size_t slope_channels; - size_t batch_size; -}; - -using PreluTestF16 = PreluTest; -using PreluTestF32 = PreluTest; - -TEST_F(PreluTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input_dims.size(), input_dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), 1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input_dims.size(), input_dims.data(), nullptr, 2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_prelu); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->inputs[1], slope_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(PreluTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), 1, - 
/*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, 2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_prelu); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->inputs[1], slope_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(PreluTestF16, matches_operator_api) -{ - std::uniform_real_distribution f32idist(-1.0f, 1.0f); - std::uniform_real_distribution f32wdist(0.25f, 0.75f); - std::generate(input.begin(), input.end(), [&]() { return f32idist(rng); }); - std::generate(slope.begin(), slope.end(), [&]() { return f32wdist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_prelu_nc_f16(input_channels, slope_channels, input_channels, input_channels, slope.data(), XNN_FLAG_FP32_STATIC_WEIGHTS, nullptr, nullptr, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, - xnn_reshape_prelu_nc_f16(op, batch_size, /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, - xnn_setup_prelu_nc_f16(op, input.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
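// (Editorial note, not part of the original file: the remainder of this test builds
//  the same PReLU computation as a one-node subgraph, runs it through the runtime
//  API, and requires the subgraph output buffer to match the operator-API output
//  element for element.)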
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input_dims.size(), input_dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), /*external_id=*/1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(PreluTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32idist(-1.0f, 1.0f); - std::uniform_real_distribution f32wdist(0.25f, 0.75f); - std::generate(input.begin(), input.end(), [&]() { return f32idist(rng); }); - std::generate(slope.begin(), slope.end(), [&]() { return f32wdist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_prelu_nc_f32(input_channels, slope_channels, input_channels, input_channels, slope.data(), /*flags=*/0, nullptr, nullptr, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, - xnn_reshape_prelu_nc_f32(op, batch_size, /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, - xnn_setup_prelu_nc_f32(op, input.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), /*external_id=*/1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(PreluTestF32, reshape_output) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call subgraph API. 
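// (Editorial note, not part of the original file: this test reshapes the external
//  input after the runtime has been set up. Growing the input dimensions is
//  expected to make the node's reshape hook report xnn_status_reallocation_required,
//  while shrinking one dimension again relative to the last reshape succeeds; in
//  both cases the output shape must track the new input dimensions.)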
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), /*external_id=*/1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - input_dims[0] += 2; - input_dims[1] += 2; - input_dims[2] += 2; - - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_dims.size(), input_dims.data())); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_reallocation_required); - const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; - for (size_t i = 0; i < input_dims.size(); ++i) { - ASSERT_EQ(input_dims[i], output_shape->dim[i]); - } - - input_dims[1] -= 1; - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_dims.size(), input_dims.data())); - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); - for (size_t i = 0; i < input_dims.size(); ++i) { - ASSERT_EQ(input_dims[i], output_shape->dim[i]); - } -} diff --git a/test/qs16-qs8-vcvt.cc b/test/qs16-qs8-vcvt.cc index 0d9531a3de9..f947e0c9628 100644 --- a/test/qs16-qs8-vcvt.cc +++ b/test/qs16-qs8-vcvt.cc @@ -23,5 +23,5 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u \ \ XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h" +#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-multipass-fp32.cc b/test/qs8-dwconv-minmax-multipass-fp32.cc index 51e7ee9801a..c2c1e70d2ab 100644 --- a/test/qs8-dwconv-minmax-multipass-fp32.cc +++ b/test/qs8-dwconv-minmax-multipass-fp32.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include 
"src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-multipass-rndnu.cc b/test/qs8-dwconv-minmax-multipass-rndnu.cc index 08efff8519b..8afaff02c64 100644 --- a/test/qs8-dwconv-minmax-multipass-rndnu.cc +++ b/test/qs8-dwconv-minmax-multipass-rndnu.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-unipass-fp32.cc b/test/qs8-dwconv-minmax-unipass-fp32.cc index 4da93830b13..8e688cfee62 100644 --- a/test/qs8-dwconv-minmax-unipass-fp32.cc +++ b/test/qs8-dwconv-minmax-unipass-fp32.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-unipass-rndnu.cc b/test/qs8-dwconv-minmax-unipass-rndnu.cc index 13ee096f8b5..d0ec65e4121 100644 --- a/test/qs8-dwconv-minmax-unipass-rndnu.cc +++ b/test/qs8-dwconv-minmax-unipass-rndnu.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-f16-vcvt.cc b/test/qs8-f16-vcvt.cc index 132e9d9ef08..f410d3027bb 100644 --- a/test/qs8-f16-vcvt.cc +++ b/test/qs8-f16-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs8-f16-vcvt/qs8-f16-vcvt.h" +#include "qs8-f16-vcvt/qs8-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-f32-vcvt.cc b/test/qs8-f32-vcvt.cc index 5e2ee068a88..9415e7df8a1 100644 --- a/test/qs8-f32-vcvt.cc +++ b/test/qs8-f32-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs8-f32-vcvt/qs8-f32-vcvt.h" +#include "qs8-f32-vcvt/qs8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-packw.cc b/test/qs8-packw.cc index 0f69c429a15..23a635403c7 100644 --- a/test/qs8-packw.cc +++ b/test/qs8-packw.cc @@ -33,7 +33,7 @@ std::string GetTestQS8Name(const testing::TestParamInfo& { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale, izp }, const XnnTestQS8Param xnn_test_qs8_params[] = { -#include "src/qs8-packw/qs8-packw.h" +#include "qs8-packw/qs8-packw.h" }; #undef XNN_QS8_UKERNEL diff --git a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc b/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc index 5ae7ca8f286..4dfea67e351 100644 --- a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc +++ b/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const 
testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc b/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc index 95f334d0e22..7b761dc8e85 100644 --- a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc +++ b/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-requantization.cc b/test/qs8-requantization.cc index 53bf1cc92cb..e9d541ed3a0 100644 --- a/test/qs8-requantization.cc +++ b/test/qs8-requantization.cc @@ -15,288 +15,6 @@ #include "xnnpack/requantization-stubs.h" #include "requantization-tester.h" -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 32-bit arithmetics. - */ - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned32); - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_unsigned32); -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, random_cases) { - RequantizationTester() - 
.qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned32); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 64-bit arithmetics. - */ - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned64); - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_unsigned64); -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned64); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using signed 64-bit arithmetics. 
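 *
 * (Editorial sketch, not part of the original comment: conceptually every rndna
 *  requantizer under test computes
 *      q = clamp(zero_point + round_half_away_from_zero(value * scale), qmin, qmax)
 *  with the real-valued scale < 1 held in fixed point. The "signed 64-bit" flavor
 *  keeps the value-by-multiplier product in int64_t and applies a sign-dependent
 *  rounding offset before the arithmetic shift so that exact ties move away from
 *  zero for both positive and negative values.)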
- */ - -TEST(QS8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_signed64); - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_signed64); -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_signed64); -} - - /* * Round-to-nearest, ties up, scalar implementation using signed 64-bit arithmetics. */ @@ -361,417 +79,121 @@ TEST(QS8_RNDNU__SCALAR, divide_by_po2_with_rounding_down) { TEST(QS8_RNDNU__SCALAR, divide_by_po2_with_rounding_away) { for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__scalar); - } - } -} - -TEST(QS8_RNDNU__SCALAR, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__scalar); -} - - -/* - * FP32-based scalar implementation using lrintf function. 
- */ - -TEST(QS8_FP32__SCALAR_LRINTF, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(1000) - .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_lrintf); -} - - -/* - * FP32-based scalar implementation using magic trick for FP32->INT32 conversion. - */ - -TEST(QS8_FP32__SCALAR_FMAGIC, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(1000) - .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_fmagic); -} - - -/* - * GEMMLOWP-equivalent scalar implementation. - */ - -TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); - } -} - -TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); - } - } -} - -TEST(QS8_GEMMLOWP__SCALAR, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__scalar); - } - } -} - -/* No rounding down test - it fails because of upward bias in multiplication */ -/* No rounding away test - it fails because of upward bias in multiplication */ - -TEST(QS8_GEMMLOWP__SCALAR, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_gemmlowp__scalar); -} - -TEST(QS8_GEMMLOWP__SCALAR, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__scalar); -} - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - /* - * Round-to-nearest, ties away from zero, SSE2 implementation using floating-point shuffle. 
- */ - - TEST(QS8_RNDNA__SSE2, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse2); - } - } - - TEST(QS8_RNDNA__SSE2, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__sse2); - } - - TEST(QS8_RNDNA__SSE2, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__sse2); - } - - - /* - * Round-to-nearest, ties away from zero, SSSE3 implementation using floating-point shuffle. 
- */ - - TEST(QS8_RNDNA__SSSE3, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSSE3; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__ssse3); - } - } - - TEST(QS8_RNDNA__SSSE3, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__ssse3); - } - } - } - - TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__ssse3); - } - } - } - - TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__ssse3); - } - } - } - - TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__ssse3); - } + zero_point <= std::numeric_limits::max(); + zero_point++) + { + for (uint32_t s = 1; s < 32; s++) { + RequantizationTester() + .zero_point(zero_point) + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .s(s) + .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__scalar); } } +} - TEST(QS8_RNDNA__SSSE3, special_cases) { - TEST_REQUIRES_X86_SSSE3; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__ssse3); - } +TEST(QS8_RNDNU__SCALAR, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(100) + .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__scalar); +} - TEST(QS8_RNDNA__SSSE3, random_cases) { - TEST_REQUIRES_X86_SSSE3; + +/* + * FP32-based scalar implementation using lrintf function. + */ + +TEST(QS8_FP32__SCALAR_LRINTF, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(1000) + .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_lrintf); +} + + +/* + * FP32-based scalar implementation using magic trick for FP32->INT32 conversion. 
+ */ + +TEST(QS8_FP32__SCALAR_FMAGIC, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(1000) + .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_fmagic); +} + + +/* + * GEMMLOWP-equivalent scalar implementation. + */ + +TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2) { + for (uint32_t s = 1; s < 32; s++) { RequantizationTester() .qmin(std::numeric_limits::min()) .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__ssse3); + .s(s) + .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); } +} - - /* - * Round-to-nearest, ties away from zero, SSE4.1 implementation using static blend instruction. - */ - - TEST(QS8_RNDNA__SSE41, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSE41; +TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2_with_zero_point) { + for (int32_t zero_point = std::numeric_limits::min(); + zero_point <= std::numeric_limits::max(); + zero_point++) + { for (uint32_t s = 1; s < 32; s++) { RequantizationTester() + .zero_point(zero_point) .qmin(std::numeric_limits::min()) .qmax(std::numeric_limits::max()) .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse41); - } - } - - TEST(QS8_RNDNA__SSE41, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse41); - } - } - } - - TEST(QS8_RNDNA__SSE41, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__sse41); - } + .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); } } +} - TEST(QS8_RNDNA__SSE41, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__sse41); - } +TEST(QS8_GEMMLOWP__SCALAR, divide_by_po2_with_rounding_up) { + for (int32_t zero_point = std::numeric_limits::min(); + zero_point <= std::numeric_limits::max(); + zero_point++) + { + for (uint32_t s = 1; s < 32; s++) { + RequantizationTester() + .zero_point(zero_point) + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .s(s) + .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__scalar); } } +} - TEST(QS8_RNDNA__SSE41, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__sse41); - 
} - } - } +/* No rounding down test - it fails because of upward bias in multiplication */ +/* No rounding away test - it fails because of upward bias in multiplication */ - TEST(QS8_RNDNA__SSE41, special_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__sse41); - } +TEST(QS8_GEMMLOWP__SCALAR, special_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .TestSpecialCases(xnn_qs8_requantize_gemmlowp__scalar); +} - TEST(QS8_RNDNA__SSE41, random_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__sse41); - } +TEST(QS8_GEMMLOWP__SCALAR, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(100) + .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__scalar); +} +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 /* * Round-to-nearest, ties up, SSE4.1 implementation using arithmetic shift right. */ @@ -1191,107 +613,6 @@ TEST(QS8_GEMMLOWP__SCALAR, random_cases) { #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - /* - * Round-to-nearest, ties away from zero, ARM NEON implementation. - */ - - TEST(QS8_RNDNA__NEON, exact_divide_by_po2) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .s(s) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__neon); - } - } - - TEST(QS8_RNDNA__NEON, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__neon); - } - } - } - - TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__neon); - } - } - } - - TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__neon); - } - } - } - - TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__neon); - } - } - } - - 
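
The "upward bias" that the two comments above refer to comes from the GEMMLOWP-style requantization rounding twice: once inside the saturating doubling high multiply and once more in the rounding right shift. Below is a minimal, self-contained sketch of how an early rounding step can push the final result one step above a single round-to-nearest of the exact product; the helper name and constants are hypothetical, and this is not the xnn_qs8_requantize_gemmlowp__scalar implementation.

#include <cstdint>
#include <cstdio>

// Round-to-nearest (ties up) division by 2^n, valid for non-negative x.
static int64_t round_shift(int64_t x, int n) {
  return (x + (int64_t{1} << (n - 1))) >> n;
}

int main() {
  // Hypothetical product of an input value and a fixed-point multiplier.
  const int64_t product = int64_t{1} << 30;
  // Two-step rounding: round at bit 31 (the doubling high multiply),
  // then round again for the remaining shift by 1.
  const int64_t two_step = round_shift(round_shift(product, 31), 1);
  // Single rounding of the exact product by the combined shift of 32.
  const int64_t one_step = round_shift(product, 32);
  // Prints 1 and 0: the intermediate rounding biased the result upward,
  // which is why only the rounding-up behavior is asserted for this path.
  std::printf("two-step: %lld, one-step: %lld\n",
              (long long) two_step, (long long) one_step);
  return 0;
}
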
TEST(QS8_RNDNA__NEON, special_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__neon); - } - - TEST(QS8_RNDNA__NEON, random_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__neon); - } - - /* * Round-to-nearest, ties up, ARM NEON implementation using extended multiplication. */ diff --git a/test/qs8-vadd-minmax.cc b/test/qs8-vadd-minmax.cc index aa3b0d66c63..45860f4013d 100644 --- a/test/qs8-vadd-minmax.cc +++ b/test/qs8-vadd-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qs8-vadd/qs8-vadd-minmax.h" +#include "qs8-vadd/qs8-vadd-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vaddc-minmax.cc b/test/qs8-vaddc-minmax.cc index 41fdd0148d9..cdef11d3293 100644 --- a/test/qs8-vaddc-minmax.cc +++ b/test/qs8-vaddc-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qs8-vaddc/qs8-vaddc-minmax.h" +#include "qs8-vaddc/qs8-vaddc-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vcvt.cc b/test/qs8-vcvt.cc index 4b3237b3962..4716a82d19c 100644 --- a/test/qs8-vcvt.cc +++ b/test/qs8-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs8-vcvt/qs8-vcvt.h" +#include "qs8-vcvt/qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vhswish.cc b/test/qs8-vhswish.cc index c958a985bde..093c95ab5d6 100644 --- a/test/qs8-vhswish.cc +++ b/test/qs8-vhswish.cc @@ -1,1561 +1,83 @@ -// Copyright 2023 Google LLC +// Copyright 2019 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/qs8-vhswish.yaml -// Generator: tools/generate-vhswish-test.py +// Microkernel: qs8-vhswish +// Generator: tools/generate-vunary-test.py -#include +#include +#include +#include +#include +#include #include +#include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" +#include "next_prime.h" #include "vhswish-microkernel-tester.h" - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_VHSWISH__NEON_U8, batch_eq_8) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__NEON_U8, batch_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U8, batch_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U8, batch_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U8, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U8, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U8, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U8, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_VHSWISH__NEON_U16, batch_eq_16) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__NEON_U16, batch_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 32; batch_size < 160; batch_size += 
16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U16, batch_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U16, batch_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U16, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U16, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U16, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U16, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_VHSWISH__NEON_U32, batch_eq_32) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__NEON_U32, batch_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U32, batch_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U32, batch_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U32, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for 
(float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U32, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U32, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U32, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE2_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE2_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U16, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U16, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - 
.batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE2_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE2_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U32, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U32, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSSE3_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - - 
TEST(QS8_VHSWISH__SSSE3_U16, batch_div_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSSE3_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSSE3_U32, batch_div_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - 
TEST(QS8_VHSWISH__SSSE3_U32, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE41_U8, batch_eq_8) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE41_U8, batch_div_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U8, batch_lt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U8, batch_gt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U8, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U8, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U8, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point 
< 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U8, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE41_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE41_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U16, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U16, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE41_U32, batch_eq_32) { - 
TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE41_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U32, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U32, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__AVX_U8, batch_eq_8) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__AVX_U8, batch_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U8, batch_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U8, batch_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - 
VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U8, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U8, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U8, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U8, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__AVX_U16, batch_eq_16) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__AVX_U16, batch_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U16, batch_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U16, batch_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U16, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U16, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U16, 
input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U16, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__AVX_U32, batch_eq_32) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__AVX_U32, batch_div_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U32, batch_lt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U32, batch_gt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U32, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U32, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U32, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U32, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_eq_8) { - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_div_8) { - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_lt_8) { - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_gt_8) { - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, input_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, output_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_eq_16) { - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_div_16) { - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_lt_16) { - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_gt_16) { - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - 
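
The input_scale, output_scale, input_zero_point and output_zero_point knobs exercised by these tests parameterize the quantized hard-swish mapping itself. As a point of reference only, a float sketch of that mapping (dequantize, apply x * relu6(x + 3) / 6, requantize with saturation) might look as follows; the function name is made up, and this is not the tester's or the kernels' fixed-point arithmetic.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Rough float reference for QS8 hard-swish; assumes the usual
// scale/zero-point affine quantization on both input and output.
int8_t qs8_hardswish_ref(int8_t q_in,
                         float input_scale, int32_t input_zero_point,
                         float output_scale, int32_t output_zero_point) {
  const float x = input_scale * static_cast<float>(q_in - input_zero_point);
  const float y = x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
  const long q = std::lrintf(y / output_scale) + output_zero_point;
  return static_cast<int8_t>(std::min<long>(std::max<long>(q, -128), 127));
}
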
TEST(QS8_VHSWISH__WASMSIMD_U16, input_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, output_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_eq_32) { - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_div_32) { - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_lt_32) { - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_gt_32) { - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, input_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, output_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - 
.input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(QS8_VHSWISH__SCALAR_U1, batch_eq_1) { - VHSwishMicrokernelTester() - .batch_size(1) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); -} - -TEST(QS8_VHSWISH__SCALAR_U1, batch_gt_1) { - for (size_t batch_size = 2; batch_size < 10; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, input_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, output_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_eq_2) { - VHSwishMicrokernelTester() - .batch_size(2) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_div_2) { - for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_lt_2) { - for (size_t batch_size = 1; batch_size < 2; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_gt_2) { - for (size_t batch_size = 3; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, input_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float input_scale : 
{4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, output_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_eq_4) { - VHSwishMicrokernelTester() - .batch_size(4) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_div_4) { - for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_lt_4) { - for (size_t batch_size = 1; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_gt_4) { - for (size_t batch_size = 5; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, input_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, output_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - 
VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} \ No newline at end of file +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ + \ +XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ + \ +XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +TEST(ukernel, input_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float input_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_scale(input_scale) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float output_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .output_scale(output_scale) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, input_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_zero_point(input_zero_point) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .output_zero_point(output_zero_point) \ + .Test(ukernel, init_params); \ + } \ + } \ +} +#include "qs8-vhswish/qs8-vhswish.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vhswish.yaml b/test/qs8-vhswish.yaml deleted file mode 100644 index 0c0154c381a..00000000000 --- a/test/qs8-vhswish.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
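
The regenerated test above replaces the per-kernel TEST bodies with one XNN_UKERNEL_WITH_PARAMS macro that is expanded for every kernel listed in the included qs8-vhswish/qs8-vhswish.h (define the macro, include the list, undef it), so the test file stays in sync with the kernel declarations automatically. A generic, self-contained sketch of that X-macro pattern, using made-up names in place of the real header contents:

#include <cstdio>

// Stand-in for the included kernel list; in the real setup these entries
// live in a shared .h file and invoke whatever macro the includer defined.
#define MY_UKERNEL_LIST(X)   \
  X(my_ukernel_scalar_u1, 1) \
  X(my_ukernel_neon_u8, 8)

struct Entry { const char* name; int batch_tile; };

// Expand the list into a table; a test file would instead expand it into
// one or more TEST bodies per entry, as done above for qs8-vhswish.
#define MAKE_ENTRY(name, batch_tile) {#name, batch_tile},
static const Entry kEntries[] = { MY_UKERNEL_LIST(MAKE_ENTRY) };
#undef MAKE_ENTRY

int main() {
  for (const Entry& e : kEntries) {
    std::printf("%s: batch_tile=%d\n", e.name, e.batch_tile);
  }
  return 0;
}

The same define/include/undef idiom is what the other test files touched in this diff (qs8-vadd, qs8-vlrelu, qs8-vmul, qu8-dwconv) already use; for those files only the include paths change.
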
- -# ARM NEON -- name: xnn_qs8_vhswish_ukernel__neon_u8 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__neon_u16 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__neon_u32 - init: xnn_init_qs8_hswish_scalar_params - -# x86 SSE2 -- name: xnn_qs8_vhswish_ukernel__sse2_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__sse2_u32 - init: xnn_init_qs8_hswish_sse2_params - -# x86 SSSE3 -- name: xnn_qs8_vhswish_ukernel__ssse3_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__ssse3_u32 - init: xnn_init_qs8_hswish_sse2_params - -# x86 SSE4.1 -- name: xnn_qs8_vhswish_ukernel__sse41_u8 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__sse41_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__sse41_u32 - init: xnn_init_qs8_hswish_sse2_params - -# x86 AVX -- name: xnn_qs8_vhswish_ukernel__avx_u8 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__avx_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__avx_u32 - init: xnn_init_qs8_hswish_sse2_params - -# WAsm Relaxed SIMD -- name: xnn_qs8_vhswish_ukernel__wasmsimd_u8 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__wasmsimd_u16 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__wasmsimd_u32 - init: xnn_init_qs8_hswish_scalar_params - -# Scalar -- name: xnn_qs8_vhswish_ukernel__scalar_u1 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__scalar_u2 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__scalar_u4 - init: xnn_init_qs8_hswish_scalar_params diff --git a/test/qs8-vlrelu.cc b/test/qs8-vlrelu.cc index adec5d8a2fb..650b91404c0 100644 --- a/test/qs8-vlrelu.cc +++ b/test/qs8-vlrelu.cc @@ -20,7 +20,7 @@ #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "next_prime.h" #include "vlrelu-microkernel-tester.h" @@ -55,5 +55,5 @@ TEST(ukernel, negative_scale) { } \ } \ } -#include "src/qs8-vlrelu/qs8-vlrelu.h" +#include "qs8-vlrelu/qs8-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmul-minmax-fp32.cc b/test/qs8-vmul-minmax-fp32.cc index 84192475916..71b20362efb 100644 --- a/test/qs8-vmul-minmax-fp32.cc +++ b/test/qs8-vmul-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" +#include "qs8-vmul/qs8-vmul-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmul-minmax-rndnu.cc b/test/qs8-vmul-minmax-rndnu.cc index 415282a69ec..a8e646f9be8 100644 --- a/test/qs8-vmul-minmax-rndnu.cc +++ b/test/qs8-vmul-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" +#include "qs8-vmul/qs8-vmul-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmulc-minmax-fp32.cc b/test/qs8-vmulc-minmax-fp32.cc index 
17c4c06dce7..88b6b2668b9 100644 --- a/test/qs8-vmulc-minmax-fp32.cc +++ b/test/qs8-vmulc-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" +#include "qs8-vmulc/qs8-vmulc-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmulc-minmax-rndnu.cc b/test/qs8-vmulc-minmax-rndnu.cc index a2deac30969..c51e1dfdffb 100644 --- a/test/qs8-vmulc-minmax-rndnu.cc +++ b/test/qs8-vmulc-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "qs8-vmulc/qs8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-multipass-fp32.cc b/test/qu8-dwconv-minmax-multipass-fp32.cc index 29dd4cdf532..d73327d2e9a 100644 --- a/test/qu8-dwconv-minmax-multipass-fp32.cc +++ b/test/qu8-dwconv-minmax-multipass-fp32.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-multipass-rndnu.cc b/test/qu8-dwconv-minmax-multipass-rndnu.cc index b300ab8218a..bd8cc9d10d3 100644 --- a/test/qu8-dwconv-minmax-multipass-rndnu.cc +++ b/test/qu8-dwconv-minmax-multipass-rndnu.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-unipass-fp32.cc b/test/qu8-dwconv-minmax-unipass-fp32.cc index 586851cea87..4dd4997d469 100644 --- a/test/qu8-dwconv-minmax-unipass-fp32.cc +++ b/test/qu8-dwconv-minmax-unipass-fp32.cc @@ -225,5 +225,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-unipass-rndnu.cc b/test/qu8-dwconv-minmax-unipass-rndnu.cc index 361838e6368..3891a756d71 100644 --- a/test/qu8-dwconv-minmax-unipass-rndnu.cc +++ b/test/qu8-dwconv-minmax-unipass-rndnu.cc @@ -225,5 +225,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-f32-vcvt.cc b/test/qu8-f32-vcvt.cc index e37575b1b1c..909563c8e75 100644 --- a/test/qu8-f32-vcvt.cc +++ b/test/qu8-f32-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, 
datatype_in, datatype_out, ukernel, init_params); -#include "src/qu8-f32-vcvt/qu8-f32-vcvt.h" +#include "qu8-f32-vcvt/qu8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qu8-requantization.cc b/test/qu8-requantization.cc index c5cd2a23929..5501de70e86 100644 --- a/test/qu8-requantization.cc +++ b/test/qu8-requantization.cc @@ -15,254 +15,6 @@ #include "xnnpack/requantization-stubs.h" #include "requantization-tester.h" -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 32-bit arithmetics. - */ - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned32); - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__scalar_unsigned32); -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned32); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 64-bit arithmetics. 
- */ - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned64); - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__scalar_unsigned64); -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned64); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using signed 64-bit arithmetics. 
- */ - -TEST(QU8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_signed64); - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__scalar_signed64); -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__scalar_signed64); -} - /* * FP32-based scalar implementation using lrintf function. @@ -350,269 +102,6 @@ TEST(QU8_GEMMLOWP__SCALAR, random_cases) { #if XNN_ARCH_X86 || XNN_ARCH_X86_64 - /* - * Round-to-nearest, ties away from zero, SSE2 implementation using floating-point shuffle. 
- */ - - TEST(QU8_RNDNA__SSE2, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse2); - } - } - - TEST(QU8_RNDNA__SSE2, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__sse2); - } - - TEST(QU8_RNDNA__SSE2, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__sse2); - } - - - /* - * Round-to-nearest, ties away from zero, SSSE3 implementation using floating-point shuffle. 
- */ - - TEST(QU8_RNDNA__SSSE3, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSSE3; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__ssse3); - } - } - - TEST(QU8_RNDNA__SSSE3, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, special_cases) { - TEST_REQUIRES_X86_SSSE3; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__ssse3); - } - - TEST(QU8_RNDNA__SSSE3, random_cases) { - TEST_REQUIRES_X86_SSSE3; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__ssse3); - } - - - /* - * Round-to-nearest, ties away from zero, SSE4.1 implementation using static blend instruction. 
- */ - - TEST(QU8_RNDNA__SSE41, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSE41; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse41); - } - } - - TEST(QU8_RNDNA__SSE41, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, special_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__sse41); - } - - TEST(QU8_RNDNA__SSE41, random_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__sse41); - } - - /* * FP32-based x86 SSE2 implementation. */ @@ -814,96 +303,6 @@ TEST(QU8_GEMMLOWP__SCALAR, random_cases) { #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - /* - * Round-to-nearest, ties away from zero, ARM NEON implementation. 
- */ - - TEST(QU8_RNDNA__NEON, exact_divide_by_po2) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__neon); - } - } - - TEST(QU8_RNDNA__NEON, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, special_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__neon); - } - - TEST(QU8_RNDNA__NEON, random_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__neon); - } - - /* * FP32-based ARM NEON implementation. 
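The blocks deleted above are the QU8_RNDNA__* requantization tests (scalar unsigned 32-bit, unsigned 64-bit, signed 64-bit, SSE2, SSSE3, SSE4.1, and NEON variants). The rounding behaviour they exercised, for example via TestDivideByPO2WithRoundingTiesAway, is division by a power of two rounded to nearest with ties away from zero. A minimal sketch of that arithmetic, for reference only and not XNNPACK's implementation:

  #include <cstdint>

  // Divide x by 2^s (1 <= s <= 31), rounding to nearest with ties away from zero --
  // the "rndna" behaviour the removed QU8_RNDNA__* tests checked.
  int32_t divide_by_po2_rndna(int32_t x, uint32_t s) {
    const uint64_t abs_x = (uint64_t) (x >= 0 ? (int64_t) x : -(int64_t) x);
    const uint64_t rounded = (abs_x + (UINT64_C(1) << (s - 1))) >> s;
    return x >= 0 ? (int32_t) rounded : (int32_t) -(int64_t) rounded;
  }
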
*/ diff --git a/test/qu8-vadd-minmax.cc b/test/qu8-vadd-minmax.cc index a720b913857..038adf26f39 100644 --- a/test/qu8-vadd-minmax.cc +++ b/test/qu8-vadd-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qu8-vadd/qu8-vadd-minmax.h" +#include "qu8-vadd/qu8-vadd-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vaddc-minmax.cc b/test/qu8-vaddc-minmax.cc index 64da7bb810b..9c856481986 100644 --- a/test/qu8-vaddc-minmax.cc +++ b/test/qu8-vaddc-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qu8-vaddc/qu8-vaddc-minmax.h" +#include "qu8-vaddc/qu8-vaddc-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vcvt.cc b/test/qu8-vcvt.cc index d8e60d7252c..b0e061a70b4 100644 --- a/test/qu8-vcvt.cc +++ b/test/qu8-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qu8-vcvt/qu8-vcvt.h" +#include "qu8-vcvt/qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vhswish.cc b/test/qu8-vhswish.cc index 43607f2b51f..e6e76ed7dcc 100644 --- a/test/qu8-vhswish.cc +++ b/test/qu8-vhswish.cc @@ -1,1823 +1,89 @@ -// Copyright 2023 Google LLC +// Copyright 2019 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
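The test/qu8-vhswish.cc change, whose header appears just above, evidently converts the remaining hand-written per-ISA TEST blocks (NEON, SSE2, SSSE3, SSE4.1, AVX, and so on) to the same macro-generated scheme used for qs8-vhswish earlier in this change: the new header comments name tools/generate-vunary-test.py instead of the deleted test/qu8-vhswish.yaml specification, presumably expanding entries from a qu8-vhswish header as in the qs8 case. The quantity under test is unchanged: hswish(x) = x * min(max(x + 3, 0), 6) / 6, evaluated after dequantizing the input and then requantized with the output scale and zero point. A sketch of that reference math only, with illustrative rounding and clamping conventions, not the tester's actual code:

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // Reference quantized hswish that a QU8 kernel is expected to approximate.
  // Rounding and saturation details here are assumptions for illustration.
  uint8_t quantized_hswish_reference(uint8_t q_in,
                                     float input_scale, int16_t input_zero_point,
                                     float output_scale, int16_t output_zero_point) {
    const float x = input_scale * (float) ((int32_t) q_in - input_zero_point);
    const float y = x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
    const long q = std::lrintf(y / output_scale) + output_zero_point;
    return (uint8_t) std::min<long>(std::max<long>(q, 0), 255);
  }
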
-// Specification: test/qu8-vhswish.yaml -// Generator: tools/generate-vhswish-test.py +// Microkernel: qu8-vhswish +// Generator: tools/generate-vunary-test.py -#include +#include +#include +#include +#include +#include #include +#include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" +#include "next_prime.h" #include "vhswish-microkernel-tester.h" - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_VHSWISH__NEON_U8, batch_eq_8) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__NEON_U8, batch_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U8, batch_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U8, batch_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U8, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U8, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U8, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U8, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM 
|| XNN_ARCH_ARM64 - TEST(QU8_VHSWISH__NEON_U16, batch_eq_16) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__NEON_U16, batch_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U16, batch_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U16, batch_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U16, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U16, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U16, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U16, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_VHSWISH__NEON_U32, batch_eq_32) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__NEON_U32, batch_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U32, batch_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U32, batch_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U32, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U32, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U32, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U32, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE2_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE2_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U16, batch_gt_16) { - 
TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U16, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U16, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE2_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE2_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U32, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - 
.batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U32, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSSE3_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSSE3_U16, batch_div_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - 
.input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSSE3_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSSE3_U32, batch_div_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE41_U8, batch_eq_8) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE41_U8, batch_div_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U8, batch_lt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U8, batch_gt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U8, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U8, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U8, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U8, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, 
xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE41_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE41_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U16, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U16, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE41_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE41_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 64; 
batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U32, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U32, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__AVX_U8, batch_eq_8) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__AVX_U8, batch_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U8, batch_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, 
xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U8, batch_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U8, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U8, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U8, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U8, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__AVX_U16, batch_eq_16) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__AVX_U16, batch_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U16, batch_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U16, batch_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U16, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) 
{ - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U16, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U16, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U16, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__AVX_U32, batch_eq_32) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__AVX_U32, batch_div_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U32, batch_lt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U32, batch_gt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U32, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U32, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U32, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U32, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_eq_8) { - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_div_8) { - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_lt_8) { - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_gt_8) { - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, input_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, output_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, output_zero_point) { - for (int16_t output_zero_point = 
2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_eq_16) { - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_div_16) { - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_lt_16) { - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_gt_16) { - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, input_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, output_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_eq_32) { - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - 
.Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_div_32) { - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_lt_32) { - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_gt_32) { - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, input_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, output_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(QU8_VHSWISH__SCALAR_U1, batch_eq_1) { - VHSwishMicrokernelTester() - .batch_size(1) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); -} - -TEST(QU8_VHSWISH__SCALAR_U1, batch_gt_1) { - for (size_t batch_size = 2; batch_size < 10; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, input_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, output_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_eq_2) { - VHSwishMicrokernelTester() - .batch_size(2) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_div_2) { - for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_lt_2) { - for (size_t batch_size = 1; batch_size < 2; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_gt_2) { - for (size_t batch_size = 3; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, input_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, output_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) 
- .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_eq_4) { - VHSwishMicrokernelTester() - .batch_size(4) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_div_4) { - for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_lt_4) { - for (size_t batch_size = 1; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_gt_4) { - for (size_t batch_size = 5; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, input_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, output_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} \ No newline at end of file +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ + \ +XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ 
+XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ + \ +XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +TEST(ukernel, input_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float input_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_scale(input_scale) \ + .input_zero_point(150) \ + .output_zero_point(100) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float output_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .output_scale(output_scale) \ + .input_zero_point(150) \ + .output_zero_point(100) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, input_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_zero_point(input_zero_point) \ + .output_zero_point(100) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_zero_point(150) \ + .output_zero_point(output_zero_point) \ + .Test(ukernel, init_params); \ + } \ + } \ +} +#include "qu8-vhswish/qu8-vhswish.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vhswish.yaml b/test/qu8-vhswish.yaml deleted file mode 100644 index e348b3e5488..00000000000 --- a/test/qu8-vhswish.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
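Note on the registration change above: instead of the YAML list being deleted here (the remainder of that list continues below), each micro-kernel is now declared once in a shared header, and the XNN_UKERNEL_WITH_PARAMS macro defined in test/qu8-vhswish.cc expands every declaration into a full set of TEST cases. A minimal sketch of what one such declaration might look like; the argument order follows the macro's parameter list above, but the exact entries and arch-flag values in the real src/qu8-vhswish/qu8-vhswish.h may differ:

// Hypothetical entry in src/qu8-vhswish/qu8-vhswish.h (illustrative only).
// Parameters: arch_flags, ukernel, batch_tile, vector_tile, datatype,
//             params_type, init_params.
XNN_UKERNEL_WITH_PARAMS(/*arch_flags=*/0, xnn_qu8_vhswish_ukernel__scalar_u1,
                        /*batch_tile=*/1, /*vector_tile=*/1, uint8_t,
                        union xnn_qu8_hswish_params,
                        xnn_init_qu8_hswish_scalar_params)

Including the header after defining the macro stamps out the batch_eq/batch_div/batch_lt/batch_gt, inplace, input_scale, output_scale, input_zero_point and output_zero_point tests for each kernel, replacing the hand-expanded tests removed above.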
- -# ARM NEON -- name: xnn_qu8_vhswish_ukernel__neon_u8 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__neon_u16 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__neon_u32 - init: xnn_init_qu8_hswish_scalar_params - -# x86 SSE2 -- name: xnn_qu8_vhswish_ukernel__sse2_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__sse2_u32 - init: xnn_init_qu8_hswish_sse2_params - -# x86 SSSE3 -- name: xnn_qu8_vhswish_ukernel__ssse3_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__ssse3_u32 - init: xnn_init_qu8_hswish_sse2_params - -# x86 SSE4.1 -- name: xnn_qu8_vhswish_ukernel__sse41_u8 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__sse41_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__sse41_u32 - init: xnn_init_qu8_hswish_sse2_params - -# x86 AVX -- name: xnn_qu8_vhswish_ukernel__avx_u8 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__avx_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__avx_u32 - init: xnn_init_qu8_hswish_sse2_params - -# WAsm Relaxed SIMD -- name: xnn_qu8_vhswish_ukernel__wasmsimd_u8 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__wasmsimd_u16 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__wasmsimd_u32 - init: xnn_init_qu8_hswish_scalar_params - -# Scalar -- name: xnn_qu8_vhswish_ukernel__scalar_u1 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__scalar_u2 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__scalar_u4 - init: xnn_init_qu8_hswish_scalar_params diff --git a/test/qu8-vlrelu.cc b/test/qu8-vlrelu.cc index 7effffac006..39b9947ff4c 100644 --- a/test/qu8-vlrelu.cc +++ b/test/qu8-vlrelu.cc @@ -20,7 +20,7 @@ #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "next_prime.h" #include "vlrelu-microkernel-tester.h" @@ -55,5 +55,5 @@ TEST(ukernel, negative_scale) { } \ } \ } -#include "src/qu8-vlrelu/qu8-vlrelu.h" +#include "qu8-vlrelu/qu8-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmul-minmax-fp32.cc b/test/qu8-vmul-minmax-fp32.cc index f6bdba82f4a..64f6b498c0f 100644 --- a/test/qu8-vmul-minmax-fp32.cc +++ b/test/qu8-vmul-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" +#include "qu8-vmul/qu8-vmul-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmul-minmax-rndnu.cc b/test/qu8-vmul-minmax-rndnu.cc index 823741779f1..2447f07af5c 100644 --- a/test/qu8-vmul-minmax-rndnu.cc +++ b/test/qu8-vmul-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" +#include "qu8-vmul/qu8-vmul-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmulc-minmax-fp32.cc b/test/qu8-vmulc-minmax-fp32.cc index 
daa8984b69d..efa1d85ef47 100644 --- a/test/qu8-vmulc-minmax-fp32.cc +++ b/test/qu8-vmulc-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" +#include "qu8-vmulc/qu8-vmulc-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmulc-minmax-rndnu.cc b/test/qu8-vmulc-minmax-rndnu.cc index b9fab2e8e4f..1e203ea74b7 100644 --- a/test/qu8-vmulc-minmax-rndnu.cc +++ b/test/qu8-vmulc-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "qu8-vmulc/qu8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/raddstoreexpminusmax-microkernel-tester.h b/test/raddstoreexpminusmax-microkernel-tester.h index 1d020a2cf17..0dd99e56a5c 100644 --- a/test/raddstoreexpminusmax-microkernel-tester.h +++ b/test/raddstoreexpminusmax-microkernel-tester.h @@ -17,9 +17,10 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class RAddStoreExpMinusMaxMicrokernelTester { diff --git a/test/rdsum-microkernel-tester.h b/test/rdsum-microkernel-tester.h index f8315b7fde5..e3742ead92f 100644 --- a/test/rdsum-microkernel-tester.h +++ b/test/rdsum-microkernel-tester.h @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class RDSumMicrokernelTester { diff --git a/test/reciprocal-square-root.cc b/test/reciprocal-square-root.cc index 8d8ef3cbf15..bebfdd1c85a 100644 --- a/test/reciprocal-square-root.cc +++ b/test/reciprocal-square-root.cc @@ -11,6 +11,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/requantization-tester.h b/test/requantization-tester.h index 80f24ddcbc3..a24857ab21b 100644 --- a/test/requantization-tester.h +++ b/test/requantization-tester.h @@ -520,49 +520,7 @@ class RequantizationTester { } } - void TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantization_fn requantize) { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmin(), std::numeric_limits::max()); - ASSERT_GE(qmax(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - for (size_t iteration = 0; iteration < iterations(); iteration++) { - auto u8rng = - std::bind(std::uniform_int_distribution(0, std::numeric_limits::max()), std::ref(rng)); - - xnnpack::Buffer inputs(4096); - xnnpack::Buffer outputs(inputs.size()); - - std::uniform_real_distribution scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f); - 
const float scale = scale_distribution(rng); - for (size_t i = 0; i < inputs.size(); i++) { - const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax())); - const int32_t input = int32_t(double(approximate_output) / double(scale)); - inputs[i] = input; - } - - requantize( - inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(), - outputs.data()); - - /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */ - ASSERT_NE( - *std::max_element(outputs.cbegin(), outputs.cend()), - *std::min_element(outputs.cbegin(), outputs.cend())); - - for (size_t i = 0; i < inputs.size(); i++) { - const uint8_t reference_output = xnn_qu8_requantize_rndna( - inputs[i], scale, zero_point(), qmin(), qmax()); - ASSERT_EQ(uint32_t(reference_output), uint32_t(outputs[i])); - } - } - } - - void TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantization_fn requantize) { + void TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantization_fn requantize) { ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); ASSERT_GE(qmin(), std::numeric_limits::min()); @@ -597,34 +555,34 @@ class RequantizationTester { *std::min_element(outputs.cbegin(), outputs.cend())); for (size_t i = 0; i < inputs.size(); i++) { - const int8_t reference_output = xnn_qs8_requantize_rndna( + const int8_t reference_output = xnn_qs8_requantize_rndnu( inputs[i], scale, zero_point(), qmin(), qmax()); ASSERT_EQ(int32_t(reference_output), int32_t(outputs[i])); } } } - void TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantization_fn requantize) { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmin(), std::numeric_limits::max()); - ASSERT_GE(qmax(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); + void TestRandomCasesRoundToNearestTiesUp(xnn_qu8_requantization_fn requantize) { + ASSERT_GE(zero_point(), std::numeric_limits::min()); + ASSERT_LE(zero_point(), std::numeric_limits::max()); + ASSERT_GE(qmin(), std::numeric_limits::min()); + ASSERT_LE(qmin(), std::numeric_limits::max()); + ASSERT_GE(qmax(), std::numeric_limits::min()); + ASSERT_LE(qmax(), std::numeric_limits::max()); ASSERT_LT(qmin(), qmax()); xnnpack::ReplicableRandomDevice rng; for (size_t iteration = 0; iteration < iterations(); iteration++) { - auto i8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); + auto u8rng = std::bind( + std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); xnnpack::Buffer inputs(4096); - xnnpack::Buffer outputs(inputs.size()); + xnnpack::Buffer outputs(inputs.size()); std::uniform_real_distribution scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f); const float scale = scale_distribution(rng); for (size_t i = 0; i < inputs.size(); i++) { - const int8_t approximate_output = std::min(std::max(int8_t(i8rng()), int8_t(qmin())), int8_t(qmax())); + const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax())); const int32_t input = int32_t(double(approximate_output) / double(scale)); inputs[i] = input; } @@ -639,7 +597,7 @@ class RequantizationTester { *std::min_element(outputs.cbegin(), outputs.cend())); for (size_t i = 0; i < inputs.size(); i++) { - const int8_t reference_output = xnn_qs8_requantize_rndnu( + 
const uint8_t reference_output = xnn_qu8_requantize_rndnu( inputs[i], scale, zero_point(), qmin(), qmax()); ASSERT_EQ(int32_t(reference_output), int32_t(outputs[i])); } diff --git a/test/rope-operator-tester.h b/test/rope-operator-tester.h index e887fec4892..33194b17d63 100644 --- a/test/rope-operator-tester.h +++ b/test/rope-operator-tester.h @@ -133,7 +133,7 @@ class RoPEOperatorTester { xnn_operator_t rope_op = nullptr; const xnn_status status = xnn_create_rope_nthc_f16( - /*max_tokens=*/tokens(), /*flags=*/0, &rope_op); + /*flags=*/0, &rope_op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } @@ -237,7 +237,7 @@ class RoPEOperatorTester { xnn_operator_t rope_op = nullptr; const xnn_status status = xnn_create_rope_nthc_f32( - /*max_tokens=*/tokens(), /*flags=*/0, &rope_op); + /*flags=*/0, &rope_op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } diff --git a/test/rope.cc b/test/rope.cc index e377d59562e..54ce6a1bcdc 100644 --- a/test/rope.cc +++ b/test/rope.cc @@ -98,7 +98,6 @@ TEST_F(RoPETestF16, define) const struct xnn_node* node = &subgraph->nodes[0]; ASSERT_EQ(node->type, xnn_node_type_rope); ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->params.rope.max_tokens, max_tokens); ASSERT_EQ(node->num_inputs, 2); ASSERT_EQ(node->inputs[0], input_id); ASSERT_EQ(node->inputs[1], weights_id); @@ -143,7 +142,6 @@ TEST_F(RoPETestF32, define) const struct xnn_node* node = &subgraph->nodes[0]; ASSERT_EQ(node->type, xnn_node_type_rope); ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->params.rope.max_tokens, max_tokens); ASSERT_EQ(node->num_inputs, 2); ASSERT_EQ(node->inputs[0], input_id); ASSERT_EQ(node->inputs[1], weights_id); @@ -161,7 +159,7 @@ TEST_F(RoPETestF16, matches_operator_api) std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); std::generate(weights.begin(), weights.end(), [&]() { return f32dist(rng); }); - const xnn_status status = xnn_create_rope_nthc_f16(max_tokens, /*flags=*/0, &op); + const xnn_status status = xnn_create_rope_nthc_f16(/*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } @@ -239,7 +237,7 @@ TEST_F(RoPETestF32, matches_operator_api) std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); std::generate(weights.begin(), weights.end(), [&]() { return f32dist(rng); }); - const xnn_status status = xnn_create_rope_nthc_f32(max_tokens, /*flags=*/0, &op); + const xnn_status status = xnn_create_rope_nthc_f32(/*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } diff --git a/test/rsum-microkernel-tester.h b/test/rsum-microkernel-tester.h index 3bf2df6a0e8..e7884719b54 100644 --- a/test/rsum-microkernel-tester.h +++ b/test/rsum-microkernel-tester.h @@ -18,10 +18,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class RSumMicrokernelTester { diff --git a/test/s32-f32-vcvt.cc b/test/s32-f32-vcvt.cc index 1bf048248a2..26f3916b37e 100644 --- a/test/s32-f32-vcvt.cc +++ b/test/s32-f32-vcvt.cc @@ -19,5 +19,5 @@ XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_ou XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, 
init_params); \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/s32-f32-vcvt/s32-f32-vcvt.h" +#include "s32-f32-vcvt/s32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/s32-vmul.cc b/test/s32-vmul.cc index 0d4eeb7ab50..366aa3ebca4 100644 --- a/test/s32-vmul.cc +++ b/test/s32-vmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/s32-vmul/s32-vmul.h" +#include "s32-vmul/s32-vmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/s32-vmulc.cc b/test/s32-vmulc.cc index 35dd5ca65d7..6d5756f0f43 100644 --- a/test/s32-vmulc.cc +++ b/test/s32-vmulc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/s32-vmul/s32-vmulc.h" +#include "s32-vmul/s32-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/s8-vclamp.cc b/test/s8-vclamp.cc index 153893d3a2c..ee0c42e78e3 100644 --- a/test/s8-vclamp.cc +++ b/test/s8-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/s8-vclamp/s8-vclamp.h" +#include "s8-vclamp/s8-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/scaled-dot-product-attention.cc b/test/scaled-dot-product-attention.cc index e335bbadfe9..d5270f9c28d 100644 --- a/test/scaled-dot-product-attention.cc +++ b/test/scaled-dot-product-attention.cc @@ -20,6 +20,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/subgraph.h" #include "replicable_random_device.h" diff --git a/test/sigmoid.cc b/test/sigmoid.cc index 377f73812bb..b8ffe05bc24 100644 --- a/test/sigmoid.cc +++ b/test/sigmoid.cc @@ -11,6 +11,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/softmax.cc b/test/softmax.cc index b8e80605a8f..d059d9c9787 100644 --- a/test/softmax.cc +++ b/test/softmax.cc @@ -13,6 +13,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/space-to-depth-2d.cc b/test/space-to-depth-2d.cc index f3b7b47af00..7e5d77f2efb 100644 --- a/test/space-to-depth-2d.cc +++ 
b/test/space-to-depth-2d.cc @@ -14,6 +14,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/square-root.cc b/test/square-root.cc index e8acac879fe..62ac4780330 100644 --- a/test/square-root.cc +++ b/test/square-root.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/square.cc b/test/square.cc index 431ae4adb09..da2908cb444 100644 --- a/test/square.cc +++ b/test/square.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/static-expand-dims.cc b/test/static-expand-dims.cc index 3bc8ec7b56c..3d26649cf6c 100644 --- a/test/static-expand-dims.cc +++ b/test/static-expand-dims.cc @@ -16,6 +16,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" @@ -77,10 +78,11 @@ TEST_F(StaticExpandDimsTestInt8, define) nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, zero_point, scale, dims.size(), dims.data(), + subgraph, xnn_datatype_qint8, zero_point, scale, expected_shape.size(), expected_shape.data(), nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -135,10 +137,11 @@ TEST_F(StaticExpandDimsTestInt8, matches_operator_api) nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, zero_point, scale, dims.size(), dims.data(), + subgraph, xnn_datatype_qint8, zero_point, scale, expected_shape.size(), expected_shape.data(), nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -160,7 +163,6 @@ TEST_F(StaticExpandDimsTestInt8, matches_operator_api) std::vector out_dims(XNN_MAX_TENSOR_DIMS); ASSERT_EQ(xnn_status_success, xnn_get_external_value_shape(runtime, output_id, &num_out_dims, &out_dims[0])); out_dims.resize(num_out_dims); - CalculateExpectedShape(); EXPECT_EQ(expected_shape, out_dims); } @@ -179,10 +181,11 @@ TEST_F(StaticExpandDimsTestF16, define) nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), + subgraph, xnn_datatype_fp16, expected_shape.size(), expected_shape.data(), nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -235,10 +238,11 @@ TEST_F(StaticExpandDimsTestF16, matches_operator_api) nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, 
dims.size(), dims.data(), + subgraph, xnn_datatype_fp16, expected_shape.size(), expected_shape.data(), nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -260,6 +264,5 @@ TEST_F(StaticExpandDimsTestF16, matches_operator_api) std::vector out_dims(XNN_MAX_TENSOR_DIMS); ASSERT_EQ(xnn_status_success, xnn_get_external_value_shape(runtime, output_id, &num_out_dims, &out_dims[0])); out_dims.resize(num_out_dims); - CalculateExpectedShape(); EXPECT_EQ(expected_shape, out_dims); } diff --git a/test/static-reduce.cc b/test/static-reduce.cc index 181f18c44fe..9dff65376bb 100644 --- a/test/static-reduce.cc +++ b/test/static-reduce.cc @@ -311,7 +311,7 @@ INSTANTIATE_TEST_SUITE_P(ReduceTest, ReduceTest, Values(xnn_reduce_sum, xnn_reduce_mean), Bool())), [](auto p) { return p.param.Name(); }); -TEST_P(ReduceTest, SubgraphDefineWorks) { +TEST_P(ReduceTest, define) { const Param p = GetParam(); ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); @@ -347,7 +347,7 @@ TEST_P(ReduceTest, SubgraphDefineWorks) { ASSERT_EQ(node->flags, p.keep_dims ? XNN_FLAG_KEEP_DIMS : 0); } -TEST_P(ReduceTest, SubgraphAPIResultsMatchesOperatorAPI) { +TEST_P(ReduceTest, matches_operator_api) { const Param p = GetParam(); ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); @@ -463,10 +463,12 @@ TEST_P(ReduceTest, SubgraphAPIResultsMatchesOperatorAPI) { CompareOutputs(p.datatype); } -TEST_P(ReduceTest, ReshapingWorks) { +TEST_P(ReduceTest, reshape) { const Param p = GetParam(); ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + GenerateRandomInput(p.datatype); + // Call subgraph API. xnn_subgraph_t subgraph = nullptr; ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); diff --git a/test/static-reshape.cc b/test/static-reshape.cc index fccc6b63608..9d2df5ed362 100644 --- a/test/static-reshape.cc +++ b/test/static-reshape.cc @@ -16,6 +16,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/static-resize-bilinear-2d.cc b/test/static-resize-bilinear-2d.cc index ef624886892..25ebe70d40b 100644 --- a/test/static-resize-bilinear-2d.cc +++ b/test/static-resize-bilinear-2d.cc @@ -15,10 +15,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class StaticResizeBilinear2DTestBase : public ::testing::Test { diff --git a/test/static-slice.cc b/test/static-slice.cc index 2673f3ba433..ff107cecbff 100644 --- a/test/static-slice.cc +++ b/test/static-slice.cc @@ -16,6 +16,7 @@ #include #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/static-transpose.cc b/test/static-transpose.cc index eeaa0a2e797..084e1a9cf3b 100644 --- a/test/static-transpose.cc +++ b/test/static-transpose.cc @@ -15,6 +15,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/subgraph-fp16.cc b/test/subgraph-fp16.cc index b6a2fa4fc1d..6c3e2f575c7 100644 --- a/test/subgraph-fp16.cc +++ b/test/subgraph-fp16.cc @@ -17,6 +17,7 @@ #include #include "xnnpack.h" #include 
"xnnpack/allocation-type.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/subgraph.h" #include "mock-allocator.h" diff --git a/test/subgraph-size.c b/test/subgraph-size.c index 1dcbdb50176..0cffe7d11ed 100644 --- a/test/subgraph-size.c +++ b/test/subgraph-size.c @@ -71,9 +71,6 @@ int main(int argc, char** argv) { case 7: xnn_define_binary(NULL, xnn_binary_add, NULL, 0, 0, 0, 0); break; - case 9: - xnn_define_prelu(NULL, 0, 0, 0, 0); - break; case 10: xnn_define_clamp(NULL, 0.0f, 0.0f, 0, 0, 0); break; diff --git a/test/tanh-operator-tester.h b/test/tanh-operator-tester.h index 017e4602cd0..ec5dc2eaf24 100644 --- a/test/tanh-operator-tester.h +++ b/test/tanh-operator-tester.h @@ -18,6 +18,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "replicable_random_device.h" class TanhOperatorTester { diff --git a/test/tanh.cc b/test/tanh.cc index a8bd67271c0..241f6bdd1b5 100644 --- a/test/tanh.cc +++ b/test/tanh.cc @@ -11,6 +11,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/u32-f32-vcvt.cc b/test/u32-f32-vcvt.cc index aa750e94bef..48ffb59b3ca 100644 --- a/test/u32-f32-vcvt.cc +++ b/test/u32-f32-vcvt.cc @@ -19,5 +19,5 @@ XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_ou XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/u32-f32-vcvt/u32-f32-vcvt.h" +#include "u32-f32-vcvt/u32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/u8-vclamp.cc b/test/u8-vclamp.cc index 4faa4670e1f..ebcec214a40 100644 --- a/test/u8-vclamp.cc +++ b/test/u8-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/u8-vclamp/u8-vclamp.h" +#include "u8-vclamp/u8-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/unary-operator-tester.cc b/test/unary-operator-tester.cc index 56c12497b27..83044400c0a 100644 --- a/test/unary-operator-tester.cc +++ b/test/unary-operator-tester.cc @@ -21,6 +21,7 @@ #include #include "xnnpack.h" #include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "replicable_random_device.h" namespace xnnpack { diff --git a/test/vbinary-microkernel-tester.cc b/test/vbinary-microkernel-tester.cc index 0f64451d260..ee3fd1cc7c7 100644 --- a/test/vbinary-microkernel-tester.cc +++ b/test/vbinary-microkernel-tester.cc @@ -21,11 +21,12 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" void VBinaryMicrokernelTester::Test(xnn_f16_vbinary_ukernel_fn vbinary, diff --git a/test/vcmul-microkernel-tester.h b/test/vcmul-microkernel-tester.h index 56824def81f..0dc567139df 100644 --- a/test/vcmul-microkernel-tester.h +++ 
b/test/vcmul-microkernel-tester.h @@ -16,9 +16,10 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/isa-checks.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class VCMulMicrokernelTester { diff --git a/test/vcvt-microkernel-tester.cc b/test/vcvt-microkernel-tester.cc index 50a356313fc..d294192cf78 100644 --- a/test/vcvt-microkernel-tester.cc +++ b/test/vcvt-microkernel-tester.cc @@ -77,10 +77,6 @@ void VCvtMicrokernelTester::Test( void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, xnn_init_f16_qs8_cvt_params_fn init_params) { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); ASSERT_LE(output_zero_point(), std::numeric_limits::max()); @@ -101,8 +97,7 @@ void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, struct xnn_f16_qs8_cvt_params params; - init_params(&params, scale(), - output_zero_point(), qmin(), qmax()); + init_params(&params, scale(), output_zero_point()); // Call optimized micro-kernel. vcvt(batch_size() * sizeof(xnn_float16), input.data(), output.data(), &params); @@ -111,9 +106,11 @@ void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, for (size_t i = 0; i < batch_size(); i++) { float scaled_input = input[i] * scale_fp16; scaled_input = std::min( - scaled_input, static_cast(qmax() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::max() - + output_zero_point())); scaled_input = std::max( - scaled_input, static_cast(qmin() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::min() - + output_zero_point())); output_ref[i] = static_cast( std::lrintf(scaled_input) + static_cast(output_zero_point())); } @@ -134,10 +131,6 @@ void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, void VCvtMicrokernelTester::Test( xnn_f32_qs8_vcvt_ukernel_fn vcvt, xnn_init_f32_qs8_cvt_params_fn init_params) const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); ASSERT_LE(output_zero_point(), std::numeric_limits::max()); @@ -151,7 +144,7 @@ void VCvtMicrokernelTester::Test( std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); struct xnn_f32_qs8_cvt_params params; - init_params(&params, scale(), output_zero_point(), qmin(), qmax()); + init_params(&params, scale(), output_zero_point()); // Call optimized micro-kernel. 
vcvt(batch_size() * sizeof(float), input.data(), output.data(), &params); @@ -160,9 +153,11 @@ void VCvtMicrokernelTester::Test( for (size_t i = 0; i < batch_size(); i++) { float scaled_input = input[i] * scale(); scaled_input = std::min( - scaled_input, static_cast(qmax() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::max() - + output_zero_point())); scaled_input = std::max( - scaled_input, static_cast(qmin() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::min() - + output_zero_point())); output_ref[i] = static_cast( std::lrintf(scaled_input) + static_cast(output_zero_point())); } @@ -181,10 +176,6 @@ void VCvtMicrokernelTester::Test( void VCvtMicrokernelTester::Test( xnn_f32_qu8_vcvt_ukernel_fn vcvt, xnn_init_f32_qu8_cvt_params_fn init_params) const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); ASSERT_LE(output_zero_point(), std::numeric_limits::max()); @@ -198,7 +189,7 @@ void VCvtMicrokernelTester::Test( std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); struct xnn_f32_qu8_cvt_params params; - init_params(&params, scale(), output_zero_point(), qmin(), qmax()); + init_params(&params, scale(), output_zero_point()); // Call optimized micro-kernel. vcvt(batch_size() * sizeof(float), input.data(), output.data(), &params); @@ -207,9 +198,11 @@ void VCvtMicrokernelTester::Test( for (size_t i = 0; i < batch_size(); i++) { float scaled_input = input[i] * scale(); scaled_input = std::min( - scaled_input, static_cast(qmax() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::max() - + output_zero_point())); scaled_input = std::max( - scaled_input, static_cast(qmin() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::min() - + output_zero_point())); output_ref[i] = static_cast( std::lrintf(scaled_input) + static_cast(output_zero_point())); } diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h index f1f7689b27c..927f4429957 100644 --- a/test/vcvt-microkernel-tester.h +++ b/test/vcvt-microkernel-tester.h @@ -52,20 +52,6 @@ class VCvtMicrokernelTester { int16_t output_zero_point() const { return this->output_zero_point_; } - VCvtMicrokernelTester& qmin(int16_t qmin) { - this->qmin_ = qmin; - return *this; - } - - int16_t qmin() const { return this->qmin_; } - - VCvtMicrokernelTester& qmax(int16_t qmax) { - this->qmax_ = qmax; - return *this; - } - - int16_t qmax() const { return this->qmax_; } - VCvtMicrokernelTester& iterations(size_t iterations) { this->iterations_ = iterations; return *this; @@ -114,8 +100,6 @@ class VCvtMicrokernelTester { float scale_ = 1.75f; int16_t input_zero_point_ = 0; int16_t output_zero_point_ = 5; - int16_t qmin_ = std::numeric_limits::min(); - int16_t qmax_ = std::numeric_limits::max(); size_t batch_size_ = 1; size_t iterations_ = 15; }; @@ -124,8 +108,6 @@ template VCvtMicrokernelTester make_vcvt_tester() { if (std::is_integral::value) { return VCvtMicrokernelTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .output_zero_point(std::numeric_limits::min() / 2 + std::numeric_limits::max() / 2 + 1); } else { @@ -272,43 +254,3 @@ VCvtMicrokernelTester make_vcvt_tester() { .Test(__VA_ARGS__); \ } \ } - -#define XNN_TEST_CVT_QMIN(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...)
\ - TEST(ukernel, qmin) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (int32_t qmin = std::numeric_limits::min(); \ - qmin < std::numeric_limits::max(); qmin += 51) { \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .scale(500) \ - .qmin(qmin) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } - -#define XNN_TEST_CVT_QMAX(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, qmax) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (int32_t qmax = std::numeric_limits::min() + 1; \ - qmax <= std::numeric_limits::max(); qmax += 51) { \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .scale(500) \ - .qmax(qmax) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } diff --git a/test/vhswish-microkernel-tester.h b/test/vhswish-microkernel-tester.h index 9ef61f6a069..d12b3476c08 100644 --- a/test/vhswish-microkernel-tester.h +++ b/test/vhswish-microkernel-tester.h @@ -85,6 +85,13 @@ class VHSwishMicrokernelTester { return this->iterations_; } + VHSwishMicrokernelTester& inplace(bool inplace) { + this->inplace_ = inplace; + return *this; + } + + bool inplace() const { return this->inplace_; } + void Test(xnn_qs8_vhswish_ukernel_fn vhswish, xnn_init_qs8_hswish_params_fn init_params) const { ASSERT_GE(input_zero_point(), std::numeric_limits::min()); ASSERT_LE(input_zero_point(), std::numeric_limits::max()); @@ -100,15 +107,9 @@ class VHSwishMicrokernelTester { xnnpack::Buffer output_ref(batch_size()); for (size_t iteration = 0; iteration < iterations(); iteration++) { std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - for (int i = 0; i < batch_size(); i++) { - input[i] = i; - } union xnn_qs8_hswish_params params; init_params(&params, input_zero_point(), output_zero_point(), input_scale(), output_scale()); - // Call optimized micro-kernel. - vhswish(batch_size() * sizeof(int8_t), input.data(), output.data(), &params); - // Compute reference results const int32_t input_scale_div = (int32_t) lrintf(256.0f * input_scale() / 6.0f); const int32_t scale_ratio = (int32_t) lrintf(256.0f * input_scale() / output_scale()); @@ -125,6 +126,13 @@ class VHSwishMicrokernelTester { output_ref[i] = static_cast(output_value); } + // Call optimized micro-kernel. + vhswish(batch_size() * sizeof(int8_t), input.data(), inplace() ? input.data() : output.data(), &params); + + if (inplace()) { + std::copy_n(input.data(), batch_size(), output.data()); + } + + // Verify results. for (size_t i = 0; i < batch_size(); i++) { EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) @@ -149,15 +157,9 @@ class VHSwishMicrokernelTester { xnnpack::Buffer output_ref(batch_size()); for (size_t iteration = 0; iteration < iterations(); iteration++) { std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - for (int i = 0; i < batch_size(); i++) { - input[i] = i; - } union xnn_qu8_hswish_params params; init_params(&params, input_zero_point(), output_zero_point(), input_scale(), output_scale()); - // Call optimized micro-kernel.
- vhswish(batch_size() * sizeof(uint8_t), input.data(), output.data(), &params); - // Compute reference results const int32_t input_scale_div = (int32_t) lrintf(256.0f * input_scale() / 6.0f); const int32_t scale_ratio = (int32_t) lrintf(256.0f * input_scale() / output_scale()); @@ -174,6 +176,13 @@ class VHSwishMicrokernelTester { output_ref[i] = static_cast(output_value); } + // Call optimized micro-kernel. + vhswish(batch_size() * sizeof(uint8_t), input.data(), inplace() ? input.data() : output.data(), &params); + + if (inplace()) { + std::copy_n(input.data(), batch_size(), output.data()); + } + + // Verify results. for (size_t i = 0; i < batch_size(); i++) { EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) @@ -190,4 +199,68 @@ class VHSwishMicrokernelTester { int16_t output_zero_point_ = 5; size_t batch_size_ = 1; size_t iterations_ = 15; + bool inplace_ = false; }; + +#define XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_eq) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + VHSwishMicrokernelTester() \ + .batch_size(batch_tile * batch_scale) \ + .Test(__VA_ARGS__); \ + } + +#define XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_div) { \ + if (batch_tile == 1) return; \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_step = batch_tile * batch_scale; \ + for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; \ + batch_size += batch_step) { \ + VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ + } \ + } + +#define XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_lt) { \ + if (batch_tile == 1) return; \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + for (size_t batch_size = 1; batch_size < batch_end; batch_size++) { \ + VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ + } \ + } + +#define XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_gt) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_step = batch_tile * batch_scale; \ + const size_t batch_end = batch_tile == 1 ? 10 : 2 * batch_step; \ + for (size_t batch_size = batch_step + 1; batch_size < batch_end; \ + batch_size++) { \ + VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ + } \ + } + +#define XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ...) \ + TEST(ukernel, inplace) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + const size_t batch_step = std::max(1, batch_tile - 1); \ + for (size_t batch_size = 1; batch_size <= batch_end; \ + batch_size += batch_step) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .inplace(true) \ + .Test(__VA_ARGS__); \ + } \ + } diff --git a/test/vlog-microkernel-tester.h b/test/vlog-microkernel-tester.h deleted file mode 100644 index 19a10e4393b..00000000000 --- a/test/vlog-microkernel-tester.h +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -extern "C" XNN_INTERNAL const uint16_t xnn_table_vlog[129]; - -class VLogMicrokernelTester { - public: - VLogMicrokernelTester& batch(size_t batch) { - assert(batch != 0); - this->batch_ = batch; - return *this; - } - - size_t batch() const { - return this->batch_; - } - - VLogMicrokernelTester& input_lshift(uint32_t input_lshift) { - assert(input_lshift < 32); - this->input_lshift_ = input_lshift; - return *this; - } - - uint32_t input_lshift() const { - return this->input_lshift_; - } - - VLogMicrokernelTester& output_scale(uint32_t output_scale) { - this->output_scale_ = output_scale; - return *this; - } - - uint32_t output_scale() const { - return this->output_scale_; - } - - VLogMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { - return this->inplace_; - } - - VLogMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_u32_vlog_ukernel_fn vlog) const { - xnnpack::ReplicableRandomDevice rng; - auto i16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - auto i32rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - xnnpack::Buffer x(batch() + XNN_EXTRA_BYTES / sizeof(uint32_t)); - xnnpack::Buffer y(batch() * (inplace() ? sizeof(uint32_t) / sizeof(uint16_t) : 1) + XNN_EXTRA_BYTES / sizeof(uint32_t)); - xnnpack::Buffer y_ref(batch()); - const uint32_t* x_data = inplace() ? reinterpret_cast(y.data()) : x.data(); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), std::ref(i32rng)); - std::generate(y.begin(), y.end(), std::ref(i16rng)); - std::generate(y_ref.begin(), y_ref.end(), std::ref(i16rng)); - - // Compute reference results. - for (size_t n = 0; n < batch(); n++) { - const uint32_t x_value = x_data[n]; - const uint32_t scaled = x_value << input_lshift(); - uint32_t log_value = 0; - if (scaled != 0) { - const uint32_t out_scale = output_scale(); - - const int log_scale = 65536; - const int log_scale_log2 = 16; - const int log_coeff = 45426; - const uint32_t log2x = math_clz_nonzero_u32(scaled) ^ 31; // log2 of scaled - assert(log2x < 32); - - // Number of segments in the log lookup table. The table will be log_segments+1 - // in length (with some padding). - const int log_segments_log2 = 7; - - // Part 1 - uint32_t frac = scaled - (UINT32_C(1) << log2x); - - // Shift the fractional part into msb of 16 bits - frac = XNN_UNPREDICTABLE(log2x < log_scale_log2) ? 
- (frac << (log_scale_log2 - log2x)) : - (frac >> (log2x - log_scale_log2)); - - // Part 2 - const uint32_t base_seg = frac >> (log_scale_log2 - log_segments_log2); - const uint32_t seg_unit = (UINT32_C(1) << log_scale_log2) >> log_segments_log2; - - assert(128 == (1 << log_segments_log2)); - assert(base_seg < (1 << log_segments_log2)); - - const uint32_t c0 = xnn_table_vlog[base_seg]; - const uint32_t c1 = xnn_table_vlog[base_seg + 1]; - const uint32_t seg_base = seg_unit * base_seg; - const uint32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> log_scale_log2; - const uint32_t fraction = frac + c0 + rel_pos; - - const uint32_t log2 = (log2x << log_scale_log2) + fraction; - const uint32_t round = log_scale / 2; - const uint32_t loge = (((uint64_t) log_coeff) * log2 + round) >> log_scale_log2; - - // Finally scale to our output scale - log_value = (out_scale * loge + round) >> log_scale_log2; - } - - const uint32_t vout = math_min_u32(log_value, (uint32_t) INT16_MAX); - y_ref[n] = vout; - } - - // Call optimized micro-kernel. - vlog(batch(), x_data, input_lshift(), output_scale(), y.data()); - - // Verify results. - for (size_t n = 0; n < batch(); n++) { - EXPECT_EQ(y[n], y_ref[n]) - << ", input_lshift " << input_lshift() - << ", output_scale " << output_scale() - << ", batch " << n << " / " << batch(); - } - } - } - - private: - size_t batch_{1}; - uint32_t input_lshift_{4}; - uint32_t output_scale_{16}; - bool inplace_{false}; - size_t iterations_{15}; -}; diff --git a/test/vmulcaddc-microkernel-tester.h b/test/vmulcaddc-microkernel-tester.h index 0594e3ead4d..dd178750fba 100644 --- a/test/vmulcaddc-microkernel-tester.h +++ b/test/vmulcaddc-microkernel-tester.h @@ -16,10 +16,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/pack.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class VMulCAddCMicrokernelTester { diff --git a/test/vunary-microkernel-tester.cc b/test/vunary-microkernel-tester.cc index 130053b82ee..6e7bd0ed7bd 100644 --- a/test/vunary-microkernel-tester.cc +++ b/test/vunary-microkernel-tester.cc @@ -18,10 +18,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" #ifndef M_SQRT1_2 diff --git a/test/x16-packw.cc b/test/x16-packw.cc index cb3f26371fe..9ca158b2be9 100644 --- a/test/x16-packw.cc +++ b/test/x16-packw.cc @@ -32,7 +32,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale }, const XnnTestParam xnn_test_params[] = { -#include "src/x16-packw/x16-packw.h" +#include "x16-packw/x16-packw.h" }; #undef XNN_UKERNEL diff --git a/test/x32-packb.cc b/test/x32-packb.cc index 66d8c909cbb..ff90b1d2adb 100644 --- a/test/x32-packb.cc +++ b/test/x32-packb.cc @@ -32,8 +32,8 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, PackBMicrokernelTester::Kernel{ukernel}, arch_flags, channel_tile, channel_subtile, channel_round }, const XnnTestParam xnn_test_params[] = { -#include "src/x32-packb/x32-packb.h" -#include "src/x32-zerob/x32-zerob.h" +#include "x32-packb/x32-packb.h" +#include "x32-zerob/x32-zerob.h" }; #undef XNN_UKERNEL diff --git a/test/x32-packw.cc b/test/x32-packw.cc index d2bc52b2312..3cb4b52cb08 100644 --- a/test/x32-packw.cc +++ 
b/test/x32-packw.cc @@ -32,7 +32,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale }, const XnnTestParam xnn_test_params[] = { -#include "src/x32-packw/x32-packw.h" +#include "x32-packw/x32-packw.h" }; #undef XNN_UKERNEL diff --git a/test/x32-packx.cc b/test/x32-packx.cc index 5986e3b1dc8..58deea144aa 100644 --- a/test/x32-packx.cc +++ b/test/x32-packx.cc @@ -32,7 +32,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, k, mr }, const XnnTestParam xnn_test_params[] = { -#include "src/x32-packx/x32-packx.h" +#include "x32-packx/x32-packx.h" }; #undef XNN_UKERNEL diff --git a/test/x8-lut.cc b/test/x8-lut.cc index 2edfd7537bd..b986d436dd6 100644 --- a/test/x8-lut.cc +++ b/test/x8-lut.cc @@ -1040,7 +1040,7 @@ TEST(X8_LUT__SCALAR_U16, inplace) { #endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U64, batch_eq_64) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1084,10 +1084,10 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u64); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U128, batch_eq_128) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1131,10 +1131,10 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U192, batch_eq_192) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1178,10 +1178,10 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u192); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U256, batch_eq_256) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1225,7 +1225,7 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u256); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/x8-packq.cc b/test/x8-packq.cc index 1ae6cd9d4f7..d0df2abcfd8 100644 --- a/test/x8-packq.cc +++ b/test/x8-packq.cc @@ -33,7 +33,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, unroll }, const XnnTestParam xnn_test_params[] = { -#include "src/x8-packq/x8-packq.h" +#include "x8-packq/x8-packq.h" }; #undef XNN_UKERNEL diff --git a/test/x8-packw.cc b/test/x8-packw.cc index c6bdcc55ab7..12694864f60 100644 --- a/test/x8-packw.cc +++ b/test/x8-packw.cc @@ -33,7 +33,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale }, const XnnTestParam xnn_test_params[] = { -#include "src/x8-packw/x8-packw.h" +#include "x8-packw/x8-packw.h" }; #undef XNN_UKERNEL diff --git 
a/test/xN-transpose.cc b/test/xN-transpose.cc index 8cd0fc7ed35..b55d226daac 100644 --- a/test/xN-transpose.cc +++ b/test/xN-transpose.cc @@ -97,12 +97,12 @@ TestParams transpose_ukernels[] = { block_width, block_height) \ {#ukernel, arch_flags, make_ukernel_wrapper(ukernel), \ element_size, block_width, block_height}, -#include "src/x8-transposec/x8-transposec.h" -#include "src/x16-transposec/x16-transposec.h" -#include "src/x24-transposec/x24-transposec.h" -#include "src/x32-transposec/x32-transposec.h" -#include "src/x64-transposec/x64-transposec.h" -#include "src/xx-transposev/xx-transposev.h" +#include "x8-transposec/x8-transposec.h" +#include "x16-transposec/x16-transposec.h" +#include "x24-transposec/x24-transposec.h" +#include "x32-transposec/x32-transposec.h" +#include "x64-transposec/x64-transposec.h" +#include "xx-transposev/xx-transposev.h" }; #undef XNN_TRANSPOSE_UKERNEL @@ -112,7 +112,7 @@ TestParams transposev_ukernels[] = { block_width, block_height) \ {#ukernel, arch_flags, make_ukernel_wrapper(ukernel), \ element_size, block_width, block_height}, -#include "src/xx-transposev/xx-transposev.h" +#include "xx-transposev/xx-transposev.h" }; #undef XNN_TRANSPOSE_UKERNEL diff --git a/test/xx-fill.cc b/test/xx-fill.cc index c22c5c1fcb1..ce141aa2956 100644 --- a/test/xx-fill.cc +++ b/test/xx-fill.cc @@ -128,7 +128,7 @@ struct TestParams { #define XNN_FILL_UKERNEL(arch_flags, ukernel) {#ukernel, arch_flags, ukernel}, TestParams test_params[] = { -#include "src/xx-fill/xx-fill.h" +#include "xx-fill/xx-fill.h" }; #undef XNN_FILL_UKERNEL diff --git a/test/xx-pad.cc b/test/xx-pad.cc index 776bba6419c..33cc22424e4 100644 --- a/test/xx-pad.cc +++ b/test/xx-pad.cc @@ -188,7 +188,7 @@ struct TestParams { #define XNN_PAD_UKERNEL(arch_flags, ukernel, tile_size) \ {#ukernel, arch_flags, ukernel, tile_size}, TestParams test_params[] = { -#include "src/xx-pad/xx-pad.h" +#include "xx-pad/xx-pad.h" }; #undef XNN_PAD_UKERNEL diff --git a/tools/generate-gemm-test.py b/tools/generate-gemm-test.py index 500bd8e1644..e776ddabef0 100755 --- a/tools/generate-gemm-test.py +++ b/tools/generate-gemm-test.py @@ -881,8 +881,8 @@ def main(args): // Generator: {generator} #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/tools/generate-prelu-test.py b/tools/generate-prelu-test.py deleted file mode 100755 index 1b901d8c20c..00000000000 --- a/tools/generate-prelu-test.py +++ /dev/null @@ -1,235 +0,0 @@ -#!/usr/bin/env python -# Copyright 2019 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='PReLU microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_(f16|f32)_prelu_ukernel__(.+)_(\d+)x(\d+)", name) - assert match is not None - row_tile = int(match.group(3)) - channel_tile = int(match.group(4)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(2)) - return row_tile, channel_tile, arch, isa - - -PRELU_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(${CHANNEL_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if CHANNEL_TILE > 1: - TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*10}; channels += ${CHANNEL_TILE}) { - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) { - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) { - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } -} - -$if ROW_TILE > 1: - TEST(${TEST_NAME}, rows_lt_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows < ${ROW_TILE}; rows++) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(${TEST_NAME}, rows_div_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = ${ROW_TILE*2}; rows <= ${ROW_TILE*4}; rows += ${ROW_TILE}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - -TEST(${TEST_NAME}, rows_gt_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = ${ROW_TILE+1}; rows < ${ROW_TILE*2}; rows++) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, input_stride) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows <= ${ROW_TILE*3}; rows += ${max(1, ROW_TILE-1)}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(${next_prime(CHANNEL_TILE*5+1)}) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - 
-TEST(${TEST_NAME}, output_stride) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows <= ${ROW_TILE*3}; rows += ${max(1, ROW_TILE-1)}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(${next_prime(CHANNEL_TILE*5+1)}) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, inplace) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows <= ${ROW_TILE*3}; rows += ${max(1, ROW_TILE-1)}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } - } -} -""" - - -def generate_test_cases(ukernel, row_tile, channel_tile, isa): - """Generates all tests cases for a PRELU micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - row_tile: Number of rows (pixels) processed per one iteration of the outer - loop of the micro-kernel. - channel_tile: Number of channels processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(PRELU_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "ROW_TILE": row_tile, - "CHANNEL_TILE": channel_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/prelu.h" -#include "prelu-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - row_tile, channel_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, row_tile, channel_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-rdsum-benchmark.py b/tools/generate-rdsum-benchmark.py index 77f572220de..3a81702ded2 100755 --- a/tools/generate-rdsum-benchmark.py +++ b/tools/generate-rdsum-benchmark.py @@ -109,8 +109,8 @@ def main(args): // Specification: {specification} // Generator: {generator} -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/tools/generate-spmm-test.py b/tools/generate-spmm-test.py index 23f9b3f10ed..9a5f3f14836 100755 --- a/tools/generate-spmm-test.py +++ b/tools/generate-spmm-test.py @@ -486,8 +486,8 @@ def main(args): // Generator: {generator} #include -#include "bench/spmm-benchmark.h" -#include "bench/utils.h" +#include "spmm-benchmark.h" +#include "utils.h" #include "xnnpack/gemm.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" diff --git a/tools/generate-vhswish-test.py b/tools/generate-vhswish-test.py deleted file mode 100755 index 4c9310c6bb8..00000000000 --- a/tools/generate-vhswish-test.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -import xngen -import xnncommon - - -parser = argparse.ArgumentParser( - description='Vector Hardswish operation microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_(qs8|qu8)_vhswish_ukernel__(.+)_u(\d+)(v)?", name) - if match is None: - raise ValueError("Unexpected microkernel name: " + name) - - datatype = match.group(1) - batch_tile = int(match.group(3)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(2)) - return datatype, batch_tile, arch, isa - - -HSWISH_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - VHSwishMicrokernelTester() - .batch_size(${BATCH_TILE}) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = ${BATCH_TILE*2}; batch_size < ${BATCH_TILE*10}; batch_size += ${BATCH_TILE}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = 1; batch_size < ${BATCH_TILE}; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = ${BATCH_TILE+1}; batch_size < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, input_scale) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, output_scale) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, input_zero_point) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - $if DATATYPE == "QU8": - 
.output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, output_zero_point) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(${", ".join(TEST_ARGS)}); - } - } -} -""" - - -def generate_test_cases(ukernel, init_fn, datatype, batch_tile, isa): - """Generates all tests cases for a Vector Hardswish micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - init_fn: C name of the function to initialize microkernel parameters. - datatype: data type. - batch_tile: Number of batch elements processed per one iteration of the - inner loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - test_args = [ukernel] - if init_fn: - test_args.append(init_fn) - return xngen.preprocess(HSWISH_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": test_args, - "BATCH_TILE": batch_tile, - "DATATYPE": datatype.upper(), - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: {specification} -// Generator: {generator} - - -#include - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vhswish.h" -#include "vhswish-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - init_fn = ukernel_spec.get("init") - datatype, batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases( - name, init_fn, datatype, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-vlog-test.py b/tools/generate-vlog-test.py deleted file mode 100755 index 96522b43217..00000000000 --- a/tools/generate-vlog-test.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='VLog microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_u32_vlog_ukernel__(.+)_x(\d+)", name) - assert match is not None - batch_tile = int(match.group(2)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return batch_tile, arch, isa - - -VLOG_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, DISABLED_batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - VLogMicrokernelTester() - .batch(${BATCH_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, DISABLED_batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE*2}; batch < ${BATCH_TILE*10}; batch += ${BATCH_TILE}) { - VLogMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, DISABLED_batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 1; batch < ${BATCH_TILE}; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, DISABLED_batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, DISABLED_input_lshift) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (uint32_t input_lshift = 0; input_lshift < 32; input_lshift++) { - VLogMicrokernelTester() - .batch(${BATCH_TILE}) - .input_lshift(input_lshift) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, DISABLED_output_scale) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (uint32_t output_scale = 0; output_scale < 65536; output_scale += ${next_prime(BATCH_TILE + 1)}) { - VLogMicrokernelTester() - .batch(${BATCH_TILE}) - .output_scale(output_scale) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, DISABLED_inplace) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) { - VLogMicrokernelTester() - .batch(batch) - .inplace(true) - .Test(${", ".join(TEST_ARGS)}); - } -} - -""" - - -def generate_test_cases(ukernel, batch_tile, isa): - """Generates all tests cases for a VLog micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - batch_tile: Number of batch processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. 
- """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(VLOG_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "BATCH_TILE": batch_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/vlog.h" -#include "vlog-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-vunary-test.py b/tools/generate-vunary-test.py index 6af8142f6a4..82593c9c74d 100755 --- a/tools/generate-vunary-test.py +++ b/tools/generate-vunary-test.py @@ -18,6 +18,7 @@ ) parser.add_argument("-t", "--tester", metavar="TESTER", required=True, choices=[ + "VHSwishMicrokernelTester", "VLReLUMicrokernelTester", "VUnaryMicrokernelTester"], help="Tester class to be used in the generated test") @@ -106,8 +107,7 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); -$if OP_TYPE != "SquareRootShift": - XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); +XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); $if OP_TYPE == "Clamp": XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); @@ -202,21 +202,65 @@ } } } -$if OP_TYPE == "SquareRootShift": - TEST(ukernel, shift) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - const size_t batch_scale = get_batch_scale(); - const size_t batch_end = batch_tile * batch_scale; - const size_t batch_step = std::max(1, batch_tile - 1); - for (uint32_t shift = 0; shift < 32; shift++) { - for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { - ${TESTER}() - .batch_size(batch_size) - .shift(shift) - .Test(${", ".join(TEST_ARGS)}); +$if OP_TYPE == "HardSwish": + $if "f" not in DATATYPE: + TEST(ukernel, input_scale) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + for (float input_scale : {4.0f, 16.0f, 64.0f}) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + .input_scale(input_scale) + $if "qu8" in DATATYPE: + .input_zero_point(150) + .output_zero_point(100) + .Test(${", ".join(TEST_ARGS)}); + } + } + } + + TEST(ukernel, output_scale) { + 
TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + for (float output_scale : {4.0f, 16.0f, 64.0f}) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + .output_scale(output_scale) + $if "qu8" in DATATYPE: + .input_zero_point(150) + .output_zero_point(100) + .Test(${", ".join(TEST_ARGS)}); + } + } + } + + TEST(ukernel, input_zero_point) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + .input_zero_point(input_zero_point) + $if "qu8" in DATATYPE: + .output_zero_point(100) + .Test(${", ".join(TEST_ARGS)}); + } + } + } + + TEST(ukernel, output_zero_point) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + $if "qu8" in DATATYPE: + .input_zero_point(150) + .output_zero_point(output_zero_point) + .Test(${", ".join(TEST_ARGS)}); + } } } - } $if DATATYPE == "f32" and OP_TYPE in SPECIAL_VALUES_F32: TEST(ukernel, special_values) { TEST_REQUIRES_ARCH_FLAGS(arch_flags); @@ -261,14 +305,12 @@ def main(args): tester = options.tester tester_header = { + "VHSwishMicrokernelTester": "vhswish-microkernel-tester.h", "VLReLUMicrokernelTester": "vlrelu-microkernel-tester.h", "VUnaryMicrokernelTester": "vunary-microkernel-tester.h", }[tester] - op_header = { - "VLReLUMicrokernelTester": "vlrelu.h", - "VUnaryMicrokernelTester": "vunary.h", - }[tester] + op_header = "vunary.h" tests = """\ // Copyright 2019 Google LLC // diff --git a/tools/xnncommon.py b/tools/xnncommon.py index afe72e8b4cf..26c06900303 100644 --- a/tools/xnncommon.py +++ b/tools/xnncommon.py @@ -57,6 +57,7 @@ def _remove_duplicate_newlines(text): "avx256vnnigfni": "XNN_ENABLE_AVX256VNNIGFNI", "avx512f": "XNN_ENABLE_AVX512F", "avx512skx": "XNN_ENABLE_AVX512SKX", + "avx512vbmi": "XNN_ENABLE_AVX512VBMI", "avx512vnni": "XNN_ENABLE_AVX512VNNI", "avx512vnnigfni": "XNN_ENABLE_AVX512VNNIGFNI", "avx512amx": "XNN_ENABLE_AVX512AMX",