Merge branch 'master' into c4_update

yolanda15 · Oct 23, 2024 · 2e2670b
2 parents: 290e2c4 + 8731139
commit 2e2670b
Show file tree

Hide file tree

Showing 987 changed files with 6,027 additions and 35,264 deletions.
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -233,7 +233,6 @@ MICROKERNEL_HDRS = [
     "src/xnnpack/pad.h",
     "src/xnnpack/pavgpool.h",
     "src/xnnpack/ppmm.h",
-    "src/xnnpack/prelu.h",
     "src/xnnpack/quantization.h",
     "src/xnnpack/raddexpminusmax.h",
     "src/xnnpack/raddextexp.h",
@@ -244,9 +243,6 @@ MICROKERNEL_HDRS = [
     "src/xnnpack/unpool.h",
     "src/xnnpack/vbinary.h",
     "src/xnnpack/vcvt.h",
-    "src/xnnpack/vhswish.h",
-    "src/xnnpack/vlog.h",
-    "src/xnnpack/vlrelu.h",
     "src/xnnpack/vmulcaddc.h",
     "src/xnnpack/vscaleexpminusmax.h",
     "src/xnnpack/vscaleextexp.h",
@@ -323,6 +319,7 @@ xnnpack_cc_library(
     deps = [
         ":common",
         ":config_hdrs",
+        ":fp16",
         ":math",
         ":memory",
         ":microparams",
@@ -1247,6 +1244,18 @@ config_setting(
     define_values = {"xnn_enable_avx512skx": "false"},
 )
 
+# Enables usage of Intel AVX512VBMI (evex512) kernels.
+config_setting(
+    name = "xnn_enable_avx512vbmi_explicit_true",
+    define_values = {"xnn_enable_avx512vbmi": "true"},
+)
+
+# Disables usage of Intel AVX512VBMI (evex512) kernels.
+config_setting(
+    name = "xnn_enable_avx512vbmi_explicit_false",
+    define_values = {"xnn_enable_avx512vbmi": "false"},
+)
+
 # Enables usage of Intel AVX512VNNI (evex512) kernels.
 config_setting(
     name = "xnn_enable_avx512vnni_explicit_true",
@@ -1704,6 +1713,22 @@ alias(
     }),
 )
 
+selects.config_setting_group(
+    name = "avx512vbmi_enabled_by_default",
+    match_any = [
+        "//build_config:x86",
+    ],
+)
+
+alias(
+    name = "avx512vbmi_enabled",
+    actual = select({
+        ":xnn_enable_avx512vbmi_explicit_true": ":xnn_enable_avx512vbmi_explicit_true",
+        ":xnn_enable_avx512vbmi_explicit_false": ":xnn_enable_avx512vbmi_explicit_true",
+        "//conditions:default": ":avx512vbmi_enabled_by_default",
+    }),
+)
+
 selects.config_setting_group(
     name = "avx512vnni_enabled_by_default",
     match_any = [

diff --git a/CMakeLists.txt b/CMakeLists.txt
diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel
@@ -51,6 +51,8 @@ OPERATOR_BENCHMARK_DEPS = [
     "//:math",
 ]
 
+############################### Helper libraries ###############################
+
 xnnpack_cxx_library(
     name = "bench_utils",
     srcs = ["utils.cc"],
@@ -98,8 +100,6 @@ cc_library(
     ],
 )
 
-######################### Benchmarks for micro-kernels #########################
-
 xnnpack_cxx_library(
     name = "gemm_benchmark",
     srcs = [
@@ -115,6 +115,29 @@ xnnpack_cxx_library(
     ],
 )
 
+xnnpack_cxx_library(
+    name = "packw_benchmark",
+    hdrs = [
+        "packw-benchmark.h",
+    ],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
+        "@com_google_benchmark//:benchmark",
+    ],
+)
+
+xnnpack_cxx_library(
+    name = "bgemm",
+    hdrs = [
+        "bgemm.h",
+    ],
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        "@com_google_benchmark//:benchmark",
+    ],
+)
+
+######################### Benchmarks for micro-kernels #########################
+
 [xnnpack_benchmark(
     name = "%s_bench" % kernel,
     srcs = [
@@ -167,12 +190,12 @@ xnnpack_cxx_library(
 xnnpack_benchmark(
     name = "f32_bgemm_bench",
     srcs = [
-        "bgemm.h",
         "f32-bgemm.cc",
     ],
     copts = xnnpack_optional_ruy_copts(),
     tags = xnnpack_slow_benchmark_tags(),
     deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
         "//:allocator",
     ] + xnnpack_optional_ruy_deps(),
 )
@@ -192,6 +215,19 @@ xnnpack_benchmark(
     ]),
 )
 
+xnnpack_benchmark(
+    name = "qp8_f32_qb4w_gemm",
+    srcs = ["qp8-f32-qb4w-gemm.cc"],
+    defines = xnnpack_kleidiai_defines(),
+    tags = xnnpack_slow_benchmark_tags(),
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":gemm_benchmark",
+        "//:isa_checks",
+    ] + xnnpack_if_kleidiai_enabled([
+        "@KleidiAI//kai/ukernels/matmul",
+    ]),
+)
+
 [xnnpack_benchmark(
     name = "%s_bench" % kernel,
     srcs = [
@@ -292,6 +328,7 @@ xnnpack_benchmark(
     srcs = [
         "qs8-dwconv.cc",
     ],
+    tags = xnnpack_slow_benchmark_tags(),
     deps = MICROKERNEL_BENCHMARK_DEPS + [
         ":dwconv",
         "//:indirection",
@@ -470,22 +507,23 @@ xnnpack_benchmark(
 xnnpack_cxx_library(
     name = "packq_benchmark",
     srcs = [
-        "bgemm.h",
         "packq-benchmark.cc",
     ],
     hdrs = ["packq-benchmark.h"],
     deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
         "@com_google_benchmark//:benchmark",
     ],
 )
 
 xnnpack_benchmark(
     name = "x8_packq_bench",
     srcs = [
-        "bgemm.h",
         "x8-packq.cc",
     ],
+    tags = xnnpack_slow_benchmark_tags(),
     deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
         ":packq_benchmark",
         "//:allocator",
     ],
@@ -494,47 +532,51 @@ xnnpack_benchmark(
 xnnpack_benchmark(
     name = "x8_packw_bench",
     srcs = [
-        "bgemm.h",
-        "packw-benchmark.h",
         "x8-packw.cc",
     ],
+    tags = xnnpack_slow_benchmark_tags(),
     deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
+        ":packw_benchmark",
         "//:allocator",
     ],
 )
 
 xnnpack_benchmark(
     name = "qs8_packw_bench",
     srcs = [
-        "bgemm.h",
-        "packw-benchmark.h",
         "qs8-packw.cc",
     ],
+    tags = xnnpack_slow_benchmark_tags(),
     deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
+        ":packw_benchmark",
         "//:allocator",
     ],
 )
 
 xnnpack_benchmark(
     name = "x16_packw_bench",
     srcs = [
-        "bgemm.h",
-        "packw-benchmark.h",
         "x16-packw.cc",
     ],
+    tags = xnnpack_slow_benchmark_tags(),
     deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
+        ":packw_benchmark",
         "//:allocator",
     ],
 )
 
 xnnpack_benchmark(
     name = "x32_packw_bench",
     srcs = [
-        "bgemm.h",
-        "packw-benchmark.h",
         "x32-packw.cc",
     ],
+    tags = xnnpack_slow_benchmark_tags(),
     deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":bgemm",
+        ":packw_benchmark",
         "//:allocator",
     ],
 )

diff --git a/bench/abs.cc b/bench/abs.cc
@@ -3,10 +3,10 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "xnnpack.h"
-
 #include "unary_operator.h"
-#include "bench/utils.h"
+#include "utils.h"
+#include "xnnpack.h"
+#include "xnnpack/math.h"
 #include <benchmark/benchmark.h>
 #ifdef BENCHMARK_TENSORFLOW_LITE
 #include "tensorflow/lite/schema/schema_generated.h"

diff --git a/bench/average-pooling.cc b/bench/average-pooling.cc
@@ -27,7 +27,7 @@
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/version.h"
 #endif  // BENCHMARK_TENSORFLOW_LITE
-#include "bench/utils.h"
+#include "utils.h"
 
 static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net) {
   const size_t batch_size = state.range(0);

diff --git a/bench/bankers-rounding.cc b/bench/bankers-rounding.cc
@@ -3,10 +3,10 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "xnnpack.h"
-
 #include "unary_operator.h"
-#include "bench/utils.h"
+#include "utils.h"
+#include "xnnpack.h"
+#include "xnnpack/math.h"
 #include <benchmark/benchmark.h>
 #ifdef BENCHMARK_TENSORFLOW_LITE
 #include "tensorflow/lite/schema/schema_generated.h"

diff --git a/bench/batch-matrix-multiply.cc b/bench/batch-matrix-multiply.cc
@@ -15,7 +15,7 @@
 #include "xnnpack.h"
 
 #include <benchmark/benchmark.h>
-#include "bench/utils.h"
+#include "utils.h"
 #include "xnnpack/buffer.h"
 #ifdef BENCHMARK_TENSORFLOW_LITE
 #include "flatbuffers/include/flatbuffers/flatbuffers.h"

diff --git a/bench/bf16-gemm.cc b/bench/bf16-gemm.cc
@@ -11,8 +11,8 @@
 #include <random>
 #include <vector>
 
-#include "bench/gemm.h"
-#include "bench/utils.h"
+#include "gemm.h"
+#include "utils.h"
 #include "xnnpack.h"
 #include "xnnpack/common.h"
 #include "xnnpack/gemm.h"

diff --git a/bench/ceiling.cc b/bench/ceiling.cc
@@ -3,10 +3,10 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "xnnpack.h"
-
 #include "unary_operator.h"
-#include "bench/utils.h"
+#include "utils.h"
+#include "xnnpack.h"
+#include "xnnpack/math.h"
 #include <benchmark/benchmark.h>
 #ifdef BENCHMARK_TENSORFLOW_LITE
 #include "tensorflow/lite/schema/schema_generated.h"

diff --git a/bench/channel-shuffle.cc b/bench/channel-shuffle.cc
@@ -16,7 +16,7 @@
 #include "xnnpack.h"
 
 #include <benchmark/benchmark.h>
-#include "bench/utils.h"
+#include "utils.h"
 #include "xnnpack/buffer.h"
 
 

diff --git a/bench/convert.cc b/bench/convert.cc
@@ -3,12 +3,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "xnnpack.h"
-
 #include <limits>
 
 #include "unary_operator.h"
-#include "bench/utils.h"
+#include "utils.h"
+#include "xnnpack.h"
+#include "xnnpack/math.h"
 #include <benchmark/benchmark.h>
 #ifdef BENCHMARK_TENSORFLOW_LITE
 #include "flatbuffers/include/flatbuffers/flatbuffers.h"
@@ -33,9 +33,7 @@ void xnnpack_convert_f32_qs8(benchmark::State& state) {
   benchmark_unary_operator<float, int8_t>(
       [](uint32_t flags, xnn_operator_t* op) {
         return xnn_create_convert_nc_f32_qs8(
-            1.0f / 128.0f /* scale */, 1 /* zero point */,
-            std::numeric_limits<int8_t>::min(),
-            std::numeric_limits<int8_t>::max(), flags, op);
+            1.0f / 128.0f /* scale */, 1 /* zero point */, flags, op);
       },
       xnn_reshape_convert_nc_f32_qs8, xnn_setup_convert_nc_f32_qs8, state);
 }
@@ -44,9 +42,7 @@ void xnnpack_convert_f32_qu8(benchmark::State& state) {
   benchmark_unary_operator<float, uint8_t>(
       [](uint32_t flags, xnn_operator_t* op) {
         return xnn_create_convert_nc_f32_qu8(
-            1.0f / 128.0f /* scale */, 127 /* zero point */,
-            std::numeric_limits<uint8_t>::min(),
-            std::numeric_limits<uint8_t>::max(), flags, op);
+            1.0f / 128.0f /* scale */, 127 /* zero point */, flags, op);
       },
       xnn_reshape_convert_nc_f32_qu8, xnn_setup_convert_nc_f32_qu8, state);
 }

diff --git a/bench/convolution.cc b/bench/convolution.cc
@@ -28,7 +28,7 @@
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/version.h"
 #endif  // BENCHMARK_TENSORFLOW_LITE
-#include "bench/utils.h"
+#include "utils.h"
 #include "xnnpack/buffer.h"
 
 void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {

diff --git a/bench/deconvolution.cc b/bench/deconvolution.cc
@@ -25,7 +25,7 @@
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/version.h"
 #endif  // BENCHMARK_TENSORFLOW_LITE */
-#include "bench/utils.h"
+#include "utils.h"
 #include "xnnpack/buffer.h"
 
 void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) {

diff --git a/bench/elu.cc b/bench/elu.cc
@@ -3,13 +3,13 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "xnnpack.h"
-
 #include <cstdint>
 #include <limits>
 
 #include "unary_operator.h"
-#include "bench/utils.h"
+#include "utils.h"
+#include "xnnpack.h"
+#include "xnnpack/math.h"
 #include <benchmark/benchmark.h>
 #ifdef BENCHMARK_TENSORFLOW_LITE
 #include "flatbuffers/include/flatbuffers/flatbuffer_builder.h"