From efb77e5dd92061e99362ee9b461f2a55bc81c777 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Fri, 27 Dec 2024 10:23:08 -0800
Subject: [PATCH] Skip fast exp/log/pow/sin/cosine tests without sse 4.1
 (#8541)

Fixes #8536
---
 src/IROperator.h                      | 14 +++++++++-----
 test/performance/fast_pow.cpp         |  6 ++++++
 test/performance/fast_sine_cosine.cpp |  7 +++++++
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/IROperator.h b/src/IROperator.h
index 0db5606f011c..2b0ce6d97563 100644
--- a/src/IROperator.h
+++ b/src/IROperator.h
@@ -970,8 +970,9 @@ Expr pow(Expr x, Expr y);
  * mantissa. Vectorizes cleanly. */
 Expr erf(const Expr &x);
 
-/** Fast vectorizable approximation to some trigonometric functions for Float(32).
- * Absolute approximation error is less than 1e-5. */
+/** Fast vectorizable approximation to some trigonometric functions for
+ * Float(32). Absolute approximation error is less than 1e-5. Slow on x86
+ * targets without SSE 4.1 or newer. */
 // @{
 Expr fast_sin(const Expr &x);
 Expr fast_cos(const Expr &x);
@@ -979,19 +980,22 @@ Expr fast_cos(const Expr &x);
 
 /** Fast approximate cleanly vectorizable log for Float(32). Returns
  * nonsense for x <= 0.0f. Accurate up to the last 5 bits of the
- * mantissa. Vectorizes cleanly. */
+ * mantissa. Vectorizes cleanly. Slow on x86 targets without
+ * SSE 4.1 or newer. */
 Expr fast_log(const Expr &x);
 
 /** Fast approximate cleanly vectorizable exp for Float(32). Returns
  * nonsense for inputs that would overflow or underflow. Typically
  * accurate up to the last 5 bits of the mantissa. Gets worse when
- * approaching overflow. Vectorizes cleanly. */
+ * approaching overflow. Vectorizes cleanly. Slow on x86 targets
+ * without SSE 4.1 or newer. */
 Expr fast_exp(const Expr &x);
 
 /** Fast approximate cleanly vectorizable pow for Float(32). Returns
  * nonsense for x < 0.0f. Accurate up to the last 5 bits of the
  * mantissa for typical exponents. Gets worse when approaching
- * overflow. Vectorizes cleanly. */
+ * overflow. Vectorizes cleanly. Slow on x86 targets without
+ * SSE 4.1 or newer. */
 Expr fast_pow(Expr x, Expr y);
 
 /** Fast approximate inverse for Float(32). Corresponds to the rcpps
diff --git a/test/performance/fast_pow.cpp b/test/performance/fast_pow.cpp
index 801d5f3133f2..24cea2c32418 100644
--- a/test/performance/fast_pow.cpp
+++ b/test/performance/fast_pow.cpp
@@ -20,6 +20,12 @@ int main(int argc, char **argv) {
     printf("HL_TARGET is:     %s\n", hl_target.to_string().c_str());
     printf("HL_JIT_TARGET is: %s\n", hl_jit_target.to_string().c_str());
 
+    if (hl_jit_target.arch == Target::X86 &&
+        !hl_jit_target.has_feature(Target::SSE41)) {
+        printf("[SKIP] These intrinsics are known to be slow on x86 without sse 4.1.\n");
+        return 0;
+    }
+
     if (hl_jit_target.arch == Target::WebAssembly) {
         printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
         return 0;
diff --git a/test/performance/fast_sine_cosine.cpp b/test/performance/fast_sine_cosine.cpp
index dc8e7a360550..81f79f337c32 100644
--- a/test/performance/fast_sine_cosine.cpp
+++ b/test/performance/fast_sine_cosine.cpp
@@ -10,6 +10,13 @@ using namespace Halide::Tools;
 
 int main(int argc, char **argv) {
     Target target = get_jit_target_from_environment();
+
+    if (target.arch == Target::X86 &&
+        !target.has_feature(Target::SSE41)) {
+        printf("[SKIP] These intrinsics are known to be slow on x86 without sse 4.1.\n");
+        return 0;
+    }
+
     if (target.arch == Target::WebAssembly) {
         printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
         return 0;