diff --git a/src/IROperator.h b/src/IROperator.h index 0db5606f011c..2b0ce6d97563 100644 --- a/src/IROperator.h +++ b/src/IROperator.h @@ -970,8 +970,9 @@ Expr pow(Expr x, Expr y); * mantissa. Vectorizes cleanly. */ Expr erf(const Expr &x); -/** Fast vectorizable approximation to some trigonometric functions for Float(32). - * Absolute approximation error is less than 1e-5. */ +/** Fast vectorizable approximation to some trigonometric functions for + * Float(32). Absolute approximation error is less than 1e-5. Slow on x86 if + * you don't have at least SSE 4.1. */ // @{ Expr fast_sin(const Expr &x); Expr fast_cos(const Expr &x); @@ -979,19 +980,22 @@ Expr fast_cos(const Expr &x); /** Fast approximate cleanly vectorizable log for Float(32). Returns * nonsense for x <= 0.0f. Accurate up to the last 5 bits of the - * mantissa. Vectorizes cleanly. */ + * mantissa. Vectorizes cleanly. Slow on x86 if you don't + * have at least SSE 4.1. */ Expr fast_log(const Expr &x); /** Fast approximate cleanly vectorizable exp for Float(32). Returns * nonsense for inputs that would overflow or underflow. Typically * accurate up to the last 5 bits of the mantissa. Gets worse when - * approaching overflow. Vectorizes cleanly. */ + * approaching overflow. Vectorizes cleanly. Slow on x86 if you don't + * have at least SSE 4.1. */ Expr fast_exp(const Expr &x); /** Fast approximate cleanly vectorizable pow for Float(32). Returns * nonsense for x < 0.0f. Accurate up to the last 5 bits of the * mantissa for typical exponents. Gets worse when approaching - * overflow. Vectorizes cleanly. */ + * overflow. Vectorizes cleanly. Slow on x86 if you don't + * have at least SSE 4.1. */ Expr fast_pow(Expr x, Expr y); /** Fast approximate inverse for Float(32). 
Corresponds to the rcpps diff --git a/test/performance/fast_pow.cpp b/test/performance/fast_pow.cpp index 801d5f3133f2..24cea2c32418 100644 --- a/test/performance/fast_pow.cpp +++ b/test/performance/fast_pow.cpp @@ -20,6 +20,12 @@ int main(int argc, char **argv) { printf("HL_TARGET is: %s\n", hl_target.to_string().c_str()); printf("HL_JIT_TARGET is: %s\n", hl_jit_target.to_string().c_str()); + if (hl_jit_target.arch == Target::X86 && + !hl_jit_target.has_feature(Target::SSE41)) { + printf("[SKIP] These intrinsics are known to be slow on x86 without sse 4.1.\n"); + return 0; + } + if (hl_jit_target.arch == Target::WebAssembly) { printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n"); return 0; diff --git a/test/performance/fast_sine_cosine.cpp b/test/performance/fast_sine_cosine.cpp index dc8e7a360550..81f79f337c32 100644 --- a/test/performance/fast_sine_cosine.cpp +++ b/test/performance/fast_sine_cosine.cpp @@ -10,6 +10,13 @@ using namespace Halide::Tools; int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); + + if (target.arch == Target::X86 && + !target.has_feature(Target::SSE41)) { + printf("[SKIP] These intrinsics are known to be slow on x86 without sse 4.1.\n"); + return 0; + } + if (target.arch == Target::WebAssembly) { printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n"); return 0;