-
Notifications
You must be signed in to change notification settings - Fork 11.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AVX-512] vpermi2b being predicated on a >= 0x80 mask gets optimized differently #109272
Comments
@llvm/issue-subscribers-backend-x86 Author: Niles Salter (Validark)
[Godbolt link](https://zig.godbolt.org/z/qdzKGcrez)
export fn vperm2(indices: @Vector(64, u8)) @Vector(64, u8) {
const table = std.simd.iota(u8, 128);
const table_part_1, const table_part_2 = @as([2]@Vector(64, u8), @bitCast(table));
return @select(
u8,
indices < @as(@Vector(64, u8), @splat(0x80)),
struct {
extern fn @"llvm.x86.avx512.vpermi2var.qi.512"(@Vector(64, u8), @Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
}.@"llvm.x86.avx512.vpermi2var.qi.512"(table_part_1, indices, table_part_2),
@as(@Vector(64, u8), @splat(0)),
);
} Compiled for Zen 4 .LCPI0_0:
.byte 0
; ...
.byte 63
.LCPI0_1:
.byte 64
; ...
.byte 127
vperm2:
vmovdqa64 zmm1, zmmword ptr [rip + .LCPI0_0]
vpmovb2m k0, zmm0
vpermt2b zmm1, zmm0, zmmword ptr [rip + .LCPI0_1]
vpmovm2b zmm0, k0
vpandnq zmm0, zmm0, zmm1
ret On size-optimized builds, at least, it should be: vperm2:
vpmovb2m k1, zmm0
vmovdqa64 zmm1, zmmword ptr [rip + .LCPI0_0]
vpermi2b zmm0 {k1} {z}, zmm1, zmmword ptr [rip + .LCPI0_1]
ret Optimized LLVM: define dso_local void @vperm2(ptr nocapture nonnull readonly align 64 %0, ptr nocapture nonnull writeonly align 64 %1) local_unnamed_addr {
Entry:
br label %Loop
Loop:
%lsr.iv = phi i64 [ %lsr.iv.next, %Loop ], [ 0, %Entry ]
%scevgep = getelementptr i8, ptr %1, i64 %lsr.iv
%scevgep6 = getelementptr i8, ptr %0, i64 %lsr.iv
%2 = load <64 x i8>, ptr %scevgep6, align 64
%3 = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>, <64 x i8> %2, <64 x i8> <i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127>)
%.inv = icmp slt <64 x i8> %2, zeroinitializer
%4 = select <64 x i1> %.inv, <64 x i8> zeroinitializer, <64 x i8> %3
store <64 x i8> %4, ptr %scevgep, align 64
%scevgep7 = getelementptr i8, ptr %scevgep6, i64 74
%5 = load i8, ptr %scevgep7, align 2
%6 = icmp eq i8 %5, 0
%lsr.iv.next = add i64 %lsr.iv, 64
br i1 %6, label %Then, label %Loop
Then:
ret void
}
declare void @llvm.dbg.value(metadata, metadata, metadata) #1
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) #2
The uncertainty I have about this one comes from the fact that these instructions are like 4 or 5 cycles each. It's possible that doing some instructions in parallel could have better latency in some cases. Although probably in code that's doing a lot more simultaneously, a serial dependency chain is fine. (Unless I'm wrong about the parallelism here. Can the machine get started on vpermi2b before vpmovb2m is done, even when it relies on the k mask result?)
Godbolt link
Compiled for Zen 4
On size-optimized builds, at least, it should be:
Optimized LLVM:
The text was updated successfully, but these errors were encountered: