[AVX-512] vpermi2b being predicated on a >= 0x80 mask gets optimized differently #109272

Open
Validark opened this issue Sep 19, 2024 · 2 comments

Validark commented Sep 19, 2024

Godbolt link: https://zig.godbolt.org/z/qdzKGcrez

const std = @import("std");

export fn vperm2(indices: @Vector(64, u8)) @Vector(64, u8) {
    // 128-byte identity table, split into the two 64-byte halves vpermi2b expects.
    const table = std.simd.iota(u8, 128);
    const table_part_1, const table_part_2 = @as([2]@Vector(64, u8), @bitCast(table));

    // Permute, then zero every lane whose index has the high bit set (>= 0x80).
    return @select(
        u8,
        indices < @as(@Vector(64, u8), @splat(0x80)),
        struct {
            extern fn @"llvm.x86.avx512.vpermi2var.qi.512"(@Vector(64, u8), @Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
        }.@"llvm.x86.avx512.vpermi2var.qi.512"(table_part_1, indices, table_part_2),
        @as(@Vector(64, u8), @splat(0)),
    );
}

Compiled for Zen 4

.LCPI0_0:
        .byte   0
        ; ...
        .byte   63
.LCPI0_1:
        .byte   64
        ; ...
        .byte   127
vperm2:
        vmovdqa64       zmm1, zmmword ptr [rip + .LCPI0_0]
        vpmovb2m        k0, zmm0
        vpermt2b        zmm1, zmm0, zmmword ptr [rip + .LCPI0_1]
        vpmovm2b        zmm0, k0
        vpandnq zmm0, zmm0, zmm1
        ret

On size-optimized builds, at least, it should be:

vperm2:
        vpmovb2m        k1, zmm0
        vmovdqa64       zmm1, zmmword ptr [rip + .LCPI0_0]
        vpermi2b        zmm0 {k1} {z}, zmm1, zmmword ptr [rip + .LCPI0_1]
        ret
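
For reference, that zero-masked form is essentially what the maskz permute intrinsic expresses. A minimal C sketch (not from the original report; the function and variable names are illustrative, and it assumes AVX512VBMI/AVX512BW are available, e.g. -march=znver4):

#include <immintrin.h>

// Illustrative sketch only: table_lo/table_hi are the two 64-byte halves of the
// iota table from the Zig source above.
__m512i vperm2_maskz(__m512i indices, __m512i table_lo, __m512i table_hi) {
    __mmask64 high_bit = _mm512_movepi8_mask(indices);   // 1 where index >= 0x80
    // Keep lanes whose index is < 0x80, zero the rest, in the permute itself.
    return _mm512_maskz_permutex2var_epi8(~high_bit, table_lo, indices, table_hi);
}

Clang should lower the maskz intrinsic to a zero-masked vpermi2b/vpermt2b, which is the encoding the select in the IR below ought to fold into.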

Optimized LLVM IR:

define dso_local void @vperm2(ptr nocapture nonnull readonly align 64 %0, ptr nocapture nonnull writeonly align 64 %1) local_unnamed_addr {
Entry:
  br label %Loop

Loop:
  %lsr.iv = phi i64 [ %lsr.iv.next, %Loop ], [ 0, %Entry ]
  %scevgep = getelementptr i8, ptr %1, i64 %lsr.iv
  %scevgep6 = getelementptr i8, ptr %0, i64 %lsr.iv
  %2 = load <64 x i8>, ptr %scevgep6, align 64
  %3 = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>, <64 x i8> %2, <64 x i8> <i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127>)
  %.inv = icmp slt <64 x i8> %2, zeroinitializer
  %4 = select <64 x i1> %.inv, <64 x i8> zeroinitializer, <64 x i8> %3
  store <64 x i8> %4, ptr %scevgep, align 64
  %scevgep7 = getelementptr i8, ptr %scevgep6, i64 74
  %5 = load i8, ptr %scevgep7, align 2
  %6 = icmp eq i8 %5, 0
  %lsr.iv.next = add i64 %lsr.iv, 64
  br i1 %6, label %Then, label %Loop

Then:
  ret void
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

declare void @llvm.dbg.declare(metadata, metadata, metadata) #1

declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) #2

Validark changed the title from "[AVX-512] vpermi2b being predicated on a >= 0x80 movemask gets de-optimized" to "[AVX-512] vpermi2b being predicated on a >= 0x80 mask gets optimized differently" on Sep 19, 2024
llvmbot commented Sep 19, 2024

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

RKSimon self-assigned this Sep 19, 2024

Validark commented Sep 19, 2024

My uncertainty about this one comes from the fact that these instructions are each around 4 or 5 cycles. It's possible that executing some of them in parallel gives better latency in some cases, although in code that is doing a lot more work simultaneously, a serial dependency chain is probably fine.

(Unless I'm wrong about the parallelism here: can the machine start on vpermi2b before vpmovb2m is done, even when it depends on the k-mask result?)
