Skip to content

Commit 9b6ab40

Browse files
pendingchaos authored and Marge Bot committed
aco: improve do_pack_2x16() with zero constants
When one half of the pack is a zero constant, we can skip the v_or_b32 or use an instruction smaller than v_alignbyte_b32.

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>
1 parent 917cfd5 commit 9b6ab40

File tree

2 files changed

+48
-18
lines changed

2 files changed

+48
-18
lines changed

src/amd/compiler/aco_lower_to_hw_instr.cpp

+8-6
Original file line number | Diff line number | Diff line change
@@ -1451,8 +1451,8 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
14511451

14521452
/* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
14531453
if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1454-
(!hi.isConstant() || !Operand::c32(hi.constantValue()).isLiteral() ||
1455-
ctx->program->gfx_level >= GFX10)) {
1454+
(!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1455+
ctx->program->gfx_level >= GFX10)))) {
14561456
if (hi.isConstant())
14571457
bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
14581458
Operand::c32(2u));
@@ -1470,8 +1470,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
14701470
bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
14711471
else
14721472
bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1473-
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1474-
Operand(def.physReg(), v1));
1473+
if (lo.constantValue())
1474+
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1475+
Operand(def.physReg(), v1));
14751476
return;
14761477
}
14771478
if (hi.isConstant()) {
@@ -1482,8 +1483,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
14821483
bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
14831484
else
14841485
bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1485-
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1486-
Operand(def.physReg(), v1));
1486+
if (hi.constantValue())
1487+
bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1488+
Operand(def.physReg(), v1));
14871489
return;
14881490
}
14891491

src/amd/compiler/tests/test_to_hw_instr.cpp

+40-12
Original file line number | Diff line number | Diff line change
@@ -841,26 +841,54 @@ BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
841841
finish_to_hw_instr_test();
842842
END_TEST
843843

844-
BEGIN_TEST(to_hw_instr.pack2x16_alignbyte_constant)
844+
BEGIN_TEST(to_hw_instr.pack2x16_constant)
845845
PhysReg v0_lo{256};
846846
PhysReg v0_hi{256};
847+
PhysReg v1_lo{257};
847848
PhysReg v1_hi{257};
848849
v0_hi.reg_b += 2;
849850
v1_hi.reg_b += 2;
850851

851-
if (!setup_cs(NULL, GFX10))
852-
return;
852+
for (amd_gfx_level lvl : {GFX10, GFX11}) {
853+
if (!setup_cs(NULL, lvl))
854+
continue;
853855

854-
/* prevent usage of v_pack_b32_f16 */
855-
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
856+
/* prevent usage of v_pack_b32_f16 */
857+
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
856858

857-
//>> p_unit_test 0
858-
//! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
859-
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
860-
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
861-
Operand(v1_hi, v2b), Operand::c16(0x3800));
859+
//>> p_unit_test 0
860+
//! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
861+
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
862+
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
863+
Operand(v1_hi, v2b), Operand::c16(0x3800));
862864

863-
//! s_endpgm
865+
//! p_unit_test 1
866+
//! v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32]
867+
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
868+
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
869+
Operand(v1_hi, v2b), Operand::zero(2));
864870

865-
finish_to_hw_instr_test();
871+
//! p_unit_test 2
872+
//~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
873+
//~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16]
874+
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
875+
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
876+
Operand(v1_lo, v2b), Operand::zero(2));
877+
878+
//! p_unit_test 3
879+
//! v2b: %_:v[0][16:32] = v_and_b32 0xffff0000, %_:v[1][16:32]
880+
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
881+
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
882+
Operand::zero(2), Operand(v1_hi, v2b));
883+
884+
//! p_unit_test 4
885+
//! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16]
886+
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
887+
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
888+
Operand::zero(2), Operand(v1_lo, v2b));
889+
890+
//! s_endpgm
891+
892+
finish_to_hw_instr_test();
893+
}
866894
END_TEST

0 commit comments

Comments
 (0)