From d5e764e0303ca1ecb60dc7cf6dfc71f80971d7a4 Mon Sep 17 00:00:00 2001 From: TIHan Date: Mon, 29 Jan 2024 16:16:54 -0800 Subject: [PATCH 01/18] Added SVE_IF_4A and SVE_IF_4A_A formats. Added initial work for another SVE format group. --- src/coreclr/jit/codegenarm64test.cpp | 293 +++++++++++++++++ src/coreclr/jit/emit.h | 1 + src/coreclr/jit/emitarm64.cpp | 474 ++++++++++++++++++++++++++- 3 files changed, 766 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 0fd7c970ea2f6f..ee5afaed5d982d 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6344,6 +6344,299 @@ void CodeGen::genArm64EmitterUnitTestsSve() INS_OPTS_SCALABLE_D); // LDFF1SH {.D }, /Z, [, .D] theEmitter->emitIns_R_R_R_R(INS_sve_ldff1w, EA_SCALABLE, REG_V4, REG_P3, REG_R2, REG_V1, INS_OPTS_SCALABLE_D); // LDFF1W {.D }, /Z, [, .D] + + // IF_SVE_IF_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V3, REG_P2, REG_V1, REG_R0, + INS_OPTS_SCALABLE_S); // LDNT1B {.S }, /Z, [.S{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1h, EA_SCALABLE, REG_V0, REG_P1, REG_V2, REG_R3, + INS_OPTS_SCALABLE_S); // LDNT1H {.S }, /Z, [.S{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sb, EA_SCALABLE, REG_V2, REG_P3, REG_V5, REG_R4, + INS_OPTS_SCALABLE_S); // LDNT1SB {.S }, /Z, [.S{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sh, EA_SCALABLE, REG_V3, REG_P4, REG_V1, REG_R2, + INS_OPTS_SCALABLE_S); // LDNT1SH {.S }, /Z, [.S{, }] + // REG_ZR can be used due to the optional {, } of the format. 
+ theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V1, REG_P3, REG_V4, REG_ZR, + INS_OPTS_SCALABLE_S); // LDNT1W {.S }, /Z, [.S{, }] + + // IF_SVE_IF_4A_A + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + INS_OPTS_SCALABLE_D); // LDNT1B {.D }, /Z, [.D{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + INS_OPTS_SCALABLE_D); // LDNT1H {.D }, /Z, [.D{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + INS_OPTS_SCALABLE_D); // LDNT1SB {.D }, /Z, [.D{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + INS_OPTS_SCALABLE_D); // LDNT1SH {.D }, /Z, [.D{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + INS_OPTS_SCALABLE_D); // LDNT1W {.D }, /Z, [.D{, }] + + //// IF_SVE_IG_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1D {.D }, /Z, [{, , LSL #3}] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1SW {.D }, /Z, [{, , LSL #2}] + + //// IF_SVE_IG_4A_D + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1SB {.D }, /Z, [{, }] + + //// IF_SVE_IG_4A_E + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1B {.D }, /Z, [{, }] + + //// IF_SVE_IG_4A_F + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1SH {.D }, /Z, [{, , LSL #1}] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1W {.D }, /Z, [{, , LSL #2}] + + //// IF_SVE_IG_4A_G + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1h, 
EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1H {.D }, /Z, [{, , LSL #1}] + + //// IF_SVE_II_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1D {.D }, /Z, [, , LSL #3] + + //// IF_SVE_II_4A_B + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1D {.Q }, /Z, [, , LSL #3] + + //// IF_SVE_II_4A_H + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1W {.D }, /Z, [, , LSL #2] + + //// IF_SVE_IK_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1SW {.D }, /Z, [, , LSL #2] + + //// IF_SVE_IK_4A_F + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1SB {.D }, /Z, [, ] + + //// IF_SVE_IK_4A_G + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1SH {.D }, /Z, [, , LSL #1] + + //// IF_SVE_IK_4A_H + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1B {.D }, /Z, [, ] + + //// IF_SVE_IK_4A_I + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1H {.D }, /Z, [, , LSL #1] + + //// IF_SVE_IN_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDNT1B {.B }, /Z, [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDNT1D {.D }, /Z, [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDNT1H {.H }, /Z, [, , LSL #1] + 
//theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDNT1W {.S }, /Z, [, , LSL #2] + + //// IF_SVE_IP_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rob, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1ROB {.B }, /Z, [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rod, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1ROD {.D }, /Z, [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1roh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1ROH {.H }, /Z, [, , LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1row, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1ROW {.S }, /Z, [, , LSL #2] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1RQB {.B }, /Z, [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqd, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1RQD {.D }, /Z, [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1RQH {.H }, /Z, [, , LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1RQW {.S }, /Z, [, , LSL #2] + + //// IF_SVE_IR_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ld2q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD2Q {.Q, .Q }, /Z, [, , LSL #4] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld3q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD3Q {.Q, .Q, .Q }, /Z, [, , + // // LSL #4] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld4q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD4Q {.Q, .Q, .Q, .Q }, /Z, + // // [, , LSL #4] + + //// IF_SVE_IT_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ld2b, EA_SCALABLE, REG_V0, REG_P0, 
REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD2B {.B, .B }, /Z, [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld2d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD2D {.D, .D }, /Z, [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld2h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD2H {.H, .H }, /Z, [, , LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld2w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD2W {.S, .S }, /Z, [, , LSL #2] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld3b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD3B {.B, .B, .B }, /Z, [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld3d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD3D {.D, .D, .D }, /Z, [, , + // // LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld3h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD3H {.H, .H, .H }, /Z, [, , + // // LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld3w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD3W {.S, .S, .S }, /Z, [, , + // // LSL #2] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld4b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD4B {.B, .B, .B, .B }, /Z, + // // [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld4d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD4D {.D, .D, .D, .D }, /Z, + // // [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld4h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD4H {.H, .H, .H, .H }, /Z, + // // [, , LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld4w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD4W {.S, .S, .S, .S }, /Z, + // // [, , LSL #2] + + //// IF_SVE_IU_4B + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // 
LD1D {.D }, /Z, [, .D, LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1SW {.D }, /Z, [, .D, LSL #2] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1D {.D }, /Z, [, .D, LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1SW {.D }, /Z, [, .D, LSL #2] + + //// IF_SVE_IU_4B_B + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1SW {.D }, /Z, [, .D] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1D {.D }, /Z, [, .D] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDFF1SW {.D }, /Z, [, .D] + + //// IF_SVE_IU_4B_D + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1D {.D }, /Z, [, .D] + + //// IF_SVE_IW_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ld1q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LD1Q {.Q }, /Z, [.D{, }] + + //// IF_SVE_IX_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDNT1D {.D }, /Z, [.D{, }] + //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // LDNT1SW {.D }, /Z, [.D{, }] + + //// IF_SVE_IY_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_st1q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1Q {.Q }, , [.D{, }] + + //// IF_SVE_IZ_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1B {.S }, , [.S{, }] + 
//theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1H {.S }, , [.S{, }] + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1W {.S }, , [.S{, }] + + //// IF_SVE_IZ_4A_A + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1B {.D }, , [.D{, }] + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1H {.D }, , [.D{, }] + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1W {.D }, , [.D{, }] + + //// IF_SVE_JA_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1D {.D }, , [.D{, }] + + //// IF_SVE_JB_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1B {.B }, , [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1D {.D }, , [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1H {.H }, , [, , LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // STNT1W {.S }, , [, , LSL #2] + + //// IF_SVE_JC_4A + //theEmitter->emitIns_R_R_R_R(INS_sve_st2b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST2B {.B, .B }, , [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_st2d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST2D {.D, .D }, , [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_st2h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST2H 
{.H, .H }, , [, , LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_st2w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST2W {.S, .S }, , [, , LSL #2] + //theEmitter->emitIns_R_R_R_R(INS_sve_st3b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST3B {.B, .B, .B }, , [, ] + //theEmitter->emitIns_R_R_R_R(INS_sve_st3d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST3D {.D, .D, .D }, , [, , LSL + // // #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_st3h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST3H {.H, .H, .H }, , [, , LSL + // // #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_st3w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST3W {.S, .S, .S }, , [, , LSL + // // #2] + //theEmitter->emitIns_R_R_R_R(INS_sve_st4b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST4B {.B, .B, .B, .B }, , [, + // // ] + //theEmitter->emitIns_R_R_R_R(INS_sve_st4d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST4D {.D, .D, .D, .D }, , [, + // // , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_st4h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST4H {.H, .H, .H, .H }, , [, + // // , LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_st4w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST4W {.S, .S, .S, .S }, , [, + // // , LSL #2] + + //// IF_SVE_JD_4C + //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1D {.D }, , [, , LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1W {.Q }, , [, , LSL #2] + + //// IF_SVE_JD_4C_A + //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1D {.Q }, , [, , LSL #3] + + //// IF_SVE_JF_4A + 
//theEmitter->emitIns_R_R_R_R(INS_sve_st2q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST2Q {.Q, .Q }, , [, , LSL #4] + //theEmitter->emitIns_R_R_R_R(INS_sve_st3q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST3Q {.Q, .Q, .Q }, , [, , LSL + // // #4] + //theEmitter->emitIns_R_R_R_R(INS_sve_st4q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST4Q {.Q, .Q, .Q, .Q }, , [, + // // , LSL #4] + + //// IF_SVE_JJ_4B + //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1D {.D }, , [, .D, LSL #3] + //theEmitter->emitIns_R_R_R_R(INS_sve_st1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1H {.D }, , [, .D, LSL #1] + //theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1W {.D }, , [, .D, LSL #2] + + //// IF_SVE_JJ_4B_C + //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1D {.D }, , [, .D] + + //// IF_SVE_JJ_4B_E + //theEmitter->emitIns_R_R_R_R(INS_sve_st1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1H {.D }, , [, .D] + //theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1W {.D }, , [, .D] + + //// IF_SVE_JK_4B + //theEmitter->emitIns_R_R_R_R(INS_sve_st1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // INS_OPTS_SCALABLE_B); // ST1B {.D }, , [, .D] } #endif // defined(TARGET_ARM64) && defined(DEBUG) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index e0ce3adb529eea..8ab061f7ab9716 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1786,6 +1786,7 @@ class emitter #define PERFSCORE_THROUGHPUT_ZERO 0.0f // Only used for pseudo-instructions that don't generate code +#define PERFSCORE_THROUGHPUT_9X (1.0f / 9.0f) #define 
PERFSCORE_THROUGHPUT_6X (1.0f / 6.0f) // Hextuple issue #define PERFSCORE_THROUGHPUT_5X 0.20f // Pentuple issue #define PERFSCORE_THROUGHPUT_4X 0.25f // Quad issue diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 28f05dfe53be5f..b82f63ac0a301f 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1707,6 +1707,72 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isScalableVectorSize(elemsize)); break; + case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) + case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) + case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) + case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_E: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_H: // 
...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) + case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) + case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus + // scalar) + case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) + case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus + // scalar) + case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) + case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) + case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus 
scalar) + case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus + // scalar) + case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled + // offsets) + elemsize = id->idOpSize(); + assert(insOptsScalableWords(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isVectorRegister(id->idReg3())); // nnnnn + assert(isGeneralRegisterOrZR(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + default: printf("unexpected format %s\n", emitIfName(id->idInsFmt())); assert(!"Unexpected format"); @@ -11752,6 +11818,31 @@ void emitter::emitIns_R_R_R_R(instruction ins, } break; + case INS_sve_ldnt1b: + case INS_sve_ldnt1h: + case INS_sve_ldnt1sb: + case INS_sve_ldnt1sh: + case INS_sve_ldnt1w: + assert(insOptsScalableWords(opt)); + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(isGeneralRegisterOrZR(reg4)); + assert(isScalableVectorSize(size)); + + assert(insScalableOptsNone(sopt)); + + if (opt == INS_OPTS_SCALABLE_S) + { + fmt = IF_SVE_IF_4A; + } + else + { + assert(opt == INS_OPTS_SCALABLE_D); + fmt = IF_SVE_IF_4A_A; + } + break; + default: unreached(); break; @@ -18349,6 +18440,18 @@ size_t 
emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; + case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) + case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) + code = emitInsCodeSve(ins, fmt); + code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt + code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg + code |= insEncodeReg_V_9_to_5(id->idReg3()); // nnnnn + code |= insEncodeReg_R_20_to_16(id->idReg4()); // mmmmm + dst += emitOutput_Instr(dst, code); + break; + default: assert(!"Unexpected format"); break; @@ -18834,12 +18937,21 @@ void emitter::emitDispSveExtendOptsModN(insOpts opt, int n) void emitter::emitDispSveModAddr(instruction ins, regNumber reg1, regNumber reg2, insOpts opt, insFormat fmt) { printf("["); - emitDispReg(reg1, EA_8BYTE, true); + + if (isVectorRegister(reg1)) + { + emitDispSveReg(reg1, opt, reg2 != REG_ZR); + } + else + { + emitDispReg(reg1, EA_8BYTE, true); + } + if (isVectorRegister(reg2)) { emitDispSveReg(reg2, opt, false); } - else + else if (reg2 != REG_ZR) { emitDispReg(reg2, EA_8BYTE, false); } @@ -21329,6 +21441,65 @@ void emitter::emitDispInsHelp( // {.D }, /Z, [, .D] case IF_SVE_HW_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 32-bit gather load (scalar plus 32-bit unscaled // offsets) + // {.S }, /Z, [.S{, }] + case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) + // {.D }, /Z, [.D{, }] + case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) + case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) + case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case 
IF_SVE_IG_4A_E: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) + case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) + case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus + // scalar) + case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) + case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_D: // ...........mmmmm 
...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus + // scalar) + case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) + case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) + case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus + // scalar) + case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled + // offsets) emitDispSveConsecutiveRegList(id->idReg1(), insGetSveReg1ListSize(ins), id->idInsOpt(), true); // ttttt 
emitDispPredicateReg(id->idReg2(), insGetPredicateType(fmt), id->idInsOpt(), true); // ggg
             emitDispSveModAddr(ins, id->idReg3(), id->idReg4(), id->idInsOpt(), fmt); // nnnnn
@@ -24382,6 +24553,311 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             result.insLatency    = PERFSCORE_LATENCY_9C;
             break;

+        case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus scalar)
+        case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus
+                             // scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_10C;
+            break;
+
+        case IF_SVE_IG_4A:   // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar)
+        case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus
+                             // scalar)
+        case IF_SVE_IG_4A_E: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus
+                             // scalar)
+        case IF_SVE_IG_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus
+                             // scalar)
+        case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus
+                             // scalar)
+        case IF_SVE_II_4A:   // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar)
+        case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar)
+        case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar)
+        case IF_SVE_IK_4A:   // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar)
+        case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar)
+        case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar)
+        case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar)
+        case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_9C;
+            break;
+
+        case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_10C;
+            break;
+
+        case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar)
+            switch (ins)
+            {
+                case INS_sve_ld1rqb:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                    result.insLatency    = PERFSCORE_LATENCY_6C;
+                    break;
+                case INS_sve_ld1rob:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                case INS_sve_ld1rqh:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                    result.insLatency    = PERFSCORE_LATENCY_6C;
+                    break;
+                case INS_sve_ld1roh:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                case INS_sve_ld1rqw:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                    result.insLatency    = PERFSCORE_LATENCY_6C;
+                    break;
+                case INS_sve_ld1row:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                case INS_sve_ld1rqd:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                    result.insLatency    = PERFSCORE_LATENCY_6C;
+                    break;
+                case INS_sve_ld1rod:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break; // IF_SVE_IP_4A is done; do not fall through into the next format's scoring
+
+        case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus
+                           // scalar)
+            switch (ins)
+            {
+                case INS_sve_ld2q:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                case INS_sve_ld3q:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                case INS_sve_ld4q:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break; // IF_SVE_IR_4A is done; do not fall through into the next format's scoring
+
+        case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar)
+            switch (ins)
+            {
+                case INS_sve_ld2b:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_9C;
+                    break;
+                case INS_sve_ld3b:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                case INS_sve_ld4b:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                case INS_sve_ld2h:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_9C;
+                    break;
+                case INS_sve_ld3h:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                case INS_sve_ld4h:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                case INS_sve_ld2w:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_9C;
+                    break;
+                case INS_sve_ld3w:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                case INS_sve_ld4w:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                case INS_sve_ld2d:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_9C;
+                    break;
+                case INS_sve_ld3d:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                case INS_sve_ld4d:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_10C;
+                    break;
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break; // IF_SVE_IT_4A is done; do not fall through into the next format's scoring
+
+        case IF_SVE_IU_4B:   // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked
+                             // scaled offsets)
+        case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked
+                             // scaled offsets)
+        case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked
+                             // scaled offsets)
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_9C;
+            break;
+
+        case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar)
+            switch (ins)
+            {
+                case INS_sve_ld1q:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break; // IF_SVE_IW_4A is done; do not fall through into the next format's scoring
+
+        case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus
+                           // scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_10C;
+            break;
+
+        case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar)
+            switch (ins)
+            {
+                case INS_sve_st1q:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix
+                    result.insLatency    = PERFSCORE_LATENCY_1C;    // need to fix
+                    break;
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break; // IF_SVE_IY_4A is done; do not fall through into the next format's scoring
+
+        case IF_SVE_IZ_4A:   // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus
+                             // scalar)
+        case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal
store (vector plus + // scalar) + case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) + case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + switch (ins) + { + case INS_sve_st2b: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + case INS_sve_st3b: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_7C; + break; + case INS_sve_st4b: + result.insThroughput = PERFSCORE_THROUGHPUT_9X; + result.insLatency = PERFSCORE_LATENCY_11C; + break; + case INS_sve_st2h: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + case INS_sve_st3h: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_7C; + break; + case INS_sve_st4h: + result.insThroughput = PERFSCORE_THROUGHPUT_9X; + result.insLatency = PERFSCORE_LATENCY_11C; + break; + case INS_sve_st2w: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + case INS_sve_st3w: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_7C; + break; + case INS_sve_st4w: + result.insThroughput = PERFSCORE_THROUGHPUT_9X; + result.insLatency = PERFSCORE_LATENCY_11C; + break; + case INS_sve_st2d: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + case INS_sve_st3d: + 
result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_7C; + break; + case INS_sve_st4d: + result.insThroughput = PERFSCORE_THROUGHPUT_9X; + result.insLatency = PERFSCORE_LATENCY_11C; + break; + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + + case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus + // scalar) + switch (ins) + { + case INS_sve_st2q: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix + result.insLatency = PERFSCORE_LATENCY_1C; // need to fix + break; + case INS_sve_st3q: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix + result.insLatency = PERFSCORE_LATENCY_1C; // need to fix + break; + case INS_sve_st4q: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; // need to fix + result.insLatency = PERFSCORE_LATENCY_1C; // need to fix + break; + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + + case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled + // offsets) + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + default: // all other instructions perfScoreUnhandledInstruction(id, &result); From f84bc75435243d89935962e130b39ddbd93157a1 Mon Sep 17 00:00:00 2001 From: TIHan Date: Mon, 29 Jan 2024 18:53:00 -0800 Subject: [PATCH 02/18] Added SVE_IG_4A to SVE_IG_4A_G formats --- src/coreclr/jit/codegenarm64test.cpp | 87 ++++--- src/coreclr/jit/emitarm64.cpp 
| 342 +++++++++++++++++++++++---- src/coreclr/jit/emitarm64.h | 5 +- src/coreclr/jit/instrsarm64sve.h | 6 +- 4 files changed, 357 insertions(+), 83 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index ee5afaed5d982d..047183c3b01363 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6359,40 +6359,71 @@ void CodeGen::genArm64EmitterUnitTestsSve() INS_OPTS_SCALABLE_S); // LDNT1W {.S }, /Z, [.S{, }] // IF_SVE_IF_4A_A - theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V0, REG_P2, REG_V4, REG_R3, INS_OPTS_SCALABLE_D); // LDNT1B {.D }, /Z, [.D{, }] - theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1h, EA_SCALABLE, REG_V1, REG_P4, REG_V3, REG_R2, INS_OPTS_SCALABLE_D); // LDNT1H {.D }, /Z, [.D{, }] - theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sb, EA_SCALABLE, REG_V2, REG_P3, REG_V4, REG_R5, INS_OPTS_SCALABLE_D); // LDNT1SB {.D }, /Z, [.D{, }] - theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sh, EA_SCALABLE, REG_V3, REG_P2, REG_V1, REG_R0, INS_OPTS_SCALABLE_D); // LDNT1SH {.D }, /Z, [.D{, }] - theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, + // REG_ZR can be used due to the optional {, } of the format. 
+ theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V4, REG_P1, REG_V2, REG_ZR, INS_OPTS_SCALABLE_D); // LDNT1W {.D }, /Z, [.D{, }] - //// IF_SVE_IG_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1D {.D }, /Z, [{, , LSL #3}] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1SW {.D }, /Z, [{, , LSL #2}] - - //// IF_SVE_IG_4A_D - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1SB {.D }, /Z, [{, }] - - //// IF_SVE_IG_4A_E - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1B {.D }, /Z, [{, }] - - //// IF_SVE_IG_4A_F - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1SH {.D }, /Z, [{, , LSL #1}] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1W {.D }, /Z, [{, , LSL #2}] - - //// IF_SVE_IG_4A_G - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1H {.D }, /Z, [{, , LSL #1}] + // IF_SVE_IG_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_R4, + INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // LDFF1D {.D }, /Z, [{, , LSL #3}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_R5, + INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // LDFF1SW {.D }, /Z, [{, , LSL #2}] + // REG_ZR can be used due to the optional {, , LSL #2}} of the format, though it still requires passing + // INS_SCALABLE_OPTS_LSL_N with it. 
+ theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_ZR, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1SW {.D }, /Z, [{, , LSL #2}] + + // IF_SVE_IG_4A_D + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sb, EA_SCALABLE, REG_V1, REG_P0, REG_R2, REG_R4, + INS_OPTS_SCALABLE_H); // LDFF1SB {.H }, /Z, [{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sb, EA_SCALABLE, REG_V1, REG_P0, REG_R2, REG_R4, + INS_OPTS_SCALABLE_S); // LDFF1SB {.S }, /Z, [{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sb, EA_SCALABLE, REG_V1, REG_P0, REG_R2, REG_R4, + INS_OPTS_SCALABLE_D); // LDFF1SB {.D }, /Z, [{, }] + + // IF_SVE_IG_4A_E + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1b, EA_SCALABLE, REG_V3, REG_P2, REG_R0, REG_R1, + INS_OPTS_SCALABLE_B); // LDFF1B {.B }, /Z, [{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1b, EA_SCALABLE, REG_V3, REG_P2, REG_R0, REG_R1, + INS_OPTS_SCALABLE_H); // LDFF1B {.H }, /Z, [{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1b, EA_SCALABLE, REG_V3, REG_P2, REG_R0, REG_R1, + INS_OPTS_SCALABLE_S); // LDFF1B {.S }, /Z, [{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1b, EA_SCALABLE, REG_V3, REG_P2, REG_R0, REG_R1, + INS_OPTS_SCALABLE_D); // LDFF1B {.D }, /Z, [{, }] + + // IF_SVE_IG_4A_F + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sh, EA_SCALABLE, REG_V4, REG_P3, REG_R1, REG_R2, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LDFF1SH {.S }, /Z, [{, , LSL #1}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sh, EA_SCALABLE, REG_V4, REG_P3, REG_R1, REG_R2, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1SH {.D }, /Z, [{, , LSL #1}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1w, EA_SCALABLE, REG_V1, REG_P0, REG_R2, REG_R3, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LDFF1W {.S }, /Z, [{, , LSL #2}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1w, EA_SCALABLE, REG_V1, REG_P0, REG_R2, REG_R3, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1W {.D }, /Z, [{, , LSL #2}] + // REG_ZR can be used due to the 
optional {, , LSL #2}} of the format, though it still requires passing + // INS_SCALABLE_OPTS_LSL_N with it. + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1w, EA_SCALABLE, REG_V1, REG_P0, REG_R2, REG_ZR, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1W {.D }, /Z, [{, , LSL #2}] + + // IF_SVE_IG_4A_G + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1h, EA_SCALABLE, REG_V3, REG_P1, REG_R4, REG_R0, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // LDFF1H {.H }, /Z, [{, , LSL #1}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1h, EA_SCALABLE, REG_V3, REG_P1, REG_R4, REG_R0, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LDFF1H {.S }, /Z, [{, , LSL #1}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1h, EA_SCALABLE, REG_V3, REG_P1, REG_R4, REG_R0, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1H {.D }, /Z, [{, , LSL #1}] + // REG_ZR can be used due to the optional {, , LSL #1}} of the format, though it still requires passing + // INS_SCALABLE_OPTS_LSL_N with it. + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1h, EA_SCALABLE, REG_V3, REG_P1, REG_R4, REG_ZR, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1H {.D }, /Z, [{, , LSL #1}] //// IF_SVE_II_4A //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index b82f63ac0a301f..8c33c3d75c6f14 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1711,6 +1711,15 @@ void emitter::emitInsSanityCheck(instrDesc* id) // scalar) case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus // scalar) + elemsize = id->idOpSize(); + assert(insOptsScalableWords(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isVectorRegister(id->idReg3())); // nnnnn + assert(isGeneralRegisterOrZR(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + 
case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) @@ -1720,6 +1729,15 @@ void emitter::emitInsSanityCheck(instrDesc* id) // scalar) case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) + elemsize = id->idOpSize(); + assert(insOptsScalableStandard(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegisterOrZR(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) @@ -11719,22 +11737,45 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isVectorRegister(reg1)); assert(isPredicateRegister(reg2)); assert(isGeneralRegister(reg3)); - assert(isVectorRegister(reg4)); assert(isScalableVectorSize(size)); assert(insScalableOptsNone(sopt)); - if (insOptsScalableDoubleWord32bitExtends(opt)) - { - fmt = IF_SVE_HW_4A; - } - else if (insOptsScalableSingleWord32bitExtends(opt)) + if (isGeneralRegisterOrZR(reg4)) { - fmt = IF_SVE_HW_4A_A; + switch (ins) + { + case INS_sve_ldff1b: + assert(insOptsScalableStandard(opt)); + fmt = IF_SVE_IG_4A_E; + break; + + case INS_sve_ldff1sb: + assert(insOptsScalableAtLeastHalf(opt)); + fmt = IF_SVE_IG_4A_D; + break; + + default: + assert(!"Invalid instruction"); + break; + } } else { - assert(opt == INS_OPTS_SCALABLE_D); - fmt = IF_SVE_HW_4B; + assert(isVectorRegister(reg4)); + + if 
(insOptsScalableDoubleWord32bitExtends(opt)) + { + fmt = IF_SVE_HW_4A; + } + else if (insOptsScalableSingleWord32bitExtends(opt)) + { + fmt = IF_SVE_HW_4A_A; + } + else + { + assert(opt == INS_OPTS_SCALABLE_D); + fmt = IF_SVE_HW_4B; + } } break; @@ -11747,73 +11788,133 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isVectorRegister(reg1)); assert(isPredicateRegister(reg2)); assert(isGeneralRegister(reg3)); - assert(isVectorRegister(reg4)); assert(isScalableVectorSize(size)); - if (insOptsScalableDoubleWord32bitExtends(opt)) + if (isGeneralRegisterOrZR(reg4)) { - if (sopt == INS_SCALABLE_OPTS_MOD_N) - { - fmt = IF_SVE_HW_4A_A; - } - else + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + + switch (ins) { - assert(insScalableOptsNone(sopt)); - fmt = IF_SVE_HW_4A_B; + case INS_sve_ldff1h: + assert(insOptsScalableStandard(opt)); + fmt = IF_SVE_IG_4A_G; + break; + + case INS_sve_ldff1sh: + case INS_sve_ldff1w: + assert(insOptsScalableWords(opt)); + fmt = IF_SVE_IG_4A_F; + break; + + default: + assert(!"Invalid instruction"); + break; } } - else if (insOptsScalableSingleWord32bitExtends(opt)) + else { - if (sopt == INS_SCALABLE_OPTS_MOD_N) - { - fmt = IF_SVE_HW_4A; - } - else + assert(isVectorRegister(reg4)); + + if (insOptsScalableDoubleWord32bitExtends(opt)) { - assert(insScalableOptsNone(sopt)); - fmt = IF_SVE_HW_4A_C; + if (sopt == INS_SCALABLE_OPTS_MOD_N) + { + fmt = IF_SVE_HW_4A_A; + } + else + { + assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_HW_4A_B; + } } - } - else - { - assert(opt == INS_OPTS_SCALABLE_D); - if (sopt == INS_SCALABLE_OPTS_LSL_N) + else if (insOptsScalableSingleWord32bitExtends(opt)) { - fmt = IF_SVE_HW_4B; + if (sopt == INS_SCALABLE_OPTS_MOD_N) + { + fmt = IF_SVE_HW_4A; + } + else + { + assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_HW_4A_C; + } } else { - assert(insScalableOptsNone(sopt)); - fmt = IF_SVE_HW_4B_D; + assert(opt == INS_OPTS_SCALABLE_D); + if (sopt == INS_SCALABLE_OPTS_LSL_N) + { + fmt = IF_SVE_HW_4B; + } + else + { + 
assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_HW_4B_D; + } } } break; - case INS_sve_ldff1sw: - case INS_sve_ldff1d: case INS_sve_ld1d: case INS_sve_ld1sw: - assert(insOptsScalableDoubleWord32bitExtends(opt)); + case INS_sve_ldff1d: + case INS_sve_ldff1sw: assert(isVectorRegister(reg1)); assert(isPredicateRegister(reg2)); assert(isGeneralRegister(reg3)); - assert(isVectorRegister(reg4)); assert(isScalableVectorSize(size)); - if (sopt == INS_SCALABLE_OPTS_MOD_N) + if (isGeneralRegisterOrZR(reg4)) { - fmt = IF_SVE_IU_4A; + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + + if (opt == INS_OPTS_SCALABLE_Q) + { + assert(ins == INS_sve_ld1d); + assert(!"not implemented"); + // TODO + } + else + { + assert(opt == INS_OPTS_SCALABLE_D); + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isGeneralRegister(reg3)); + + switch (ins) + { + case INS_sve_ldff1d: + case INS_sve_ldff1sw: + fmt = IF_SVE_IG_4A; + break; + + default: + assert(!"Invalid instruction"); + break; + } + } } else { - assert(insScalableOptsNone(sopt)); - if (ins == INS_sve_ld1d) + assert(insOptsScalableDoubleWord32bitExtends(opt)); + assert(isVectorRegister(reg4)); + + if (sopt == INS_SCALABLE_OPTS_MOD_N) { - fmt = IF_SVE_IU_4A_C; + fmt = IF_SVE_IU_4A; } else { - fmt = IF_SVE_IU_4A_A; + assert(insScalableOptsNone(sopt)); + if (ins == INS_sve_ld1d) + { + fmt = IF_SVE_IU_4A_C; + } + else + { + fmt = IF_SVE_IU_4A_A; + } } } break; @@ -11829,7 +11930,6 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isVectorRegister(reg3)); assert(isGeneralRegisterOrZR(reg4)); assert(isScalableVectorSize(size)); - assert(insScalableOptsNone(sopt)); if (opt == INS_OPTS_SCALABLE_S) @@ -15023,6 +15123,7 @@ void emitter::emitIns_Call(EmitCallType callType, case IF_SVE_IU_4A_A: case IF_SVE_IU_4B_B: case IF_SVE_HX_3A_B: + case IF_SVE_IG_4A: case IF_SVE_IG_4A_D: case IF_SVE_IG_4A_E: case IF_SVE_IF_4A: @@ -15230,7 +15331,7 @@ void emitter::emitIns_Call(EmitCallType callType, 
/***************************************************************************** * * Returns true if the SVE instruction has a LSL addr. - * This is for formats that have [, , LSL #N] + * This is for formats that have [, , LSL #N], [{, , LSL #N}] */ /*static*/ bool emitter::insSveIsLslN(instruction ins, insFormat fmt) { @@ -15274,6 +15375,41 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_IG_4A: + switch (ins) + { + case INS_sve_ldff1d: + case INS_sve_ldff1sw: + return true; + + default: + break; + } + break; + + case IF_SVE_IG_4A_F: + switch (ins) + { + case INS_sve_ldff1sh: + case INS_sve_ldff1w: + return true; + + default: + break; + } + break; + + case IF_SVE_IG_4A_G: + switch (ins) + { + case INS_sve_ldff1h: + return true; + + default: + break; + } + break; + default: break; } @@ -15415,7 +15551,8 @@ void emitter::emitIns_Call(EmitCallType callType, /***************************************************************************** * * Returns 0, 1, 2 or 3 depending on the instruction and format. 
- * This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N] + * This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N], + * [{, , LSL #N}] */ /*static*/ int emitter::insSveGetLslOrModN(instruction ins, insFormat fmt) @@ -15564,6 +15701,51 @@ void emitter::emitIns_Call(EmitCallType callType, } return 0; + case IF_SVE_IG_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ldff1sw: + return 2; + + case INS_sve_ldff1d: + return 3; + + default: + break; + } + break; + + case IF_SVE_IG_4A_F: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ldff1sh: + return 1; + + case INS_sve_ldff1w: + return 2; + + default: + break; + } + break; + + case IF_SVE_IG_4A_G: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ldff1h: + return 1; + + default: + break; + } + break; + default: break; } @@ -15591,6 +15773,11 @@ void emitter::emitIns_Call(EmitCallType callType, case INS_sve_ldnf1h: case INS_sve_ldnf1sb: case INS_sve_ldnf1b: + case INS_sve_ldff1b: + case INS_sve_ldff1sb: + case INS_sve_ldff1h: + case INS_sve_ldff1sh: + case INS_sve_ldff1w: return true; default: @@ -15614,6 +15801,7 @@ void emitter::emitIns_Call(EmitCallType callType, { case INS_sve_ld1b: case INS_sve_ldnf1b: + case INS_sve_ldff1b: return code; // By default, the instruction already encodes 8-bit. default: @@ -15628,10 +15816,13 @@ void emitter::emitIns_Call(EmitCallType callType, case INS_sve_ld1h: case INS_sve_ldnf1b: case INS_sve_ldnf1h: + case INS_sve_ldff1b: + case INS_sve_ldff1h: return code | (1 << 21); // Set bit '21' to 1. case INS_sve_ld1sb: case INS_sve_ldnf1sb: + case INS_sve_ldff1sb: return code | (1 << 22); // Set bit '22' to 1. default: @@ -15648,18 +15839,23 @@ void emitter::emitIns_Call(EmitCallType callType, return (code | (1 << 15)) | (1 << 22); // Set bit '22' and '15' to 1. 
case INS_sve_ldnf1w: + case INS_sve_ldff1w: return code; // By default, the instruction already encodes 32-bit. case INS_sve_ld1b: case INS_sve_ld1h: case INS_sve_ldnf1b: case INS_sve_ldnf1h: + case INS_sve_ldff1b: + case INS_sve_ldff1h: return code | (1 << 22); // Set bit '22' to 1. case INS_sve_ld1sb: case INS_sve_ld1sh: case INS_sve_ldnf1sb: case INS_sve_ldnf1sh: + case INS_sve_ldff1sb: + case INS_sve_ldff1sh: return code | (1 << 21); // Set bit '21' to 1. default: @@ -15676,18 +15872,23 @@ void emitter::emitIns_Call(EmitCallType callType, return ((code | (1 << 15)) | (1 << 22)) | (1 << 21); // Set bit '22', '21' and '15' to 1. case INS_sve_ldnf1w: + case INS_sve_ldff1w: return code | (1 << 21); // Set bit '21' to 1. Set bit '15' to 1. case INS_sve_ld1b: case INS_sve_ld1h: case INS_sve_ldnf1b: case INS_sve_ldnf1h: + case INS_sve_ldff1b: + case INS_sve_ldff1h: return (code | (1 << 22)) | (1 << 21); // Set bit '22' and '21' to 1. case INS_sve_ld1sb: case INS_sve_ld1sh: case INS_sve_ldnf1sb: case INS_sve_ldnf1sh: + case INS_sve_ldff1sb: + case INS_sve_ldff1sh: return code; // By default, the instruction already encodes 64-bit. 
default: @@ -18452,6 +18653,29 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; + case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) + case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_E: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + code = emitInsCodeSve(ins, fmt); + code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt + code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg + code |= insEncodeReg_R_9_to_5(id->idReg3()); // nnnnn + code |= insEncodeReg_R_20_to_16(id->idReg4()); // mmmmm + + if (canEncodeSveElemsize_dtype(ins)) + { + code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code); + } + + dst += emitOutput_Instr(dst, code); + break; + default: assert(!"Unexpected format"); break; @@ -18932,7 +19156,8 @@ void emitter::emitDispSveExtendOptsModN(insOpts opt, int n) /***************************************************************************** * * Prints the encoding for the or LSL encoding along with the N value - * This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N] + * This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N], + * [{, , LSL #N}] */ void emitter::emitDispSveModAddr(instruction ins, regNumber reg1, regNumber reg2, insOpts opt, insFormat fmt) { @@ -18944,7 +19169,7 @@ void emitter::emitDispSveModAddr(instruction ins, regNumber reg1, regNumber reg2 } else { - emitDispReg(reg1, EA_8BYTE, true); + emitDispReg(reg1, EA_8BYTE, reg2 != REG_ZR); } if (isVectorRegister(reg2)) @@ -18961,7 +19186,8 @@ 
void emitter::emitDispSveModAddr(instruction ins, regNumber reg1, regNumber reg2 emitDispComma(); emitDispSveExtendOptsModN(opt, insSveGetLslOrModN(ins, fmt)); } - else if (insSveIsLslN(ins, fmt)) + // Omit 'lsl #N' only if the second register is ZR. + else if ((reg2 != REG_ZR) && insSveIsLslN(ins, fmt)) { emitDispComma(); switch (insSveGetLslOrModN(ins, fmt)) @@ -21447,13 +21673,29 @@ void emitter::emitDispInsHelp( // {.D }, /Z, [.D{, }] case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus // scalar) + // {.D }, /Z, [{, , LSL #3}] + // {.D }, /Z, [{, , LSL #2}] case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) + // {.H }, /Z, [{, }] + // {.S }, /Z, [{, }] + // {.D }, /Z, [{, }] case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) + // {.B }, /Z, [{, }] + // {.H }, /Z, [{, }] + // {.S }, /Z, [{, }] + // {.D }, /Z, [{, }] case IF_SVE_IG_4A_E: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) + // {.S }, /Z, [{, , LSL #1}] + // {.D }, /Z, [{, , LSL #1}] + // {.S }, /Z, [{, , LSL #2}] + // {.D }, /Z, [{, , LSL #2}] case IF_SVE_IG_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) + // {.H }, /Z, [{, , LSL #1}] + // {.S }, /Z, [{, , LSL #1}] + // {.D }, /Z, [{, , LSL #1}] case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 56dbe2e2c52d1c..30ee97e455d57e 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -508,7 +508,7 @@ static int insGetSveReg1ListSize(instruction ins); static PredicateType 
insGetPredicateType(insFormat fmt, int regpos = 0); // Returns true if the SVE instruction has a LSL addr. -// This is for formats that have [, , LSL #N] +// This is for formats that have [, , LSL #N], [{, , LSL #N}] static bool insSveIsLslN(instruction ins, insFormat fmt); // Returns true if the SVE instruction has a addr. @@ -516,7 +516,8 @@ static bool insSveIsLslN(instruction ins, insFormat fmt); static bool insSveIsModN(instruction ins, insFormat fmt); // Returns 0, 1, 2 or 3 depending on the instruction and format. -// This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N] +// This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N], +// [{, , LSL #N}] static int insSveGetLslOrModN(instruction ins, insFormat fmt); // Returns true if the specified instruction can encode the 'dtype' field. diff --git a/src/coreclr/jit/instrsarm64sve.h b/src/coreclr/jit/instrsarm64sve.h index ad5094bd141e36..1425cfad6c2312 100644 --- a/src/coreclr/jit/instrsarm64sve.h +++ b/src/coreclr/jit/instrsarm64sve.h @@ -214,7 +214,7 @@ INST8(ldff1w, "ldff1w", 0, IF_SV // LDFF1W {.D }, /Z, [, .D, LSL #2] SVE_HW_4B 11000101011mmmmm 111gggnnnnnttttt C560 E000 // LDFF1W {.D }, /Z, [, .D] SVE_HW_4B_D 11000101010mmmmm 111gggnnnnnttttt C540 E000 // LDFF1W {.D }, /Z, [.D{, #}] SVE_HX_3A_E 10000101001iiiii 111gggnnnnnttttt 8520 E000 - // LDFF1W {.D }, /Z, [{, , LSL #2}] SVE_IG_4A_F 10100101010mmmmm 011gggnnnnnttttt A540 6000 + // LDFF1W {.S }, /Z, [{, , LSL #2}] SVE_IG_4A_F 10100101010mmmmm 011gggnnnnnttttt A540 6000 // enum name info SVE_HW_4A SVE_HW_4A_A SVE_HW_4A_B SVE_HW_4A_C SVE_HW_4B SVE_HW_4B_D SVE_HX_3A_E SVE_IG_4A_G @@ -226,7 +226,7 @@ INST8(ldff1h, "ldff1h", 0, IF_SV // LDFF1H {.D }, /Z, [, .D, LSL #1] SVE_HW_4B 11000100111mmmmm 111gggnnnnnttttt C4E0 E000 // LDFF1H {.D }, /Z, [, .D] SVE_HW_4B_D 11000100110mmmmm 111gggnnnnnttttt C4C0 E000 // LDFF1H {.D }, /Z, [.D{, #}] SVE_HX_3A_E 10000100101iiiii 111gggnnnnnttttt 84A0 E000 - // LDFF1H {.D }, /Z, [{, , LSL #1}] 
SVE_IG_4A_G 10100100100mmmmm 011gggnnnnnttttt A480 6000 + // LDFF1H {.X }, /Z, [{, , LSL #1}] SVE_IG_4A_G 10100100100mmmmm 011gggnnnnnttttt A480 6000 // enum name info SVE_IJ_3A SVE_IK_4A SVE_IU_4A SVE_IU_4A_A SVE_IU_4B SVE_IU_4B_B SVE_IV_3A @@ -405,7 +405,7 @@ INST5(ldff1b, "ldff1b", 0, IF_SV // LDFF1B {.S }, /Z, [, .S, ] SVE_HW_4A_A 100001000h0mmmmm 011gggnnnnnttttt 8400 6000 // LDFF1B {.D }, /Z, [, .D] SVE_HW_4B 11000100010mmmmm 111gggnnnnnttttt C440 E000 // LDFF1B {.D }, /Z, [.D{, #}] SVE_HX_3A_B 10000100001iiiii 111gggnnnnnttttt 8420 E000 - // LDFF1B {.D }, /Z, [{, }] SVE_IG_4A_E 10100100000mmmmm 011gggnnnnnttttt A400 6000 + // LDFF1B {.B }, /Z, [{, }] SVE_IG_4A_E 10100100000mmmmm 011gggnnnnnttttt A400 6000 // enum name info SVE_AA_3A SVE_AU_3A SVE_BS_1A SVE_CZ_4A From d56e76039d517a698dd01c2e99141846287c1dd5 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 15:20:53 -0800 Subject: [PATCH 03/18] Added SVE_II_4A, SVE_II_4A_B, SVE_II_4A_H formats. Special casing ld1w for encoding elem size. 
--- src/coreclr/jit/codegenarm64test.cpp | 26 ++-- src/coreclr/jit/emitarm64.cpp | 205 +++++++++++++++++++++++---- src/coreclr/jit/emitarm64.h | 6 +- src/coreclr/jit/instrsarm64sve.h | 2 +- 4 files changed, 202 insertions(+), 37 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 047183c3b01363..1a056286ba7179 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6425,17 +6425,21 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ldff1h, EA_SCALABLE, REG_V3, REG_P1, REG_R4, REG_ZR, INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // LDFF1H {.D }, /Z, [{, , LSL #1}] - //// IF_SVE_II_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1D {.D }, /Z, [, , LSL #3] - - //// IF_SVE_II_4A_B - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1D {.Q }, /Z, [, , LSL #3] - - //// IF_SVE_II_4A_H - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1W {.D }, /Z, [, , LSL #2] + // IF_SVE_II_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P2, REG_R1, REG_R3, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1D {.D }, /Z, [, , LSL #3] + + // IF_SVE_II_4A_B + theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V1, REG_P0, REG_R3, REG_R4, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // LD1D {.Q }, /Z, [, , LSL #3] + + // IF_SVE_II_4A_H + theEmitter->emitIns_R_R_R_R(INS_sve_ld1w, EA_SCALABLE, REG_V5, REG_P3, REG_R4, REG_R1, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD1W {.S }, /Z, [, , LSL #2] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1w, EA_SCALABLE, REG_V5, REG_P3, REG_R4, REG_R1, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1W {.D }, /Z, [, , LSL #2] + 
theEmitter->emitIns_R_R_R_R(INS_sve_ld1w, EA_SCALABLE, REG_V5, REG_P3, REG_R4, REG_R1, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // LD1W {.Q }, /Z, [, , LSL #2] //// IF_SVE_IK_4A //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 8c33c3d75c6f14..c39149457ce4ce 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1739,8 +1739,35 @@ void emitter::emitInsSanityCheck(instrDesc* id) break; case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + elemsize = id->idOpSize(); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_D); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + elemsize = id->idOpSize(); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_Q); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + elemsize = id->idOpSize(); + assert(insOptsScalableWordsOrQuadwords(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus 
scalar) case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) @@ -1786,8 +1813,8 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(insOptsScalableWords(id->idInsOpt())); assert(isVectorRegister(id->idReg1())); // ttttt assert(isPredicateRegister(id->idReg2())); // ggg - assert(isVectorRegister(id->idReg3())); // nnnnn - assert(isGeneralRegisterOrZR(id->idReg4())); // mmmmm + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm assert(isScalableVectorSize(elemsize)); break; @@ -11807,6 +11834,11 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_IG_4A_F; break; + case INS_sve_ld1w: + assert(insOptsScalableWordsOrQuadwords(opt)); + fmt = IF_SVE_II_4A_H; + break; + default: assert(!"Invalid instruction"); break; @@ -11871,16 +11903,13 @@ void emitter::emitIns_R_R_R_R(instruction ins, if (opt == INS_OPTS_SCALABLE_Q) { + assert(reg4 != REG_ZR); assert(ins == INS_sve_ld1d); - assert(!"not implemented"); - // TODO + fmt = IF_SVE_II_4A_B; } else { assert(opt == INS_OPTS_SCALABLE_D); - assert(isVectorRegister(reg1)); - assert(isPredicateRegister(reg2)); - assert(isGeneralRegister(reg3)); switch (ins) { @@ -11889,6 +11918,11 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_IG_4A; break; + case INS_sve_ld1d: + assert(reg4 != REG_ZR); + fmt = IF_SVE_II_4A; + break; + default: assert(!"Invalid instruction"); break; @@ -15410,6 +15444,29 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_II_4A: + case IF_SVE_II_4A_B: + switch (ins) + { + case INS_sve_ld1d: + return true; + + default: + break; + } + break; + + case IF_SVE_II_4A_H: + switch (ins) + { + case INS_sve_ld1w: + return true; + + default: + break; + } + break; + default: break; } @@ -15746,6 +15803,33 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + 
case IF_SVE_II_4A: + case IF_SVE_II_4A_B: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld1d: + return 3; + + default: + break; + } + break; + + case IF_SVE_II_4A_H: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld1w: + return 2; + + default: + break; + } + break; + default: break; } @@ -15787,13 +15871,14 @@ void emitter::emitIns_Call(EmitCallType callType, /***************************************************************************** * - * Returns the encoding to select the 1/2/4/8/16 byte elemsize for an Arm64 Sve vector instruction + * Returns the encoding to select the 1/2/4/8 byte elemsize for an Arm64 Sve vector instruction * for the 'dtype' field. */ /*static*/ emitter::code_t emitter::insEncodeSveElemsize_dtype(instruction ins, emitAttr size, code_t code) { assert(canEncodeSveElemsize_dtype(ins)); + assert(ins != INS_sve_ld1w); switch (size) { case EA_1BYTE: @@ -15833,11 +15918,6 @@ void emitter::emitIns_Call(EmitCallType callType, case EA_4BYTE: switch (ins) { - case INS_sve_ld1w: - // Note: Bit '15' is not actually part of 'dtype', but it is necessary to set to '1' to get the - // proper encoding for S. - return (code | (1 << 15)) | (1 << 22); // Set bit '22' and '15' to 1. - case INS_sve_ldnf1w: case INS_sve_ldff1w: return code; // By default, the instruction already encodes 32-bit. @@ -15866,11 +15946,6 @@ void emitter::emitIns_Call(EmitCallType callType, case EA_8BYTE: switch (ins) { - case INS_sve_ld1w: - // Note: Bit '15' is not actually part of 'dtype', but it is necessary to set to '1' to get the - // proper encoding for D. - return ((code | (1 << 15)) | (1 << 22)) | (1 << 21); // Set bit '22', '21' and '15' to 1. - case INS_sve_ldnf1w: case INS_sve_ldff1w: return code | (1 << 21); // Set bit '21' to 1. Set bit '15' to 1. 
@@ -15896,21 +15971,83 @@ void emitter::emitIns_Call(EmitCallType callType, } return code; + default: + assert(!"Invalid size for encoding dtype."); + } + + return code; +} + +/***************************************************************************** + * + * Returns the encoding to select the 4/8/16 byte elemsize for the Arm64 Sve vector instruction 'ld1w' + * for the 'dtype' field. + */ + +/*static*/ emitter::code_t emitter::insEncodeSveElemsize_dtype_ld1w(instruction ins, insFormat fmt, emitAttr size, code_t code) +{ + assert(canEncodeSveElemsize_dtype(ins)); + assert(ins == INS_sve_ld1w); + switch (size) + { + case EA_4BYTE: + switch (fmt) + { + case IF_SVE_IH_3A_F: + // Note: Bit '15' is not actually part of 'dtype', but it is necessary to set to '1' to get the + // proper encoding for S. + return (code | (1 << 15)) | (1 << 22); // Set bit '22' and '15' to 1. + + case IF_SVE_II_4A_H: + // Note: Bit '14' is not actually part of 'dtype', but it is necessary to set to '1' to get the + // proper encoding for S. + return (code | (1 << 14)) | (1 << 22); // Set bit '22' and '14' to 1. + + default: + break; + } + break; + + case EA_8BYTE: + switch (fmt) + { + case IF_SVE_IH_3A_F: + // Note: Bit '15' is not actually part of 'dtype', but it is necessary to set to '1' to get the + // proper encoding for D. + return ((code | (1 << 15)) | (1 << 22)) | (1 << 21); // Set bit '22', '21' and '15' to 1. + + case IF_SVE_II_4A_H: + // Note: Bit '14' is not actually part of 'dtype', but it is necessary to set to '1' to get the + // proper encoding for D. + return ((code | (1 << 14)) | (1 << 22)) | (1 << 21); // Set bit '22', '21' and '14' to 1. + + default: + break; + } + break; + case EA_16BYTE: - switch (ins) + switch (fmt) { - case INS_sve_ld1w: + case IF_SVE_IH_3A_F: return code | (1 << 20); // Set bit '20' to 1. + case IF_SVE_II_4A_H: + // Note: Bit '15' is not actually part of 'dtype', but it is necessary to set to '1' to get the + // proper encoding for Q. 
+ return code | (1 << 15); // Set bit '15' to 1. + default: - assert(!"Invalid instruction for encoding dtype."); + break; } - return code; + break; default: assert(!"Invalid size for encoding dtype."); + break; } + assert(!"Invalid instruction format"); return code; } @@ -18515,7 +18652,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (canEncodeSveElemsize_dtype(ins)) { - code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code); + if (ins == INS_sve_ld1w) + { + code = insEncodeSveElemsize_dtype_ld1w(ins, fmt, optGetSveElemsize(id->idInsOpt()), code); + } + else + { + code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code); + } } dst += emitOutput_Instr(dst, code); @@ -18662,6 +18806,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // scalar) case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) + case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -18670,7 +18817,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (canEncodeSveElemsize_dtype(ins)) { - code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code); + if (ins == INS_sve_ld1w) + { + code = insEncodeSveElemsize_dtype_ld1w(ins, fmt, optGetSveElemsize(id->idInsOpt()), code); + } + else + { + code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code); + } } dst += emitOutput_Instr(dst, code); @@ -21698,8 +21852,11 @@ void emitter::emitDispInsHelp( 
// {.D }, /Z, [{, , LSL #1}] case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) + // {.D }, /Z, [, , LSL #3] case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + // {.Q }, /Z, [, , LSL #3] case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + // {.D }, /Z, [, , LSL #2] case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 30ee97e455d57e..d7cb0f918497eb 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -523,10 +523,14 @@ static int insSveGetLslOrModN(instruction ins, insFormat fmt); // Returns true if the specified instruction can encode the 'dtype' field. static bool canEncodeSveElemsize_dtype(instruction ins); -// Returns the encoding to select the 1/2/4/8/16 byte elemsize for an Arm64 Sve vector instruction +// Returns the encoding to select the 1/2/4/8 byte elemsize for an Arm64 Sve vector instruction // for the 'dtype' field. static code_t insEncodeSveElemsize_dtype(instruction ins, emitAttr size, code_t code); +// Returns the encoding to select the 4/8/16 byte elemsize for the Arm64 Sve vector instruction 'ld1w' +// for the 'dtype' field. +static code_t insEncodeSveElemsize_dtype_ld1w(instruction ins, insFormat fmt, emitAttr size, code_t code); + // Returns the encoding for the immediate value as 4-bits at bit locations '19-16'. 
static code_t insEncodeSimm4_19_to_16(ssize_t imm); diff --git a/src/coreclr/jit/instrsarm64sve.h b/src/coreclr/jit/instrsarm64sve.h index 1425cfad6c2312..a4a5a1da9d8390 100644 --- a/src/coreclr/jit/instrsarm64sve.h +++ b/src/coreclr/jit/instrsarm64sve.h @@ -141,7 +141,7 @@ INST9(ld1w, "ld1w", 0, IF_SV // LD1W {.D }, /Z, [, .D] SVE_HW_4B_D 11000101010mmmmm 110gggnnnnnttttt C540 C000 // LD1W {.D }, /Z, [.D{, #}] SVE_HX_3A_E 10000101001iiiii 110gggnnnnnttttt 8520 C000 // LD1W {.X }, /Z, [{, #, MUL VL}] SVE_IH_3A_F 101001010000iiii 001gggnnnnnttttt A500 2000 - // LD1W {.D }, /Z, [, , LSL #2] SVE_II_4A_H 10100101000mmmmm 000gggnnnnnttttt A500 0000 + // LD1W {.X }, /Z, [, , LSL #2] SVE_II_4A_H 10100101000mmmmm 000gggnnnnnttttt A500 0000 // enum name info SVE_IH_3A SVE_IH_3A_A SVE_II_4A SVE_II_4A_B SVE_IU_4A SVE_IU_4A_C SVE_IU_4B SVE_IU_4B_D SVE_IV_3A From 61e18dac89e7c4a8042b027fb3ee14b8a2a3a109 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 16:01:13 -0800 Subject: [PATCH 04/18] Added SVE_IK_4A to SVE_IK_4A_I formats. 
--- src/coreclr/jit/codegenarm64test.cpp | 54 +++++++---- src/coreclr/jit/emitarm64.cpp | 128 ++++++++++++++++++++++++++- src/coreclr/jit/instrsarm64sve.h | 4 +- 3 files changed, 164 insertions(+), 22 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 1a056286ba7179..fbe33915738542 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6441,25 +6441,41 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ld1w, EA_SCALABLE, REG_V5, REG_P3, REG_R4, REG_R1, INS_OPTS_SCALABLE_Q, INS_SCALABLE_OPTS_LSL_N); // LD1W {.Q }, /Z, [, , LSL #2] - //// IF_SVE_IK_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1SW {.D }, /Z, [, , LSL #2] - - //// IF_SVE_IK_4A_F - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1SB {.D }, /Z, [, ] - - //// IF_SVE_IK_4A_G - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1SH {.D }, /Z, [, , LSL #1] - - //// IF_SVE_IK_4A_H - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1B {.D }, /Z, [, ] - - //// IF_SVE_IK_4A_I - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1H {.D }, /Z, [, , LSL #1] + // IF_SVE_IK_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1SW {.D }, /Z, [, , LSL #2] + + // IF_SVE_IK_4A_F + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sb, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_R4, + INS_OPTS_SCALABLE_H); // LD1SB {.H }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sb, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_R4, + INS_OPTS_SCALABLE_S); // LD1SB 
{.S }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sb, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_R4, + INS_OPTS_SCALABLE_D); // LD1SB {.D }, /Z, [, ] + + // IF_SVE_IK_4A_G + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sh, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_R5, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD1SH {.S }, /Z, [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sh, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_R5, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1SH {.D }, /Z, [, , LSL #1] + + // IF_SVE_IK_4A_H + theEmitter->emitIns_R_R_R_R(INS_sve_ld1b, EA_SCALABLE, REG_V3, REG_P4, REG_R5, REG_R6, + INS_OPTS_SCALABLE_B); // LD1B {.B }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1b, EA_SCALABLE, REG_V3, REG_P4, REG_R5, REG_R6, + INS_OPTS_SCALABLE_H); // LD1B {.H }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1b, EA_SCALABLE, REG_V3, REG_P4, REG_R5, REG_R6, + INS_OPTS_SCALABLE_S); // LD1B {.S }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1b, EA_SCALABLE, REG_V3, REG_P4, REG_R5, REG_R6, + INS_OPTS_SCALABLE_D); // LD1B {.D }, /Z, [, ] + + // IF_SVE_IK_4A_I + theEmitter->emitIns_R_R_R_R(INS_sve_ld1h, EA_SCALABLE, REG_V4, REG_P2, REG_R3, REG_R1, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // LD1H {.H }, /Z, [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1h, EA_SCALABLE, REG_V4, REG_P2, REG_R3, REG_R1, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD1H {.S }, /Z, [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1h, EA_SCALABLE, REG_V4, REG_P2, REG_R3, REG_R1, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1H {.D }, /Z, [, , LSL #1] //// IF_SVE_IN_4A //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index c39149457ce4ce..ed691881d6679d 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1773,6 +1773,15 @@ void emitter::emitInsSanityCheck(instrDesc* id) case 
IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + elemsize = id->idOpSize(); + assert(insOptsScalableStandard(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus @@ -1810,7 +1819,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled // offsets) elemsize = id->idOpSize(); - assert(insOptsScalableWords(id->idInsOpt())); + assert(insOptsScalableStandard(id->idInsOpt())); assert(isVectorRegister(id->idReg1())); // ttttt assert(isPredicateRegister(id->idReg2())); // ggg assert(isGeneralRegister(id->idReg3())); // nnnnn @@ -11781,6 +11790,16 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_IG_4A_D; break; + case INS_sve_ld1sb: + assert(insOptsScalableAtLeastHalf(opt)); + fmt = IF_SVE_IK_4A_F; + break; + + case INS_sve_ld1b: + assert(insOptsScalableStandard(opt)); + fmt = IF_SVE_IK_4A_H; + break; + default: assert(!"Invalid instruction"); break; @@ -11839,6 +11858,16 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_II_4A_H; break; + case INS_sve_ld1sh: + assert(insOptsScalableWords(opt)); + fmt = IF_SVE_IK_4A_G; + break; + + case 
INS_sve_ld1h: + assert(insOptsScalableAtLeastHalf(opt)); + fmt = IF_SVE_IK_4A_I; + break; + default: assert(!"Invalid instruction"); break; @@ -11923,6 +11952,11 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_II_4A; break; + case INS_sve_ld1sw: + assert(reg4 != REG_ZR); + fmt = IF_SVE_IK_4A; + break; + default: assert(!"Invalid instruction"); break; @@ -15154,6 +15188,8 @@ void emitter::emitIns_Call(EmitCallType callType, case IF_SVE_IG_4A_G: case IF_SVE_IJ_3A: case IF_SVE_IK_4A: + case IF_SVE_IK_4A_F: + case IF_SVE_IK_4A_H: case IF_SVE_IU_4A_A: case IF_SVE_IU_4B_B: case IF_SVE_HX_3A_B: @@ -15467,6 +15503,39 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_IK_4A: + switch (ins) + { + case INS_sve_ld1sw: + return true; + + default: + break; + } + break; + + case IF_SVE_IK_4A_G: + switch (ins) + { + case INS_sve_ld1sh: + return true; + + default: + break; + } + break; + + case IF_SVE_IK_4A_I: + switch (ins) + { + case INS_sve_ld1h: + return true; + + default: + break; + } + break; + default: break; } @@ -15830,6 +15899,45 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_IK_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld1sw: + return 2; + + default: + break; + } + break; + + case IF_SVE_IK_4A_G: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld1sh: + return 1; + + default: + break; + } + break; + + case IF_SVE_IK_4A_I: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld1h: + return 1; + + default: + break; + } + break; + default: break; } @@ -18809,6 +18917,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load 
(quadwords, scalar plus scalar) case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -21858,10 +21971,23 @@ void emitter::emitDispInsHelp( case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) // {.D }, /Z, [, , LSL #2] case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + // {.D }, /Z, [, , LSL #2] case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + // {.H }, /Z, [, ] + // {.S }, /Z, [, ] + // {.D }, /Z, [, ] case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + // {.S }, /Z, [, , LSL #1] + // {.D }, /Z, [, , LSL #1] case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + // {.B }, /Z, [, ] + // {.H }, /Z, [, ] + // {.S }, /Z, [, ] + // {.D }, /Z, [, ] case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + // {.H }, /Z, [, , LSL #1] + // {.S }, /Z, [, , LSL #1] + // {.D }, /Z, [, , LSL #1] case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt
-- SVE contiguous non-temporal load (scalar plus scalar) case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) diff --git a/src/coreclr/jit/instrsarm64sve.h b/src/coreclr/jit/instrsarm64sve.h index a4a5a1da9d8390..594709ff9063a2 100644 --- a/src/coreclr/jit/instrsarm64sve.h +++ b/src/coreclr/jit/instrsarm64sve.h @@ -128,7 +128,7 @@ INST9(ld1h, "ld1h", 0, IF_SV // LD1H {.D }, /Z, [, .D] SVE_HW_4B_D 11000100110mmmmm 110gggnnnnnttttt C4C0 C000 // LD1H {.D }, /Z, [.D{, #}] SVE_HX_3A_E 10000100101iiiii 110gggnnnnnttttt 84A0 C000 // LD1H {.X }, /Z, [{, #, MUL VL}] SVE_IJ_3A_G 101001001000iiii 101gggnnnnnttttt A480 A000 - // LD1H {.D }, /Z, [, , LSL #1] SVE_IK_4A_I 10100100100mmmmm 010gggnnnnnttttt A480 4000 + // LD1H {.X }, /Z, [, , LSL #1] SVE_IK_4A_I 10100100100mmmmm 010gggnnnnnttttt A480 4000 // enum name info SVE_HW_4A SVE_HW_4A_A SVE_HW_4A_B SVE_HW_4A_C SVE_HW_4B SVE_HW_4B_D SVE_HX_3A_E SVE_IH_3A_F SVE_II_4A_H @@ -277,7 +277,7 @@ INST6(ld1b, "ld1b", 0, IF_SV // LD1B {.D }, /Z, [, .D] SVE_HW_4B 11000100010mmmmm 110gggnnnnnttttt C440 C000 // LD1B {.D }, /Z, [.D{, #}] SVE_HX_3A_B 10000100001iiiii 110gggnnnnnttttt 8420 C000 // LD1B {.B }, /Z, [{, #, MUL VL}] SVE_IJ_3A_E 101001000000iiii 101gggnnnnnttttt A400 A000 - // LD1B {.D }, /Z, [, ] SVE_IK_4A_H 10100100000mmmmm 010gggnnnnnttttt A400 4000 + // LD1B {.B }, /Z, [, ] SVE_IK_4A_H 10100100000mmmmm 010gggnnnnnttttt A400 4000 // enum name info SVE_HY_3A SVE_HY_3A_A SVE_HY_3B SVE_HZ_2A_B SVE_IA_2A SVE_IB_3A From 70f2f23a0587b0aa0a30aa311ded927c5d089fd5 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 16:22:39 -0800 Subject: [PATCH 05/18] Added SVE_IN_4A format --- src/coreclr/jit/codegenarm64test.cpp | 18 ++--- src/coreclr/jit/emitarm64.cpp | 98 ++++++++++++++++++++++++---- 2 files changed, 96 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index fbe33915738542..b33a0b67c1de41 100644 
--- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6477,15 +6477,15 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ld1h, EA_SCALABLE, REG_V4, REG_P2, REG_R3, REG_R1, INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // LD1H {.D }, /Z, [, , LSL #1] - //// IF_SVE_IN_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDNT1B {.B }, /Z, [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDNT1D {.D }, /Z, [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDNT1H {.H }, /Z, [, , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDNT1W {.S }, /Z, [, , LSL #2] + // IF_SVE_IN_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1b, EA_SCALABLE, REG_V4, REG_P2, REG_R1, REG_R3, + INS_OPTS_SCALABLE_B); // LDNT1B {.B }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1d, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R4, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDNT1D {.D }, /Z, [, , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1h, EA_SCALABLE, REG_V0, REG_P3, REG_R4, REG_R5, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // LDNT1H {.H }, /Z, [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V2, REG_P0, REG_R3, REG_R1, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LDNT1W {.S }, /Z, [, , LSL #2] //// IF_SVE_IP_4A //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rob, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index ed691881d6679d..260af8d66d00e3 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11989,25 +11989,66 @@ void emitter::emitIns_R_R_R_R(instruction 
ins, case INS_sve_ldnt1b: case INS_sve_ldnt1h: + case INS_sve_ldnt1w: + case INS_sve_ldnt1d: case INS_sve_ldnt1sb: case INS_sve_ldnt1sh: - case INS_sve_ldnt1w: - assert(insOptsScalableWords(opt)); assert(isVectorRegister(reg1)); - assert(isPredicateRegister(reg2)); - assert(isVectorRegister(reg3)); - assert(isGeneralRegisterOrZR(reg4)); - assert(isScalableVectorSize(size)); - assert(insScalableOptsNone(sopt)); + assert(isPredicateRegister(reg2)); - if (opt == INS_OPTS_SCALABLE_S) + if (isGeneralRegister(reg3)) { - fmt = IF_SVE_IF_4A; + assert(isGeneralRegister(reg4)); + +#ifdef DEBUG + switch (ins) + { + case INS_sve_ldnt1b: + assert(opt == INS_OPTS_SCALABLE_B); + assert(insScalableOptsNone(sopt)); + break; + + case INS_sve_ldnt1h: + assert(opt == INS_OPTS_SCALABLE_H); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_ldnt1w: + assert(opt == INS_OPTS_SCALABLE_S); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_ldnt1d: + assert(opt == INS_OPTS_SCALABLE_D); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + default: + assert(!"Invalid instruction"); + break; + } +#endif // DEBUG + + fmt = IF_SVE_IN_4A; } else { - assert(opt == INS_OPTS_SCALABLE_D); - fmt = IF_SVE_IF_4A_A; + assert(ins != INS_sve_ldnt1d); + assert(insOptsScalableWords(opt)); + assert(isVectorRegister(reg3)); + assert(isGeneralRegisterOrZR(reg4)); + assert(isScalableVectorSize(size)); + assert(insScalableOptsNone(sopt)); + + if (opt == INS_OPTS_SCALABLE_S) + { + fmt = IF_SVE_IF_4A; + } + else + { + assert(opt == INS_OPTS_SCALABLE_D); + fmt = IF_SVE_IF_4A_A; + } } break; @@ -15536,6 +15577,19 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_IN_4A: + switch (ins) + { + case INS_sve_ldnt1d: + case INS_sve_ldnt1h: + case INS_sve_ldnt1w: + return true; + + default: + break; + } + break; + default: break; } @@ -15938,6 +15992,23 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_IN_4A: + assert(insSveIsLslN(ins, 
fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ldnt1h: + return 1; + case INS_sve_ldnt1w: + return 2; + case INS_sve_ldnt1d: + return 3; + + default: + break; + } + break; + default: break; } @@ -18922,6 +18993,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -21989,6 +22061,10 @@ void emitter::emitDispInsHelp( // {.S }, /Z, [, , LSL #1] // {.D }, /Z, [, , LSL #1] case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + // {.B }, /Z, [, ] + // {.H }, /Z, [, ] + // {.S }, /Z, [, ] + // {.D }, /Z, [, ] case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus From e3a8cc6474c7a389e36dcee4a276ecc58d964240 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 16:51:32 -0800 Subject: [PATCH 06/18] Preparing to implement more formats by writing out some of the boilerplate --- src/coreclr/jit/codegencommon.cpp | 6 +- src/coreclr/jit/emitarm64.cpp | 386 ++++++++++++++++++++++++++++++ 2 files changed, 389 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 
0537b050219a52..465b5d415dbb33 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -2143,12 +2143,12 @@ void CodeGen::genEmitUnwindDebugGCandEH() #endif // defined(LATE_DISASM) || defined(DEBUG) #ifdef LATE_DISASM - getDisAssembler().disAsmCode((BYTE*)*codePtr, (BYTE*)codePtrRW, finalHotCodeSize, (BYTE*)coldCodePtr, - (BYTE*)coldCodePtrRW, finalColdCodeSize); + //getDisAssembler().disAsmCode((BYTE*)*codePtr, (BYTE*)codePtrRW, finalHotCodeSize, (BYTE*)coldCodePtr, + // (BYTE*)coldCodePtrRW, finalColdCodeSize); #endif // LATE_DISASM #ifdef DEBUG - if (JitConfig.JitRawHexCode().contains(compiler->info.compMethodHnd, compiler->info.compClassHnd, + if (compiler->opts.altJit && JitConfig.JitRawHexCode().contains(compiler->info.compMethodHnd, compiler->info.compClassHnd, &compiler->info.compMethodInfo->args)) { // NOTE: code in cold region is not supported. diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 260af8d66d00e3..20ee3375d32921 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -15590,6 +15590,149 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_IP_4A: + switch (ins) + { + case INS_sve_ld1roh: + case INS_sve_ld1row: + case INS_sve_ld1rod: + case INS_sve_ld1rqh: + case INS_sve_ld1rqw: + case INS_sve_ld1rqd: + return true; + + default: + break; + } + break; + + case IF_SVE_IR_4A: + switch (ins) + { + case INS_sve_ld2q: + case INS_sve_ld3q: + case INS_sve_ld4q: + return true; + + default: + break; + } + break; + + case IF_SVE_IT_4A: + switch (ins) + { + case INS_sve_ld2h: + case INS_sve_ld2w: + case INS_sve_ld2d: + case INS_sve_ld3h: + case INS_sve_ld3w: + case INS_sve_ld3d: + case INS_sve_ld4h: + case INS_sve_ld4w: + case INS_sve_ld4d: + return true; + + default: + break; + } + break; + + case IF_SVE_IU_4B: + switch (ins) + { + case INS_sve_ld1sw: + case INS_sve_ldff1sw: + case INS_sve_ld1d: + case INS_sve_ldff1d: + return true; + + 
default: + break; + } + break; + + case IF_SVE_JB_4A: + switch (ins) + { + case INS_sve_stnt1h: + case INS_sve_stnt1w: + case INS_sve_stnt1d: + return true; + + default: + break; + } + break; + + case IF_SVE_JC_4A: + switch (ins) + { + case INS_sve_st2h: + case INS_sve_st2w: + case INS_sve_st2d: + case INS_sve_st3h: + case INS_sve_st3w: + case INS_sve_st3d: + case INS_sve_st4h: + case INS_sve_st4w: + case INS_sve_st4d: + return true; + + default: + break; + } + break; + + case IF_SVE_JD_4C: + switch (ins) + { + case INS_sve_st1w: + case INS_sve_st1d: + return true; + + default: + break; + } + break; + + case IF_SVE_JD_4C_A: + switch (ins) + { + case INS_sve_st1d: + return true; + + default: + break; + } + break; + + case IF_SVE_JF_4A: + switch (ins) + { + case INS_sve_st2q: + case INS_sve_st3q: + case INS_sve_st4q: + return true; + + default: + break; + } + break; + + case IF_SVE_JJ_4B: + switch (ins) + { + case INS_sve_st1h: + case INS_sve_st1w: + case INS_sve_st1d: + return true; + + default: + break; + } + break; + default: break; } @@ -16009,6 +16152,192 @@ void emitter::emitIns_Call(EmitCallType callType, } break; + case IF_SVE_IP_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld1roh: + case INS_sve_ld1rqh: + return 1; + + case INS_sve_ld1row: + case INS_sve_ld1rqw: + return 2; + case INS_sve_ld1rod: + case INS_sve_ld1rqd: + return 3; + + default: + break; + } + break; + + case IF_SVE_IR_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld2q: + case INS_sve_ld3q: + case INS_sve_ld4q: + return 4; + + default: + break; + } + break; + + case IF_SVE_IT_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld2h: + case INS_sve_ld3h: + case INS_sve_ld4h: + return 1; + + case INS_sve_ld2w: + case INS_sve_ld3w: + case INS_sve_ld4w: + return 2; + + case INS_sve_ld2d: + case INS_sve_ld3d: + case 
INS_sve_ld4d: + return 3; + + default: + break; + } + break; + + case IF_SVE_IU_4B: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_ld1sw: + case INS_sve_ldff1sw: + return 2; + + case INS_sve_ld1d: + case INS_sve_ldff1d: + return 3; + + default: + break; + } + break; + + case IF_SVE_JB_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_stnt1h: + return 1; + + case INS_sve_stnt1w: + return 2; + + case INS_sve_stnt1d: + return 3; + + default: + break; + } + break; + + case IF_SVE_JC_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_st2h: + case INS_sve_st3h: + case INS_sve_st4h: + return 1; + + case INS_sve_st2w: + case INS_sve_st3w: + case INS_sve_st4w: + return 2; + + case INS_sve_st2d: + case INS_sve_st3d: + case INS_sve_st4d: + return 3; + + default: + break; + } + break; + + case IF_SVE_JD_4C: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_st1w: + return 2; + + case INS_sve_st1d: + return 3; + + default: + break; + } + break; + + case IF_SVE_JD_4C_A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_st1d: + return 3; + + default: + break; + } + break; + + case IF_SVE_JF_4A: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_st2q: + case INS_sve_st3q: + case INS_sve_st4q: + return 4; + + default: + break; + } + break; + + case IF_SVE_JJ_4B: + assert(insSveIsLslN(ins, fmt)); + assert(!insSveIsModN(ins, fmt)); + switch (ins) + { + case INS_sve_st1h: + return 1; + + case INS_sve_st1w: + return 2; + + case INS_sve_st1d: + return 3; + + default: + break; + } + break; + default: break; } @@ -22066,39 +22395,96 @@ void emitter::emitDispInsHelp( // {.S }, /Z, [, ] // {.D }, /Z, [, ] case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE 
contiguous non-temporal load (scalar plus scalar) + // {.B }, /Z, [, ] + // {.H }, /Z, [, ] + // {.S }, /Z, [, ] + // {.D }, /Z, [, ] case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) + // {.Q, .Q }, /Z, [, , LSL #4] + // {.Q, .Q, .Q }, /Z, [, , LSL #4] + // {.Q, .Q, .Q, .Q }, /Z, [, , LSL #4] case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus // scalar) + // {.B, .B }, /Z, [, ] + // {.H, .H }, /Z, [, , LSL #1] + // {.S, .S }, /Z, [, , LSL #2] + // {.D, .D }, /Z, [, , LSL #3] + // {.B, .B, .B }, /Z, [, ] + // {.H, .H, .H }, /Z, [, , LSL #1] + // {.S, .S, .S }, /Z, [, , LSL #2] + // {.D, .D, .D }, /Z, [, , LSL #3] + // {.B, .B, .B, .B }, /Z, [, ] + // {.H, .H, .H, .H }, /Z, [, , LSL #1] + // {.S, .S, .S, .S }, /Z, [, , LSL #2] + // {.D, .D, .D, .D }, /Z, [, , LSL #3] case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) + // {.D }, /Z, [, .D, LSL #2] + // {.D }, /Z, [, .D, LSL #3] case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) + // {.D }, /Z, [, .D] case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) + // {.D }, /Z, [, .D] case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) + // {.Q }, /Z, [.D{, }] case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + // {.D }, /Z, [.D{, }] case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus // scalar) + // {.Q }, , [.D{, }] case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) + // {.S }, , [.S{, }] case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- 
SVE2 32-bit scatter non-temporal store (vector plus // scalar) + // {.D }, , [.D{, }] case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) + // {.D }, , [.D{, }] case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus // scalar) + // {.B }, , [, ] + // {.H }, , [, , LSL #1] + // {.S }, , [, , LSL #2] + // {.D }, , [, , LSL #3] case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus // scalar) + // {.B, .B }, , [, ] + // {.H, .H }, , [, , LSL #1] + // {.S, .S }, , [, , LSL #2] + // {.D, .D }, , [, , LSL #3] + // {.B, .B, .B }, , [, ] + // {.H, .H, .H }, , [, , LSL #1] + // {.S, .S, .S }, , [, , LSL #2] + // {.D, .D, .D }, , [, , LSL #3] + // {.B, .B, .B, .B }, , [, ] + // {.H, .H, .H, .H }, , [, , LSL #1] + // {.S, .S, .S, .S }, , [, , LSL #2] + // {.D, .D, .D, .D }, , [, , LSL #3] case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + // {.Q }, , [, , LSL #2] + // {.D }, , [, , LSL #3] case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + // {.Q }, , [, , LSL #3] case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + // {.Q, .Q }, , [, , LSL #4] + // {.Q, .Q, .Q }, , [, , LSL #4] + // {.Q, .Q, .Q, .Q }, , [, , LSL #4] case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus // scalar) + // {.D }, , [, .D, LSL #1] + // {.D }, , [, .D, LSL #2] + // {.D }, , [, .D, LSL #3] case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled // offsets) + // {.D }, , [, .D] case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled // offsets) + // {.D }, , [, .D] case IF_SVE_JJ_4B_E: // 
...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled // offsets) + // {.D }, , [, .D] case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled // offsets) emitDispSveConsecutiveRegList(id->idReg1(), insGetSveReg1ListSize(ins), id->idInsOpt(), true); // ttttt From 881e63b10ad7d6c224311cab9bed6410508fc02e Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 17:08:36 -0800 Subject: [PATCH 07/18] Added SVE_IP_4A format --- src/coreclr/jit/codegenarm64test.cpp | 34 +++++----- src/coreclr/jit/codegencommon.cpp | 6 +- src/coreclr/jit/emitarm64.cpp | 95 +++++++++++++++++++++++++++- 3 files changed, 114 insertions(+), 21 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index b33a0b67c1de41..0e402a66246db7 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6487,23 +6487,23 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1w, EA_SCALABLE, REG_V2, REG_P0, REG_R3, REG_R1, INS_OPTS_SCALABLE_S, INS_SCALABLE_OPTS_LSL_N); // LDNT1W {.S }, /Z, [, , LSL #2] - //// IF_SVE_IP_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rob, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1ROB {.B }, /Z, [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rod, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1ROD {.D }, /Z, [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1roh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1ROH {.H }, /Z, [, , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1row, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1ROW {.S }, /Z, [, , LSL #2] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqb, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1RQB {.B }, /Z, [, ] - 
//theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqd, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1RQD {.D }, /Z, [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqh, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1RQH {.H }, /Z, [, , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1RQW {.S }, /Z, [, , LSL #2] + // IF_SVE_IP_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ld1rob, EA_SCALABLE, REG_V0, REG_P1, REG_R3, REG_R2, + INS_OPTS_SCALABLE_B); // LD1ROB {.B }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1rod, EA_SCALABLE, REG_V0, REG_P2, REG_R1, REG_R3, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1ROD {.D }, /Z, [, , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1roh, EA_SCALABLE, REG_V4, REG_P3, REG_R2, REG_R1, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // LD1ROH {.H }, /Z, [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1row, EA_SCALABLE, REG_V1, REG_P3, REG_R2, REG_R4, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD1ROW {.S }, /Z, [, , LSL #2] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqb, EA_SCALABLE, REG_V3, REG_P1, REG_R4, REG_R2, + INS_OPTS_SCALABLE_B); // LD1RQB {.B }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqd, EA_SCALABLE, REG_V2, REG_P3, REG_R1, REG_R4, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1RQD {.D }, /Z, [, , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqh, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_R4, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // LD1RQH {.H }, /Z, [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqw, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD1RQW {.S }, /Z, [, , LSL #2] //// IF_SVE_IR_4A //theEmitter->emitIns_R_R_R_R(INS_sve_ld2q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 
465b5d415dbb33..0537b050219a52 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -2143,12 +2143,12 @@ void CodeGen::genEmitUnwindDebugGCandEH() #endif // defined(LATE_DISASM) || defined(DEBUG) #ifdef LATE_DISASM - //getDisAssembler().disAsmCode((BYTE*)*codePtr, (BYTE*)codePtrRW, finalHotCodeSize, (BYTE*)coldCodePtr, - // (BYTE*)coldCodePtrRW, finalColdCodeSize); + getDisAssembler().disAsmCode((BYTE*)*codePtr, (BYTE*)codePtrRW, finalHotCodeSize, (BYTE*)coldCodePtr, + (BYTE*)coldCodePtrRW, finalColdCodeSize); #endif // LATE_DISASM #ifdef DEBUG - if (compiler->opts.altJit && JitConfig.JitRawHexCode().contains(compiler->info.compMethodHnd, compiler->info.compClassHnd, + if (JitConfig.JitRawHexCode().contains(compiler->info.compMethodHnd, compiler->info.compClassHnd, &compiler->info.compMethodInfo->args)) { // NOTE: code in cold region is not supported. diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 20ee3375d32921..d36e3b1a06116b 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11995,6 +11995,7 @@ void emitter::emitIns_R_R_R_R(instruction ins, case INS_sve_ldnt1sh: assert(isVectorRegister(reg1)); assert(isPredicateRegister(reg2)); + assert(isScalableVectorSize(size)); if (isGeneralRegister(reg3)) { @@ -12037,7 +12038,6 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(insOptsScalableWords(opt)); assert(isVectorRegister(reg3)); assert(isGeneralRegisterOrZR(reg4)); - assert(isScalableVectorSize(size)); assert(insScalableOptsNone(sopt)); if (opt == INS_OPTS_SCALABLE_S) @@ -12052,6 +12052,56 @@ void emitter::emitIns_R_R_R_R(instruction ins, } break; + case INS_sve_ld1rob: + case INS_sve_ld1roh: + case INS_sve_ld1row: + case INS_sve_ld1rod: + case INS_sve_ld1rqb: + case INS_sve_ld1rqh: + case INS_sve_ld1rqw: + case INS_sve_ld1rqd: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isGeneralRegister(reg3)); + 
assert(isGeneralRegister(reg4)); + assert(isScalableVectorSize(size)); + +#ifdef DEBUG + switch (ins) + { + case INS_sve_ld1rob: + case INS_sve_ld1rqb: + assert(opt == INS_OPTS_SCALABLE_B); + assert(insScalableOptsNone(sopt)); + break; + + case INS_sve_ld1roh: + case INS_sve_ld1rqh: + assert(opt == INS_OPTS_SCALABLE_H); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_ld1row: + case INS_sve_ld1rqw: + assert(opt == INS_OPTS_SCALABLE_S); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_ld1rod: + case INS_sve_ld1rqd: + assert(opt == INS_OPTS_SCALABLE_D); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + default: + assert(!"Invalid instruction"); + break; + } +#endif // DEBUG + + fmt = IF_SVE_IP_4A; + break; + default: unreached(); break; @@ -19323,6 +19373,41 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) + case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) + case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus + // scalar) + case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) + case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IW_4A: // 
...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus + // scalar) + case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) + case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) + case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus + // scalar) + case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled + // offsets) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -25613,6 +25698,7 @@ emitter::insExecutionCharacteristics 
emitter::getInsExecutionCharacteristics(ins perfScoreUnhandledInstruction(id, &result); break; } + break; case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus // scalar) @@ -25635,6 +25721,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins perfScoreUnhandledInstruction(id, &result); break; } + break; case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) switch (ins) @@ -25692,6 +25779,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins perfScoreUnhandledInstruction(id, &result); break; } + break; case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) @@ -25715,6 +25803,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins perfScoreUnhandledInstruction(id, &result); break; } + break; + case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus // scalar) result.insThroughput = PERFSCORE_THROUGHPUT_2X; @@ -25733,6 +25823,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins perfScoreUnhandledInstruction(id, &result); break; } + break; case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) @@ -25804,6 +25895,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins perfScoreUnhandledInstruction(id, &result); break; } + break; case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus // scalar) @@ -25826,6 +25918,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins perfScoreUnhandledInstruction(id, &result); break; } + break; case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 
64-bit scaled // offsets) From 1abdaa0c5f0e586c4c08aaef23abec16e46c90a1 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 17:25:02 -0800 Subject: [PATCH 08/18] Added SVE_IR_4A format --- src/coreclr/jit/codegenarm64test.cpp | 20 ++++++++------- src/coreclr/jit/emitarm64.cpp | 38 +++++++++++++++++++++++++--- src/coreclr/jit/emitarm64.h | 2 +- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 0e402a66246db7..d95343479eded1 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6505,15 +6505,17 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ld1rqw, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, INS_OPTS_SCALABLE_S, INS_SCALABLE_OPTS_LSL_N); // LD1RQW {.S }, /Z, [, , LSL #2] - //// IF_SVE_IR_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ld2q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD2Q {.Q, .Q }, /Z, [, , LSL #4] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld3q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD3Q {.Q, .Q, .Q }, /Z, [, , - // // LSL #4] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld4q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD4Q {.Q, .Q, .Q, .Q }, /Z, - // // [, , LSL #4] + // IF_SVE_IR_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ld2q, EA_SCALABLE, REG_V0, REG_P3, REG_R2, REG_R1, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // LD2Q {.Q, .Q }, /Z, [, , LSL + // #4] + theEmitter->emitIns_R_R_R_R(INS_sve_ld3q, EA_SCALABLE, REG_V3, REG_P4, REG_R1, REG_R2, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // LD3Q {.Q, .Q, .Q }, /Z, [, + // , + // LSL #4] + theEmitter->emitIns_R_R_R_R(INS_sve_ld4q, EA_SCALABLE, REG_V5, REG_P1, REG_R4, REG_R3, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // LD4Q {.Q, .Q, .Q, .Q }, /Z, + // [, , LSL #4] //// IF_SVE_IT_4A 
//theEmitter->emitIns_R_R_R_R(INS_sve_ld2b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index d36e3b1a06116b..d43481152813fa 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1782,10 +1782,19 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isScalableVectorSize(elemsize)); break; - case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) - case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus // scalar) + elemsize = id->idOpSize(); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_Q); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) + case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) @@ -12102,6 +12111,25 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_IP_4A; break; + case INS_sve_ld2q: + case INS_sve_ld3q: + case INS_sve_ld4q: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isGeneralRegister(reg3)); + assert(isGeneralRegister(reg4)); + assert(isScalableVectorSize(size)); + assert(opt == INS_OPTS_SCALABLE_Q); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + + fmt = 
IF_SVE_IR_4A; + + break; + + + break; + + default: unreached(); break; @@ -15923,7 +15951,7 @@ void emitter::emitIns_Call(EmitCallType callType, /***************************************************************************** * - * Returns 0, 1, 2 or 3 depending on the instruction and format. + * Returns 0, 1, 2, 3 or 4 depending on the instruction and format. * This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N], * [{, , LSL #N}] */ @@ -19945,6 +19973,10 @@ void emitter::emitDispSveModAddr(instruction ins, regNumber reg1, regNumber reg2 emitDispComma(); switch (insSveGetLslOrModN(ins, fmt)) { + case 4: + printf("lsl #4"); + break; + case 3: printf("lsl #3"); break; diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index d7cb0f918497eb..7a7ddc64824d16 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -515,7 +515,7 @@ static bool insSveIsLslN(instruction ins, insFormat fmt); // This is for formats that have [, .T, ], [, .T, #N] static bool insSveIsModN(instruction ins, insFormat fmt); -// Returns 0, 1, 2 or 3 depending on the instruction and format. +// Returns 0, 1, 2, 3 or 4 depending on the instruction and format. 
// This is for formats that have [, .T, ], [, .T, #N], [, , LSL #N], // [{, , LSL #N}] static int insSveGetLslOrModN(instruction ins, insFormat fmt); From acd50306ec42cc23ceda89c45ccce4f04c9857d6 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 17:39:04 -0800 Subject: [PATCH 09/18] Added SVE_IT_4A format --- src/coreclr/jit/codegenarm64test.cpp | 70 +++++++++++++++------------- src/coreclr/jit/emitarm64.cpp | 58 +++++++++++++++++++++-- 2 files changed, 93 insertions(+), 35 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index d95343479eded1..8591b5eb2b9a2c 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6517,38 +6517,44 @@ void CodeGen::genArm64EmitterUnitTestsSve() INS_SCALABLE_OPTS_LSL_N); // LD4Q {.Q, .Q, .Q, .Q }, /Z, // [, , LSL #4] - //// IF_SVE_IT_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ld2b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD2B {.B, .B }, /Z, [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld2d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD2D {.D, .D }, /Z, [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld2h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD2H {.H, .H }, /Z, [, , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld2w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD2W {.S, .S }, /Z, [, , LSL #2] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld3b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD3B {.B, .B, .B }, /Z, [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld3d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD3D {.D, .D, .D }, /Z, [, , - // // LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld3h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD3H {.H, .H, .H }, /Z, [, , - // // LSL #1] - 
//theEmitter->emitIns_R_R_R_R(INS_sve_ld3w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD3W {.S, .S, .S }, /Z, [, , - // // LSL #2] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld4b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD4B {.B, .B, .B, .B }, /Z, - // // [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld4d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD4D {.D, .D, .D, .D }, /Z, - // // [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld4h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD4H {.H, .H, .H, .H }, /Z, - // // [, , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld4w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD4W {.S, .S, .S, .S }, /Z, - // // [, , LSL #2] + // IF_SVE_IT_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ld2b, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, + INS_OPTS_SCALABLE_B); // LD2B {.B, .B }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld2d, EA_SCALABLE, REG_V7, REG_P6, REG_R5, REG_R4, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD2D {.D, .D }, /Z, [, , LSL + // #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ld2h, EA_SCALABLE, REG_V8, REG_P5, REG_R9, REG_R10, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // LD2H {.H, .H }, /Z, [, , LSL + // #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld2w, EA_SCALABLE, REG_V6, REG_P5, REG_R4, REG_R7, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD2W {.S, .S }, /Z, [, , LSL + // #2] + theEmitter->emitIns_R_R_R_R(INS_sve_ld3b, EA_SCALABLE, REG_V1, REG_P0, REG_R3, REG_R2, + INS_OPTS_SCALABLE_B); // LD3B {.B, .B, .B }, /Z, [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld3d, EA_SCALABLE, REG_V4, REG_P3, REG_R8, REG_R1, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD3D {.D, .D, .D }, /Z, [, + // , + // LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ld3h, EA_SCALABLE, REG_V30, REG_P2, REG_R9, REG_R4, INS_OPTS_SCALABLE_H, + 
INS_SCALABLE_OPTS_LSL_N); // LD3H {.H, .H, .H }, /Z, [, + // , + // LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld3w, EA_SCALABLE, REG_V1, REG_P3, REG_R2, REG_R4, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD3W {.S, .S, .S }, /Z, [, + // , + // LSL #2] + theEmitter->emitIns_R_R_R_R(INS_sve_ld4b, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, + INS_OPTS_SCALABLE_B); // LD4B {.B, .B, .B, .B }, /Z, + // [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_ld4d, EA_SCALABLE, REG_V0, REG_P3, REG_R2, REG_R1, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD4D {.D, .D, .D, .D }, /Z, + // [, , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ld4h, EA_SCALABLE, REG_V13, REG_P6, REG_R5, REG_R4, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // LD4H {.H, .H, .H, .H }, /Z, + // [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_ld4w, EA_SCALABLE, REG_V10, REG_P3, REG_R2, REG_R5, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // LD4W {.S, .S, .S, .S }, /Z, + // [, , LSL #2] //// IF_SVE_IU_4B //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index d43481152813fa..c49f65c264d835 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -12121,14 +12121,66 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isScalableVectorSize(size)); assert(opt == INS_OPTS_SCALABLE_Q); assert(sopt == INS_SCALABLE_OPTS_LSL_N); - fmt = IF_SVE_IR_4A; - break; + case INS_sve_ld2b: + case INS_sve_ld3b: + case INS_sve_ld4b: + case INS_sve_ld2h: + case INS_sve_ld3h: + case INS_sve_ld4h: + case INS_sve_ld2w: + case INS_sve_ld3w: + case INS_sve_ld4w: + case INS_sve_ld2d: + case INS_sve_ld3d: + case INS_sve_ld4d: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isGeneralRegister(reg3)); + assert(isGeneralRegister(reg4)); + assert(isScalableVectorSize(size)); - break; +#ifdef DEBUG + switch (ins) + { + case INS_sve_ld2b: + case 
INS_sve_ld3b: + case INS_sve_ld4b: + assert(opt == INS_OPTS_SCALABLE_B); + assert(insScalableOptsNone(sopt)); + break; + case INS_sve_ld2h: + case INS_sve_ld3h: + case INS_sve_ld4h: + assert(opt == INS_OPTS_SCALABLE_H); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_ld2w: + case INS_sve_ld3w: + case INS_sve_ld4w: + assert(opt == INS_OPTS_SCALABLE_S); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_ld2d: + case INS_sve_ld3d: + case INS_sve_ld4d: + assert(opt == INS_OPTS_SCALABLE_D); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + default: + assert(!"Invalid instruction"); + break; + } +#endif // DEBUG + + fmt = IF_SVE_IT_4A; + break; default: unreached(); From 5f2c7099381e3caebaa18479fb7254c92c06bd80 Mon Sep 17 00:00:00 2001 From: TIHan Date: Tue, 30 Jan 2024 18:10:00 -0800 Subject: [PATCH 10/18] Added SVE_IU_4B to SVE_IU_4B_D formats. Some minor cleanup. --- src/coreclr/jit/codegenarm64test.cpp | 42 +++++------ src/coreclr/jit/emitarm64.cpp | 106 +++++++++++++++++++-------- 2 files changed, 97 insertions(+), 51 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 8591b5eb2b9a2c..77504248346c1d 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6556,27 +6556,27 @@ void CodeGen::genArm64EmitterUnitTestsSve() INS_SCALABLE_OPTS_LSL_N); // LD4W {.S, .S, .S, .S }, /Z, // [, , LSL #2] - //// IF_SVE_IU_4B - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1D {.D }, /Z, [, .D, LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1SW {.D }, /Z, [, .D, LSL #2] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1D {.D }, /Z, [, .D, LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, 
REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1SW {.D }, /Z, [, .D, LSL #2] - - //// IF_SVE_IU_4B_B - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1SW {.D }, /Z, [, .D] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1D {.D }, /Z, [, .D] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDFF1SW {.D }, /Z, [, .D] - - //// IF_SVE_IU_4B_D - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1D {.D }, /Z, [, .D] + // IF_SVE_IU_4B + theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P4, REG_R3, REG_V2, + INS_OPTS_SCALABLE_D); // LD1D {.D }, /Z, [, .D, LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V4, REG_P3, REG_R2, REG_V1, + INS_OPTS_SCALABLE_D); // LD1SW {.D }, /Z, [, .D, LSL #2] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V5, REG_P6, REG_R7, REG_V8, + INS_OPTS_SCALABLE_D); // LDFF1D {.D }, /Z, [, .D, LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V3, REG_P0, REG_R10, REG_V9, + INS_OPTS_SCALABLE_D); // LDFF1SW {.D }, /Z, [, .D, LSL #2] + + // IF_SVE_IU_4B_B + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_V0, + INS_OPTS_SCALABLE_D); // LD1SW {.D }, /Z, [, .D] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V2, REG_P6, REG_R5, REG_V4, + INS_OPTS_SCALABLE_D); // LDFF1D {.D }, /Z, [, .D] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V3, REG_P4, REG_R6, REG_V5, + INS_OPTS_SCALABLE_D); // LDFF1SW {.D }, /Z, [, .D] + + // IF_SVE_IU_4B_D + theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P1, REG_R3, REG_V4, + INS_OPTS_SCALABLE_D); // LD1D {.D }, /Z, [, .D] //// IF_SVE_IW_4A 
//theEmitter->emitIns_R_R_R_R(INS_sve_ld1q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index c49f65c264d835..282a40cb1e9c83 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1793,15 +1793,24 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isScalableVectorSize(elemsize)); break; - case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) - case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) - case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked - // scaled offsets) + case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) + elemsize = id->idOpSize(); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_D); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isVectorRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) + case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) + case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) case IF_SVE_IW_4A: // ...........mmmmm 
...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus // scalar) @@ -11972,9 +11981,8 @@ void emitter::emitIns_R_R_R_R(instruction ins, } } } - else + else if (insOptsScalableDoubleWord32bitExtends(opt)) { - assert(insOptsScalableDoubleWord32bitExtends(opt)); assert(isVectorRegister(reg4)); if (sopt == INS_SCALABLE_OPTS_MOD_N) @@ -11984,6 +11992,7 @@ void emitter::emitIns_R_R_R_R(instruction ins, else { assert(insScalableOptsNone(sopt)); + if (ins == INS_sve_ld1d) { fmt = IF_SVE_IU_4A_C; @@ -11994,6 +12003,27 @@ void emitter::emitIns_R_R_R_R(instruction ins, } } } + else if (sopt == INS_SCALABLE_OPTS_LSL_N) + { + assert(isVectorRegister(reg4)); + assert(opt == INS_OPTS_SCALABLE_D); + fmt = IF_SVE_IU_4B; + } + else + { + assert(isVectorRegister(reg4)); + assert(opt == INS_OPTS_SCALABLE_D); + assert(insScalableOptsNone(sopt)); + + if (ins == INS_sve_ld1d) + { + fmt = IF_SVE_IU_4B_D; + } + else + { + fmt = IF_SVE_IU_4B_B; + } + } break; case INS_sve_ldnt1b: @@ -19435,6 +19465,32 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; + case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + code = emitInsCodeSve(ins, fmt); + code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt + code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg + code |= insEncodeReg_R_9_to_5(id->idReg3()); // nnnnn + code |= 
insEncodeReg_R_20_to_16(id->idReg4()); // mmmmm + + if (canEncodeSveElemsize_dtype(ins)) + { + if (ins == INS_sve_ld1w) + { + code = insEncodeSveElemsize_dtype_ld1w(ins, fmt, optGetSveElemsize(id->idInsOpt()), code); + } + else + { + code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code); + } + } + + dst += emitOutput_Instr(dst, code); + break; + case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) @@ -19446,23 +19502,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // scalar) case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) - case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) - case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) - case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) - case IF_SVE_IK_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) - case IF_SVE_IK_4A_I: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus // scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple 
structures (scalar plus scalar) - case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked - // scaled offsets) - case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked - // scaled offsets) - case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked - // scaled offsets) case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus // scalar) @@ -19493,19 +19538,20 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg code |= insEncodeReg_R_9_to_5(id->idReg3()); // nnnnn code |= insEncodeReg_R_20_to_16(id->idReg4()); // mmmmm + dst += emitOutput_Instr(dst, code); + break; - if (canEncodeSveElemsize_dtype(ins)) - { - if (ins == INS_sve_ld1w) - { - code = insEncodeSveElemsize_dtype_ld1w(ins, fmt, optGetSveElemsize(id->idInsOpt()), code); - } - else - { - code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code); - } - } - + case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) + code = emitInsCodeSve(ins, fmt); + code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt + code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg + code |= insEncodeReg_R_9_to_5(id->idReg3()); // nnnnn + code |= insEncodeReg_V_20_to_16(id->idReg4()); // mmmmm dst += emitOutput_Instr(dst, code); break; From 
a819590119c0f39d5a0ba572556ca6f937f5bb3e Mon Sep 17 00:00:00 2001 From: TIHan Date: Wed, 31 Jan 2024 12:28:22 -0800 Subject: [PATCH 11/18] Added SVE_IW_4A format. Fixed an issue with SVE_IU_4B test not including LSL. --- src/coreclr/jit/codegenarm64test.cpp | 25 ++++++++++--------- src/coreclr/jit/emitarm64.cpp | 37 +++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 77504248346c1d..806596791ac1b3 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6557,14 +6557,14 @@ void CodeGen::genArm64EmitterUnitTestsSve() // [, , LSL #2] // IF_SVE_IU_4B - theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P4, REG_R3, REG_V2, - INS_OPTS_SCALABLE_D); // LD1D {.D }, /Z, [, .D, LSL #3] - theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V4, REG_P3, REG_R2, REG_V1, - INS_OPTS_SCALABLE_D); // LD1SW {.D }, /Z, [, .D, LSL #2] - theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V5, REG_P6, REG_R7, REG_V8, - INS_OPTS_SCALABLE_D); // LDFF1D {.D }, /Z, [, .D, LSL #3] - theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V3, REG_P0, REG_R10, REG_V9, - INS_OPTS_SCALABLE_D); // LDFF1SW {.D }, /Z, [, .D, LSL #2] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P4, REG_R3, REG_V2, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1D {.D }, /Z, [, .D, LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V4, REG_P3, REG_R2, REG_V1, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LD1SW {.D }, /Z, [, .D, LSL #2] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V5, REG_P6, REG_R7, REG_V8, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1D {.D }, /Z, [, .D, LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V3, REG_P0, REG_R10, REG_V9, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1SW {.D }, 
/Z, [, .D, LSL #2] // IF_SVE_IU_4B_B theEmitter->emitIns_R_R_R_R(INS_sve_ld1sw, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_V0, @@ -6578,9 +6578,12 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ld1d, EA_SCALABLE, REG_V0, REG_P1, REG_R3, REG_V4, INS_OPTS_SCALABLE_D); // LD1D {.D }, /Z, [, .D] - //// IF_SVE_IW_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ld1q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LD1Q {.Q }, /Z, [.D{, }] + // IF_SVE_IW_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ld1q, EA_SCALABLE, REG_V0, REG_P1, REG_V2, REG_R3, + INS_OPTS_SCALABLE_Q); // LD1Q {.Q }, /Z, [.D{, }] + // REG_ZR can be used due to the optional {, } of the format. + theEmitter->emitIns_R_R_R_R(INS_sve_ld1q, EA_SCALABLE, REG_V0, REG_P1, REG_V2, REG_ZR, + INS_OPTS_SCALABLE_Q); // LD1Q {.Q }, /Z, [.D{, }] //// IF_SVE_IX_4A //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 282a40cb1e9c83..970d9f38e6f284 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1808,10 +1808,19 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isScalableVectorSize(elemsize)); break; + case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + elemsize = id->idOpSize(); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_Q); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isVectorRegister(id->idReg3())); // nnnnn + assert(isGeneralRegisterOrZR(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IT_4A: // ...........mmmmm 
...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus // scalar) case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) @@ -12141,6 +12150,17 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_IP_4A; break; + case INS_sve_ld1q: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(isGeneralRegisterOrZR(reg4)); + assert(isScalableVectorSize(size)); + assert(opt == INS_OPTS_SCALABLE_Q); + assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_IW_4A; + break; + case INS_sve_ld2q: case INS_sve_ld3q: case INS_sve_ld4q: @@ -19457,6 +19477,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // scalar) case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus // scalar) + case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -19508,7 +19529,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus // scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus // scalar) case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector 
plus scalar) @@ -20044,7 +20064,18 @@ void emitter::emitDispSveModAddr(instruction ins, regNumber reg1, regNumber reg2 if (isVectorRegister(reg1)) { - emitDispSveReg(reg1, opt, reg2 != REG_ZR); + // If the overall instruction is working on 128-bit + // registers, the size of this register for + // the mod addr is always 64-bit. + // Example: LD1Q {.Q }, /Z, [.D{, }] + if (opt == INS_OPTS_SCALABLE_Q) + { + emitDispSveReg(reg1, INS_OPTS_SCALABLE_D, reg2 != REG_ZR); + } + else + { + emitDispSveReg(reg1, opt, reg2 != REG_ZR); + } } else { From dc3876217a1b85baf06ea577381bb173c0a864a0 Mon Sep 17 00:00:00 2001 From: TIHan Date: Wed, 31 Jan 2024 12:46:26 -0800 Subject: [PATCH 12/18] Added SVE_IX_4A format --- src/coreclr/jit/codegenarm64test.cpp | 13 ++++++++----- src/coreclr/jit/emitarm64.cpp | 28 +++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 806596791ac1b3..e269919de00ad4 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6585,11 +6585,14 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ld1q, EA_SCALABLE, REG_V0, REG_P1, REG_V2, REG_ZR, INS_OPTS_SCALABLE_Q); // LD1Q {.Q }, /Z, [.D{, }] - //// IF_SVE_IX_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDNT1D {.D }, /Z, [.D{, }] - //theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sw, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // LDNT1SW {.D }, /Z, [.D{, }] + // IF_SVE_IX_4A + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1d, EA_SCALABLE, REG_V4, REG_P2, REG_V1, REG_R3, + INS_OPTS_SCALABLE_D); // LDNT1D {.D }, /Z, [.D{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sw, EA_SCALABLE, REG_V7, REG_P1, REG_V0, REG_R1, + INS_OPTS_SCALABLE_D); // LDNT1SW {.D }, /Z, [.D{, }] + // REG_ZR can be used due to the optional {, } of the 
format. + theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sw, EA_SCALABLE, REG_V7, REG_P1, REG_V0, REG_ZR, + INS_OPTS_SCALABLE_D); // LDNT1SW {.D }, /Z, [.D{, }] //// IF_SVE_IY_4A //theEmitter->emitIns_R_R_R_R(INS_sve_st1q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 970d9f38e6f284..a043c11f6d4aa2 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1818,11 +1818,20 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isScalableVectorSize(elemsize)); break; + case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus + // scalar) + elemsize = id->idOpSize(); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_D); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isVectorRegister(id->idReg3())); // nnnnn + assert(isGeneralRegisterOrZR(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus - // scalar) case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) @@ -12041,6 +12050,7 @@ void emitter::emitIns_R_R_R_R(instruction ins, case INS_sve_ldnt1d: case INS_sve_ldnt1sb: case INS_sve_ldnt1sh: + case INS_sve_ldnt1sw: assert(isVectorRegister(reg1)); assert(isPredicateRegister(reg2)); assert(isScalableVectorSize(size)); @@ -12080,9 
+12090,17 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_IN_4A; } + else if ((ins == INS_sve_ldnt1d) || (ins == INS_sve_ldnt1sw)) + { + assert(insOptsScalableWords(opt)); + assert(isVectorRegister(reg3)); + assert(isGeneralRegisterOrZR(reg4)); + assert(insScalableOptsNone(sopt)); + assert(opt == INS_OPTS_SCALABLE_D); + fmt = IF_SVE_IX_4A; + } else { - assert(ins != INS_sve_ldnt1d); assert(insOptsScalableWords(opt)); assert(isVectorRegister(reg3)); assert(isGeneralRegisterOrZR(reg4)); @@ -19478,6 +19496,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus // scalar) case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus + // scalar) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -19529,8 +19549,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus // scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus - // scalar) case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) From eb50e702b3bd45e63e184c5ed7cf7a8962c9cac7 Mon Sep 17 00:00:00 2001 From: TIHan Date: Wed, 31 Jan 2024 18:44:30 -0800 Subject: [PATCH 13/18] Added remaining formats --- src/coreclr/jit/codegenarm64test.cpp | 236 
++++++++++-------- src/coreclr/jit/emitarm64.cpp | 355 +++++++++++++++++++++++---- src/coreclr/jit/emitarm64.h | 6 + 3 files changed, 444 insertions(+), 153 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index e269919de00ad4..e11ebb17a7b4d2 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6594,114 +6594,134 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_ldnt1sw, EA_SCALABLE, REG_V7, REG_P1, REG_V0, REG_ZR, INS_OPTS_SCALABLE_D); // LDNT1SW {.D }, /Z, [.D{, }] - //// IF_SVE_IY_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_st1q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1Q {.Q }, , [.D{, }] - - //// IF_SVE_IZ_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1B {.S }, , [.S{, }] - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1H {.S }, , [.S{, }] - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1W {.S }, , [.S{, }] - - //// IF_SVE_IZ_4A_A - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1B {.D }, , [.D{, }] - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1H {.D }, , [.D{, }] - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1W {.D }, , [.D{, }] - - //// IF_SVE_JA_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1D {.D }, , [.D{, }] - - //// IF_SVE_JB_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // 
INS_OPTS_SCALABLE_B); // STNT1B {.B }, , [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1D {.D }, , [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1H {.H }, , [, , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // STNT1W {.S }, , [, , LSL #2] - - //// IF_SVE_JC_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_st2b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST2B {.B, .B }, , [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_st2d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST2D {.D, .D }, , [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_st2h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST2H {.H, .H }, , [, , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_st2w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST2W {.S, .S }, , [, , LSL #2] - //theEmitter->emitIns_R_R_R_R(INS_sve_st3b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST3B {.B, .B, .B }, , [, ] - //theEmitter->emitIns_R_R_R_R(INS_sve_st3d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST3D {.D, .D, .D }, , [, , LSL - // // #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_st3h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST3H {.H, .H, .H }, , [, , LSL - // // #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_st3w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST3W {.S, .S, .S }, , [, , LSL - // // #2] - //theEmitter->emitIns_R_R_R_R(INS_sve_st4b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST4B {.B, .B, .B, .B }, , [, - // // ] - //theEmitter->emitIns_R_R_R_R(INS_sve_st4d, EA_SCALABLE, REG_V0, REG_P0, 
REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST4D {.D, .D, .D, .D }, , [, - // // , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_st4h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST4H {.H, .H, .H, .H }, , [, - // // , LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_st4w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST4W {.S, .S, .S, .S }, , [, - // // , LSL #2] - - //// IF_SVE_JD_4C - //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1D {.D }, , [, , LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1W {.Q }, , [, , LSL #2] - - //// IF_SVE_JD_4C_A - //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1D {.Q }, , [, , LSL #3] - - //// IF_SVE_JF_4A - //theEmitter->emitIns_R_R_R_R(INS_sve_st2q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST2Q {.Q, .Q }, , [, , LSL #4] - //theEmitter->emitIns_R_R_R_R(INS_sve_st3q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST3Q {.Q, .Q, .Q }, , [, , LSL - // // #4] - //theEmitter->emitIns_R_R_R_R(INS_sve_st4q, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST4Q {.Q, .Q, .Q, .Q }, , [, - // // , LSL #4] - - //// IF_SVE_JJ_4B - //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1D {.D }, , [, .D, LSL #3] - //theEmitter->emitIns_R_R_R_R(INS_sve_st1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1H {.D }, , [, .D, LSL #1] - //theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1W {.D }, , [, .D, LSL #2] - - //// IF_SVE_JJ_4B_C - //theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P0, 
REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1D {.D }, , [, .D] - - //// IF_SVE_JJ_4B_E - //theEmitter->emitIns_R_R_R_R(INS_sve_st1h, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1H {.D }, , [, .D] - //theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1W {.D }, , [, .D] - - //// IF_SVE_JK_4B - //theEmitter->emitIns_R_R_R_R(INS_sve_st1b, EA_SCALABLE, REG_V0, REG_P0, REG_V0, REG_R0, - // INS_OPTS_SCALABLE_B); // ST1B {.D }, , [, .D] + // IF_SVE_IY_4A + theEmitter->emitIns_R_R_R_R(INS_sve_st1q, EA_SCALABLE, REG_V1, REG_P2, REG_V3, REG_R4, + INS_OPTS_SCALABLE_Q); // ST1Q {.Q }, , [.D{, }] + // REG_ZR can be used due to the optional {, } of the format. + theEmitter->emitIns_R_R_R_R(INS_sve_st1q, EA_SCALABLE, REG_V1, REG_P2, REG_V3, REG_ZR, + INS_OPTS_SCALABLE_Q); // ST1Q {.Q }, , [.D{, }] + + // IF_SVE_IZ_4A + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P2, REG_V3, REG_R4, + INS_OPTS_SCALABLE_S); // STNT1B {.S }, , [.S{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V2, REG_P7, REG_V6, REG_R5, + INS_OPTS_SCALABLE_S); // STNT1H {.S }, , [.S{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V1, REG_P3, REG_V2, REG_R0, + INS_OPTS_SCALABLE_S); // STNT1W {.S }, , [.S{, }] + // REG_ZR can be used due to the optional {, } of the format. 
+ theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V1, REG_P3, REG_V2, REG_ZR, + INS_OPTS_SCALABLE_S); // STNT1W {.S }, , [.S{, }] + + // IF_SVE_IZ_4A_A + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P4, REG_V6, REG_R8, + INS_OPTS_SCALABLE_D); // STNT1B {.D }, , [.D{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V5, REG_P3, REG_V1, REG_R2, + INS_OPTS_SCALABLE_D); // STNT1H {.D }, , [.D{, }] + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V3, REG_P1, REG_V2, REG_R0, + INS_OPTS_SCALABLE_D); // STNT1W {.D }, , [.D{, }] + // REG_ZR can be used due to the optional {, } of the format. + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V0, REG_P4, REG_V6, REG_ZR, + INS_OPTS_SCALABLE_D); // STNT1B {.D }, , [.D{, }] + + // IF_SVE_JA_4A + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1d, EA_SCALABLE, REG_V1, REG_P3, REG_V4, REG_R5, + INS_OPTS_SCALABLE_D); // STNT1D {.D }, , [.D{, }] + // REG_ZR can be used due to the optional {, } of the format. 
+ theEmitter->emitIns_R_R_R_R(INS_sve_stnt1d, EA_SCALABLE, REG_V0, REG_P4, REG_V5, REG_ZR, + INS_OPTS_SCALABLE_D); // STNT1D {.D }, , [.D{, }] + + // IF_SVE_JB_4A + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1b, EA_SCALABLE, REG_V6, REG_P5, REG_R4, REG_R3, + INS_OPTS_SCALABLE_B); // STNT1B {.B }, , [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1d, EA_SCALABLE, REG_V7, REG_P6, REG_R5, REG_R4, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // STNT1D {.D }, , [, , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1h, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // STNT1H {.H }, , [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_stnt1w, EA_SCALABLE, REG_V0, REG_P5, REG_R6, REG_R7, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // STNT1W {.S }, , [, , LSL #2] + + // IF_SVE_JC_4A + theEmitter->emitIns_R_R_R_R(INS_sve_st2b, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R4, + INS_OPTS_SCALABLE_B); // ST2B {.B, .B }, , [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_st2d, EA_SCALABLE, REG_V1, REG_P7, REG_R6, REG_R5, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // ST2D {.D, .D }, , [, , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_st2h, EA_SCALABLE, REG_V2, REG_P3, REG_R5, REG_R6, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // ST2H {.H, .H }, , [, , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_st2w, EA_SCALABLE, REG_V0, REG_P2, REG_R8, REG_R7, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // ST2W {.S, .S }, , [, , LSL #2] + theEmitter->emitIns_R_R_R_R(INS_sve_st3b, EA_SCALABLE, REG_V0, REG_P1, REG_R3, REG_R4, + INS_OPTS_SCALABLE_B); // ST3B {.B, .B, .B }, , [, ] + theEmitter->emitIns_R_R_R_R(INS_sve_st3d, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_R6, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // ST3D {.D, .D, .D }, , [, , + // LSL + // #3] + theEmitter->emitIns_R_R_R_R(INS_sve_st3h, EA_SCALABLE, REG_V1, REG_P0, REG_R3, REG_R8, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // ST3H {.H, .H, .H }, , [, , + // LSL 
+ // #1] + theEmitter->emitIns_R_R_R_R(INS_sve_st3w, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // ST3W {.S, .S, .S }, , [, , + // LSL + // #2] + theEmitter->emitIns_R_R_R_R(INS_sve_st4b, EA_SCALABLE, REG_V0, REG_P6, REG_R5, REG_R4, + INS_OPTS_SCALABLE_B); // ST4B {.B, .B, .B, .B }, , [, + // ] + theEmitter->emitIns_R_R_R_R(INS_sve_st4d, EA_SCALABLE, REG_V5, REG_P2, REG_R1, REG_R0, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // ST4D {.D, .D, .D, .D }, , + // [, + // , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_st4h, EA_SCALABLE, REG_V1, REG_P0, REG_R9, REG_R8, INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_LSL_N); // ST4H {.H, .H, .H, .H }, , + // [, + // , LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_st4w, EA_SCALABLE, REG_V0, REG_P1, REG_R4, REG_R5, INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_LSL_N); // ST4W {.S, .S, .S, .S }, , + // [, + // , LSL #2] + + // IF_SVE_JD_4C + theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V1, REG_P4, REG_R5, REG_R6, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // ST1D {.D }, , [, , LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V2, REG_P1, REG_R8, REG_R7, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // ST1W {.Q }, , [, , LSL #2] + + // IF_SVE_JD_4C_A + theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V3, REG_P5, REG_R6, REG_R1, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // ST1D {.Q }, , [, , LSL #3] + + // IF_SVE_JF_4A + theEmitter->emitIns_R_R_R_R(INS_sve_st2q, EA_SCALABLE, REG_V0, REG_P2, REG_R3, REG_R5, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // ST2Q {.Q, .Q }, , [, , LSL #4] + theEmitter->emitIns_R_R_R_R(INS_sve_st3q, EA_SCALABLE, REG_V1, REG_P4, REG_R2, REG_R8, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // ST3Q {.Q, .Q, .Q }, , [, , + // LSL + // #4] + theEmitter->emitIns_R_R_R_R(INS_sve_st4q, EA_SCALABLE, REG_V4, REG_P1, REG_R8, REG_R2, INS_OPTS_SCALABLE_Q, + INS_SCALABLE_OPTS_LSL_N); // ST4Q {.Q, .Q, .Q, 
.Q }, , + // [, + // , LSL #4] + + // IF_SVE_JJ_4B + theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P3, REG_R2, REG_V1, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // ST1D {.D }, , [, .D, LSL #3] + theEmitter->emitIns_R_R_R_R(INS_sve_st1h, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_V4, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // ST1H {.D }, , [, .D, LSL #1] + theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_V5, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // ST1W {.D }, , [, .D, LSL #2] + + // IF_SVE_JJ_4B_C + theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V3, REG_P4, REG_R5, REG_V6, + INS_OPTS_SCALABLE_D); // ST1D {.D }, , [, .D] + + // IF_SVE_JJ_4B_E + theEmitter->emitIns_R_R_R_R(INS_sve_st1h, EA_SCALABLE, REG_V1, REG_P4, REG_R3, REG_V2, + INS_OPTS_SCALABLE_D); // ST1H {.D }, , [, .D] + theEmitter->emitIns_R_R_R_R(INS_sve_st1w, EA_SCALABLE, REG_V3, REG_P5, REG_R1, REG_V0, + INS_OPTS_SCALABLE_D); // ST1W {.D }, , [, .D] + + // IF_SVE_JK_4B + theEmitter->emitIns_R_R_R_R(INS_sve_st1b, EA_SCALABLE, REG_V6, REG_P3, REG_R0, REG_V4, + INS_OPTS_SCALABLE_D); // ST1B {.D }, , [, .D] } #endif // defined(TARGET_ARM64) && defined(DEBUG) diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index a043c11f6d4aa2..ad98acf00c526b 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1809,6 +1809,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) break; case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) elemsize = id->idOpSize(); assert(id->idInsOpt() == INS_OPTS_SCALABLE_Q); assert(isVectorRegister(id->idReg1())); // ttttt @@ -1829,25 +1830,60 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isScalableVectorSize(elemsize)); break; - case IF_SVE_IN_4A: // ...........mmmmm 
...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) - case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) - case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus // scalar) - case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus - // scalar) - case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + elemsize = id->idOpSize(); + assert(insOptsScalableWords(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isVectorRegister(id->idReg3())); // nnnnn + assert(isGeneralRegisterOrZR(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + elemsize = id->idOpSize(); + assert(insOptsScalableDoubleWordsOrQuadword(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures 
(quadwords, scalar plus // scalar) - case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled - // offsets) + elemsize = id->idOpSize(); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_Q); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + + case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) + case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) + case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) + case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + elemsize = id->idOpSize(); + assert(insOptsScalableStandard(id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); // ttttt + assert(isPredicateRegister(id->idReg2())); // ggg + assert(isGeneralRegister(id->idReg3())); // nnnnn + assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + + case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled // offsets) case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled @@ -1855,11 +1891,11 @@ void emitter::emitInsSanityCheck(instrDesc* id) case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled // offsets) elemsize = 
id->idOpSize(); - assert(insOptsScalableStandard(id->idInsOpt())); + assert(id->idInsOpt() == INS_OPTS_SCALABLE_D); assert(isVectorRegister(id->idReg1())); // ttttt assert(isPredicateRegister(id->idReg2())); // ggg assert(isGeneralRegister(id->idReg3())); // nnnnn - assert(isGeneralRegister(id->idReg4())); // mmmmm + assert(isVectorRegister(id->idReg4())); // mmmmm assert(isScalableVectorSize(elemsize)); break; @@ -11648,10 +11684,18 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isGeneralRegister(reg3)); assert(isScalableVectorSize(size)); assert(insScalableOptsNone(sopt)); + if (insOptsScalableStandard(opt)) { - assert(isGeneralRegister(reg4)); - fmt = IF_SVE_JD_4A; + if (isGeneralRegister(reg4)) + { + fmt = IF_SVE_JD_4A; + } + else + { + assert(isVectorRegister(reg4)); + fmt = IF_SVE_JK_4B; + } } else { @@ -11680,13 +11724,30 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isPredicateRegister(reg2)); assert(isGeneralRegister(reg3)); assert(isScalableVectorSize(size)); + if (insOptsScalableStandard(opt)) { - // st1h is reserved for scalable B - assert((ins == INS_sve_st1h) ? insOptsScalableAtLeastHalf(opt) : true); - assert(isGeneralRegister(reg4)); - assert(sopt == INS_SCALABLE_OPTS_LSL_N); - fmt = IF_SVE_JD_4A; + if (sopt == INS_SCALABLE_OPTS_LSL_N) + { + if (isGeneralRegister(reg4)) + { + // st1h is reserved for scalable B + assert((ins == INS_sve_st1h) ? 
insOptsScalableAtLeastHalf(opt) : true); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + fmt = IF_SVE_JD_4A; + } + else + { + assert(isVectorRegister(reg4)); + fmt = IF_SVE_JJ_4B; + } + } + else + { + assert(isVectorRegister(reg4)); + assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_JJ_4B_E; + } } else { @@ -11731,15 +11792,38 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isPredicateRegister(reg2)); assert(isGeneralRegister(reg3)); assert(isScalableVectorSize(size)); + if (insOptsScalableStandard(opt)) + { + if (sopt == INS_SCALABLE_OPTS_LSL_N) + { + if (isGeneralRegister(reg4)) + { + fmt = IF_SVE_JD_4B; + } + else + { + assert(isVectorRegister(reg4)); + fmt = IF_SVE_JJ_4B; + } + } + else + { + assert(isVectorRegister(reg4)); + assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_JJ_4B_E; + } + } + else if (opt == INS_OPTS_SCALABLE_Q) { assert(isGeneralRegister(reg4)); assert(sopt == INS_SCALABLE_OPTS_LSL_N); - fmt = IF_SVE_JD_4B; + fmt = IF_SVE_JD_4C; } else { assert(insOptsScalable32bitExtends(opt)); + assert(isVectorRegister(reg4)); switch (opt) { case INS_OPTS_SCALABLE_S_UXTW: @@ -11776,29 +11860,63 @@ void emitter::emitIns_R_R_R_R(instruction ins, break; case INS_sve_st1d: - assert(insOptsScalable32bitExtends(opt)); assert(isVectorRegister(reg1)); assert(isPredicateRegister(reg2)); assert(isGeneralRegister(reg3)); assert(isScalableVectorSize(size)); - switch (opt) + + if (isGeneralRegister(reg4)) { - case INS_OPTS_SCALABLE_D_UXTW: - case INS_OPTS_SCALABLE_D_SXTW: - if (sopt == INS_SCALABLE_OPTS_MOD_N) + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + if (opt == INS_OPTS_SCALABLE_Q) + { + fmt = IF_SVE_JD_4C_A; + } + else + { + assert(opt == INS_OPTS_SCALABLE_D); + fmt = IF_SVE_JD_4C; + } + } + else + { + assert(isVectorRegister(reg4)); + + if (opt == INS_OPTS_SCALABLE_D) + { + if (sopt == INS_SCALABLE_OPTS_LSL_N) { - fmt = IF_SVE_JJ_4A; + fmt = IF_SVE_JJ_4B; } else { assert(insScalableOptsNone(sopt)); - fmt = IF_SVE_JJ_4A_B; + fmt = IF_SVE_JJ_4B_C; } - break; 
+ } + else + { + assert(insOptsScalable32bitExtends(opt)); + switch (opt) + { + case INS_OPTS_SCALABLE_D_UXTW: + case INS_OPTS_SCALABLE_D_SXTW: + if (sopt == INS_SCALABLE_OPTS_MOD_N) + { + fmt = IF_SVE_JJ_4A; + } + else + { + assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_JJ_4A_B; + } + break; - default: - assert(!"Invalid options for scalable"); - break; + default: + assert(!"Invalid options for scalable"); + break; + } + } } break; @@ -12250,6 +12368,153 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_SVE_IT_4A; break; + case INS_sve_st1q: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(isGeneralRegisterOrZR(reg4)); + assert(isScalableVectorSize(size)); + assert(opt == INS_OPTS_SCALABLE_Q); + assert(insScalableOptsNone(sopt)); + fmt = IF_SVE_IY_4A; + break; + + case INS_sve_stnt1b: + case INS_sve_stnt1h: + case INS_sve_stnt1w: + case INS_sve_stnt1d: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isScalableVectorSize(size)); + + if (isGeneralRegister(reg3)) + { + assert(isGeneralRegister(reg4)); +#ifdef DEBUG + switch (ins) + { + case INS_sve_stnt1b: + assert(opt == INS_OPTS_SCALABLE_B); + assert(insScalableOptsNone(sopt)); + break; + + case INS_sve_stnt1h: + assert(opt == INS_OPTS_SCALABLE_H); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_stnt1w: + assert(opt == INS_OPTS_SCALABLE_S); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_stnt1d: + assert(opt == INS_OPTS_SCALABLE_D); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + default: + assert(!"Invalid instruction"); + break; + } +#endif // DEBUG + fmt = IF_SVE_JB_4A; + } + else + { + assert(isVectorRegister(reg3)); + assert(isGeneralRegisterOrZR(reg4)); + assert(isScalableVectorSize(size)); + assert(insScalableOptsNone(sopt)); + + if (opt == INS_OPTS_SCALABLE_S) + { + fmt = IF_SVE_IZ_4A; + } + else + { + assert(opt == INS_OPTS_SCALABLE_D); + if (ins == 
INS_sve_stnt1d) + { + fmt = IF_SVE_JA_4A; + } + else + { + fmt = IF_SVE_IZ_4A_A; + } + } + } + break; + + case INS_sve_st2b: + case INS_sve_st3b: + case INS_sve_st4b: + case INS_sve_st2h: + case INS_sve_st3h: + case INS_sve_st4h: + case INS_sve_st2w: + case INS_sve_st3w: + case INS_sve_st4w: + case INS_sve_st2d: + case INS_sve_st3d: + case INS_sve_st4d: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isGeneralRegister(reg3)); + assert(isGeneralRegister(reg4)); + assert(isScalableVectorSize(size)); + +#ifdef DEBUG + switch (ins) + { + case INS_sve_st2b: + case INS_sve_st3b: + case INS_sve_st4b: + assert(opt == INS_OPTS_SCALABLE_B); + assert(insScalableOptsNone(sopt)); + break; + + case INS_sve_st2h: + case INS_sve_st3h: + case INS_sve_st4h: + assert(opt == INS_OPTS_SCALABLE_H); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_st2w: + case INS_sve_st3w: + case INS_sve_st4w: + assert(opt == INS_OPTS_SCALABLE_S); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + case INS_sve_st2d: + case INS_sve_st3d: + case INS_sve_st4d: + assert(opt == INS_OPTS_SCALABLE_D); + assert(sopt == INS_SCALABLE_OPTS_LSL_N); + break; + + default: + assert(!"Invalid instruction"); + break; + } +#endif // DEBUG + fmt = IF_SVE_JC_4A; + break; + + case INS_sve_st2q: + case INS_sve_st3q: + case INS_sve_st4q: + assert(isVectorRegister(reg1)); + assert(isPredicateRegister(reg2)); + assert(isGeneralRegister(reg3)); + assert(isGeneralRegister(reg4)); + assert(isScalableVectorSize(size)); + assert(opt == INS_OPTS_SCALABLE_Q); + fmt = IF_SVE_JF_4A; + break; + default: unreached(); break; @@ -19498,6 +19763,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus // scalar) + case IF_SVE_IY_4A: // ...........mmmmm 
...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) + case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) + case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus + // scalar) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -19549,13 +19821,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (quadwords, scalar plus // scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) - case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus - // scalar) - case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus - // scalar) - case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus - // scalar) case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus // scalar) case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) @@ -19563,14 +19828,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus // scalar) - case IF_SVE_JJ_4B: // ...........mmmmm 
...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled - // offsets) - case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled - // offsets) - case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled - // offsets) - case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled - // offsets) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -19585,6 +19842,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // scaled offsets) case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) + case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) + case IF_SVE_JK_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit unscaled + // offsets) code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ttttt code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 7a7ddc64824d16..8fd1b2df2e8502 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -1069,6 +1069,12 @@ inline static bool insOptsScalableWordsOrQuadwords(insOpts opt) return (insOptsScalableWords(opt) || (opt == INS_OPTS_SCALABLE_Q)); } +inline static bool insOptsScalableDoubleWordsOrQuadword(insOpts opt) +{ + // `opt` is a double-word or quad-word. 
+ return ((opt == INS_OPTS_SCALABLE_D) || (opt == INS_OPTS_SCALABLE_Q)); +} + inline static bool insOptsScalableAtLeastHalf(insOpts opt) { // `opt` is any of the half and above scalable types. From 0ff8a934a258e03f1f63d5c5b5cb342d0b40fa78 Mon Sep 17 00:00:00 2001 From: TIHan Date: Wed, 31 Jan 2024 19:04:25 -0800 Subject: [PATCH 14/18] Minor format fixes --- src/coreclr/jit/emitarm64.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index ad98acf00c526b..ed64ec35880f0a 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -19778,6 +19778,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; + case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_E: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) + case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus + // scalar) case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_IK_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IK_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) @@ -19805,14 +19813,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) - case IF_SVE_IG_4A_D: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus - // scalar) - case IF_SVE_IG_4A_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 
contiguous first-fault load (scalar plus - // scalar) - case IF_SVE_IG_4A_F: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus - // scalar) - case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus - // scalar) case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) From 0bac2ff71178a7c1e0026ed06cf23d256dddd6a1 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 1 Feb 2024 10:16:42 -0800 Subject: [PATCH 15/18] Separated emitting SVE instructions for R_R_R_R --- src/coreclr/jit/emitarm64.cpp | 44 +++++++++++++++++++++++++++++++++++ src/coreclr/jit/emitarm64.h | 9 +++++++ 2 files changed, 53 insertions(+) diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index ed64ec35880f0a..5a739038179d79 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11554,6 +11554,7 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isGeneralRegister(reg2)); assert(isGeneralRegister(reg3)); assert(isGeneralRegister(reg4)); + assert(insScalableOptsNone(sopt)); fmt = IF_DR_4A; break; @@ -11567,6 +11568,7 @@ void emitter::emitIns_R_R_R_R(instruction ins, assert(isVectorRegister(reg2)); assert(isVectorRegister(reg3)); assert(isVectorRegister(reg4)); + assert(insScalableOptsNone(sopt)); fmt = IF_DV_4A; break; @@ -11574,6 +11576,48 @@ void emitter::emitIns_R_R_R_R(instruction ins, fmt = IF_NONE; break; + // Fallback handles emitting the SVE instructions. 
+ default: + return emitInsSve_R_R_R_R(ins, attr, reg1, reg2, reg3, reg4, opt, sopt); + } + assert(fmt != IF_NONE); + + instrDesc* id = emitNewInstr(attr); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idInsOpt(opt); + + id->idReg1(reg1); + id->idReg2(reg2); + id->idReg3(reg3); + id->idReg4(reg4); + + dispIns(id); + appendToCurIG(id); +} + +/***************************************************************************** + * + * Add a SVE instruction referencing four registers. + * Do not call this directly. Use 'emitIns_R_R_R_R' instead. + */ + +void emitter::emitInsSve_R_R_R_R(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + regNumber reg3, + regNumber reg4, + insOpts opt /* = INS_OPT_NONE*/, + insScalableOpts sopt /* = INS_SCALABLE_OPTS_NONE */) +{ + emitAttr size = EA_SIZE(attr); + insFormat fmt = IF_NONE; + + /* Figure out the encoding format of the instruction */ + switch (ins) + { case INS_sve_cmpeq: case INS_sve_cmpgt: case INS_sve_cmpge: diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 8fd1b2df2e8502..25f41d3aa026fd 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -1235,6 +1235,15 @@ void emitIns_R_R_R_R(instruction ins, insOpts opt = INS_OPTS_NONE, insScalableOpts sopt = INS_SCALABLE_OPTS_NONE); +void emitInsSve_R_R_R_R(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + regNumber reg3, + regNumber reg4, + insOpts opt = INS_OPTS_NONE, + insScalableOpts sopt = INS_SCALABLE_OPTS_NONE); + void emitIns_R_COND(instruction ins, emitAttr attr, regNumber reg, insCond cond); void emitIns_R_R_COND(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, insCond cond); From fc20ecdfccdeec4e076ba477b8c5e75a9ffb8479 Mon Sep 17 00:00:00 2001 From: TIHan Date: Thu, 1 Feb 2024 19:56:20 -0800 Subject: [PATCH 16/18] Added emitInstrSve --- src/coreclr/jit/emit.h | 1 + src/coreclr/jit/emitarm64.cpp | 395 ++++++++++++++++++---------------- 2 files changed, 210 
insertions(+), 186 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 8ab061f7ab9716..de69c4204cda0a 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2398,6 +2398,7 @@ class emitter void emitAdvanceInstrDesc(instrDesc** id, size_t idSize) const; size_t emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp); size_t emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp); + BYTE* emitInstrSve(instrDesc* id, BYTE* dst); bool emitHasFramePtr; diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 5a739038179d79..b065a80a652daa 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -19156,6 +19156,209 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; + default: + dst = emitInstrSve(id, dst); + break; + } + + // Determine if any registers now hold GC refs, or whether a register that was overwritten held a GC ref. + // We assume here that "id->idGCref()" is not GC_NONE only if the instruction described by "id" writes a + // GC ref to register "id->idReg1()". (It may, apparently, also not be GC_NONE in other cases, such as + // for stores, but we ignore those cases here.) + if (emitInsMayWriteToGCReg(id)) // True if "id->idIns()" writes to a register than can hold GC ref. + { + // We assume that "idReg1" is the primary destination register for all instructions + assert(!emitInsDestIsOp2(ins)); + if (id->idGCref() != GCT_NONE) + { + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + } + else + { + emitGCregDeadUpd(id->idReg1(), dst); + } + + if (emitInsMayWriteMultipleRegs(id)) + { + // INS_ldp etc... 
+ // "idReg2" is the secondary destination register + if (id->idGCrefReg2() != GCT_NONE) + { + emitGCregLiveUpd(id->idGCrefReg2(), id->idReg2(), dst); + } + else + { + emitGCregDeadUpd(id->idReg2(), dst); + } + } + } + +SKIP_GC_UPDATE: + // Now we determine if the instruction has written to a (local variable) stack location, and either written a GC + // ref or overwritten one. + if (emitInsWritesToLclVarStackLoc(id) || emitInsWritesToLclVarStackLocPair(id)) + { + int varNum = id->idAddr()->iiaLclVar.lvaVarNum(); + unsigned ofs = AlignDown(id->idAddr()->iiaLclVar.lvaOffset(), TARGET_POINTER_SIZE); + bool FPbased; + int adr = emitComp->lvaFrameAddress(varNum, &FPbased); + if (id->idGCref() != GCT_NONE) + { + emitGCvarLiveUpd(adr + ofs, varNum, id->idGCref(), dst DEBUG_ARG(varNum)); + } + else + { + // If the type of the local is a gc ref type, update the liveness. + var_types vt; + if (varNum >= 0) + { + // "Regular" (non-spill-temp) local. + vt = var_types(emitComp->lvaTable[varNum].lvType); + } + else + { + TempDsc* tmpDsc = codeGen->regSet.tmpFindNum(varNum); + vt = tmpDsc->tdTempType(); + } + if (vt == TYP_REF || vt == TYP_BYREF) + { + emitGCvarDeadUpd(adr + ofs, dst DEBUG_ARG(varNum)); + } + } + if (emitInsWritesToLclVarStackLocPair(id)) + { + int varNum2 = varNum; + int adr2 = adr; + unsigned ofs2 = ofs; + unsigned ofs2Dist; + + if (id->idIsLclVarPair()) + { + bool FPbased2; + + emitLclVarAddr* lclVarAddr2 = emitGetLclVarPairLclVar2(id); + varNum2 = lclVarAddr2->lvaVarNum(); + ofs2 = lclVarAddr2->lvaOffset(); + + // If there are 2 GC vars in this instrDesc, get the 2nd variable + // that should be tracked. 
+ adr2 = emitComp->lvaFrameAddress(varNum2, &FPbased2); + ofs2Dist = EA_SIZE_IN_BYTES(size); +#ifdef DEBUG + assert(FPbased == FPbased2); + if (FPbased) + { + assert(id->idReg3() == REG_FP); + } + else + { + assert(id->idReg3() == REG_SP); + } + assert(varNum2 != -1); +#endif // DEBUG + } + else + { + ofs2Dist = TARGET_POINTER_SIZE; + ofs2 += ofs2Dist; + } + + ofs2 = AlignDown(ofs2, ofs2Dist); + + if (id->idGCrefReg2() != GCT_NONE) + { +#ifdef DEBUG + if (id->idGCref() != GCT_NONE) + { + // If 1st register was a gc-var, then make sure the offset + // are correctly set for the 2nd register that is holding + // another gc-var. + assert((adr + ofs + ofs2Dist) == (adr2 + ofs2)); + } +#endif + emitGCvarLiveUpd(adr2 + ofs2, varNum2, id->idGCrefReg2(), dst DEBUG_ARG(varNum2)); + } + else + { + // If the type of the local is a gc ref type, update the liveness. + var_types vt; + if (varNum2 >= 0) + { + // "Regular" (non-spill-temp) local. + vt = var_types(emitComp->lvaTable[varNum2].lvType); + } + else + { + TempDsc* tmpDsc = codeGen->regSet.tmpFindNum(varNum2); + vt = tmpDsc->tdTempType(); + } + if (vt == TYP_REF || vt == TYP_BYREF) + { + emitGCvarDeadUpd(adr2 + ofs2, dst DEBUG_ARG(varNum2)); + } + } + } + } + +#ifdef DEBUG + /* Make sure we set the instruction descriptor size correctly */ + + size_t expected = emitSizeOfInsDsc(id); + assert(sz == expected); + + if (emitComp->opts.disAsm || emitComp->verbose) + { + emitDispIns(id, false, dspOffs, true, emitCurCodeOffs(odst), *dp, (dst - *dp), ig); + } + + if (emitComp->compDebugBreak) + { + // For example, set JitBreakEmitOutputInstr=a6 will break when this method is called for + // emitting instruction a6, (i.e. IN00a6 in jitdump). + if ((unsigned)JitConfig.JitBreakEmitOutputInstr() == id->idDebugOnlyInfo()->idNum) + { + assert(!"JitBreakEmitOutputInstr reached"); + } + } + + // Output any delta in GC info. 
+ if (EMIT_GC_VERBOSE || emitComp->opts.disasmWithGC) + { + emitDispGCInfoDelta(); + } +#else + if (emitComp->opts.disAsm) + { + size_t expected = emitSizeOfInsDsc(id); + assert(sz == expected); + emitDispIns(id, false, 0, true, emitCurCodeOffs(odst), *dp, (dst - *dp), ig); + } +#endif + + /* All instructions are expected to generate code */ + + assert(*dp != dst || id->idIsEmptyAlign()); + + *dp = dst; + + return sz; +} + +/***************************************************************************** + * + * Append the machine code corresponding to the given SVE instruction descriptor. + */ +BYTE* emitter::emitInstrSve(instrDesc* id, BYTE* dst) +{ + code_t code = 0; + instruction ins = id->idIns(); + insFormat fmt = id->idInsFmt(); + emitAttr size = id->idOpSize(); + + ssize_t imm; + + switch (fmt) + { // Scalable. case IF_SVE_AA_3A: // ........xx...... ...gggmmmmmddddd -- SVE bitwise logical operations (predicated) case IF_SVE_AB_3A: // ........xx...... ...gggmmmmmddddd -- SVE integer add/subtract vectors (predicated) @@ -19857,7 +20060,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; case IF_SVE_IG_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus scalar) - case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) @@ -19865,10 +20068,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_IR_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures 
(quadwords, scalar plus // scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus - // scalar) - case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) - case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) + case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) case IF_SVE_JF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (quadwords, scalar plus // scalar) @@ -19907,187 +20110,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; } - // Determine if any registers now hold GC refs, or whether a register that was overwritten held a GC ref. - // We assume here that "id->idGCref()" is not GC_NONE only if the instruction described by "id" writes a - // GC ref to register "id->idReg1()". (It may, apparently, also not be GC_NONE in other cases, such as - // for stores, but we ignore those cases here.) - if (emitInsMayWriteToGCReg(id)) // True if "id->idIns()" writes to a register than can hold GC ref. - { - // We assume that "idReg1" is the primary destination register for all instructions - assert(!emitInsDestIsOp2(ins)); - if (id->idGCref() != GCT_NONE) - { - emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); - } - else - { - emitGCregDeadUpd(id->idReg1(), dst); - } - - if (emitInsMayWriteMultipleRegs(id)) - { - // INS_ldp etc... 
- // "idReg2" is the secondary destination register - if (id->idGCrefReg2() != GCT_NONE) - { - emitGCregLiveUpd(id->idGCrefReg2(), id->idReg2(), dst); - } - else - { - emitGCregDeadUpd(id->idReg2(), dst); - } - } - } - -SKIP_GC_UPDATE: - // Now we determine if the instruction has written to a (local variable) stack location, and either written a GC - // ref or overwritten one. - if (emitInsWritesToLclVarStackLoc(id) || emitInsWritesToLclVarStackLocPair(id)) - { - int varNum = id->idAddr()->iiaLclVar.lvaVarNum(); - unsigned ofs = AlignDown(id->idAddr()->iiaLclVar.lvaOffset(), TARGET_POINTER_SIZE); - bool FPbased; - int adr = emitComp->lvaFrameAddress(varNum, &FPbased); - if (id->idGCref() != GCT_NONE) - { - emitGCvarLiveUpd(adr + ofs, varNum, id->idGCref(), dst DEBUG_ARG(varNum)); - } - else - { - // If the type of the local is a gc ref type, update the liveness. - var_types vt; - if (varNum >= 0) - { - // "Regular" (non-spill-temp) local. - vt = var_types(emitComp->lvaTable[varNum].lvType); - } - else - { - TempDsc* tmpDsc = codeGen->regSet.tmpFindNum(varNum); - vt = tmpDsc->tdTempType(); - } - if (vt == TYP_REF || vt == TYP_BYREF) - { - emitGCvarDeadUpd(adr + ofs, dst DEBUG_ARG(varNum)); - } - } - if (emitInsWritesToLclVarStackLocPair(id)) - { - int varNum2 = varNum; - int adr2 = adr; - unsigned ofs2 = ofs; - unsigned ofs2Dist; - - if (id->idIsLclVarPair()) - { - bool FPbased2; - - emitLclVarAddr* lclVarAddr2 = emitGetLclVarPairLclVar2(id); - varNum2 = lclVarAddr2->lvaVarNum(); - ofs2 = lclVarAddr2->lvaOffset(); - - // If there are 2 GC vars in this instrDesc, get the 2nd variable - // that should be tracked. 
- adr2 = emitComp->lvaFrameAddress(varNum2, &FPbased2); - ofs2Dist = EA_SIZE_IN_BYTES(size); -#ifdef DEBUG - assert(FPbased == FPbased2); - if (FPbased) - { - assert(id->idReg3() == REG_FP); - } - else - { - assert(id->idReg3() == REG_SP); - } - assert(varNum2 != -1); -#endif // DEBUG - } - else - { - ofs2Dist = TARGET_POINTER_SIZE; - ofs2 += ofs2Dist; - } - - ofs2 = AlignDown(ofs2, ofs2Dist); - - if (id->idGCrefReg2() != GCT_NONE) - { -#ifdef DEBUG - if (id->idGCref() != GCT_NONE) - { - // If 1st register was a gc-var, then make sure the offset - // are correctly set for the 2nd register that is holding - // another gc-var. - assert((adr + ofs + ofs2Dist) == (adr2 + ofs2)); - } -#endif - emitGCvarLiveUpd(adr2 + ofs2, varNum2, id->idGCrefReg2(), dst DEBUG_ARG(varNum2)); - } - else - { - // If the type of the local is a gc ref type, update the liveness. - var_types vt; - if (varNum2 >= 0) - { - // "Regular" (non-spill-temp) local. - vt = var_types(emitComp->lvaTable[varNum2].lvType); - } - else - { - TempDsc* tmpDsc = codeGen->regSet.tmpFindNum(varNum2); - vt = tmpDsc->tdTempType(); - } - if (vt == TYP_REF || vt == TYP_BYREF) - { - emitGCvarDeadUpd(adr2 + ofs2, dst DEBUG_ARG(varNum2)); - } - } - } - } - -#ifdef DEBUG - /* Make sure we set the instruction descriptor size correctly */ - - size_t expected = emitSizeOfInsDsc(id); - assert(sz == expected); - - if (emitComp->opts.disAsm || emitComp->verbose) - { - emitDispIns(id, false, dspOffs, true, emitCurCodeOffs(odst), *dp, (dst - *dp), ig); - } - - if (emitComp->compDebugBreak) - { - // For example, set JitBreakEmitOutputInstr=a6 will break when this method is called for - // emitting instruction a6, (i.e. IN00a6 in jitdump). - if ((unsigned)JitConfig.JitBreakEmitOutputInstr() == id->idDebugOnlyInfo()->idNum) - { - assert(!"JitBreakEmitOutputInstr reached"); - } - } - - // Output any delta in GC info. 
- if (EMIT_GC_VERBOSE || emitComp->opts.disasmWithGC) - { - emitDispGCInfoDelta(); - } -#else - if (emitComp->opts.disAsm) - { - size_t expected = emitSizeOfInsDsc(id); - assert(sz == expected); - emitDispIns(id, false, 0, true, emitCurCodeOffs(odst), *dp, (dst - *dp), ig); - } -#endif - - /* All instructions are expected to generate code */ - - assert(*dp != dst || id->idIsEmptyAlign()); - - *dp = dst; - - return sz; + return dst; } /*****************************************************************************/ From 8a08c17ca392241f2552409fc30bf09cbc598b2f Mon Sep 17 00:00:00 2001 From: TIHan Date: Sun, 4 Feb 2024 11:50:52 -0800 Subject: [PATCH 17/18] Renamed emitInstrSve to emitOutput_InstrSve --- src/coreclr/jit/emit.h | 1 - src/coreclr/jit/emitarm64.cpp | 4 ++-- src/coreclr/jit/emitarm64.h | 3 +++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index de69c4204cda0a..8ab061f7ab9716 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2398,7 +2398,6 @@ class emitter void emitAdvanceInstrDesc(instrDesc** id, size_t idSize) const; size_t emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp); size_t emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp); - BYTE* emitInstrSve(instrDesc* id, BYTE* dst); bool emitHasFramePtr; diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index b065a80a652daa..a6a0f51e311c13 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -19157,7 +19157,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; default: - dst = emitInstrSve(id, dst); + dst = emitOutput_InstrSve(dst, id); break; } @@ -19348,7 +19348,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) * * Append the machine code corresponding to the given SVE instruction descriptor. 
*/ -BYTE* emitter::emitInstrSve(instrDesc* id, BYTE* dst) +BYTE* emitter::emitOutput_InstrSve(BYTE* dst, instrDesc* id) { code_t code = 0; instruction ins = id->idIns(); diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 25f41d3aa026fd..616c46fc047417 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -129,6 +129,9 @@ void emitInsLoadStoreOp(instruction ins, emitAttr attr, regNumber dataReg, GenTr // Emit the 32-bit Arm64 instruction 'code' into the 'dst' buffer unsigned emitOutput_Instr(BYTE* dst, code_t code); +// Append the machine code corresponding to the given SVE instruction descriptor. +BYTE* emitOutput_InstrSve(BYTE* dst, instrDesc* id); + // A helper method to return the natural scale for an EA 'size' static unsigned NaturalScale_helper(emitAttr size); From b45d3a69f4dc1570a027fc5c93201b175a62d709 Mon Sep 17 00:00:00 2001 From: TIHan Date: Mon, 5 Feb 2024 11:21:39 -0800 Subject: [PATCH 18/18] Formatting --- src/coreclr/jit/codegenarm64test.cpp | 24 +++++------ src/coreclr/jit/emitarm64.cpp | 62 +++++++++++++++------------- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp index 58f1c03cd3ef84..8c964c58e61db8 100644 --- a/src/coreclr/jit/codegenarm64test.cpp +++ b/src/coreclr/jit/codegenarm64test.cpp @@ -6506,10 +6506,10 @@ void CodeGen::genArm64EmitterUnitTestsSve() INS_OPTS_SCALABLE_D); // LDNT1W {.D }, /Z, [.D{, }] // IF_SVE_IG_4A - theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_R4, - INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // LDFF1D {.D }, /Z, [{, , LSL #3}] - theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_R5, - INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // LDFF1SW {.D }, /Z, [{, , LSL #2}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1d, EA_SCALABLE, REG_V1, REG_P2, REG_R3, REG_R4, INS_OPTS_SCALABLE_D, + 
INS_SCALABLE_OPTS_LSL_N); // LDFF1D {.D }, /Z, [{, , LSL #3}] + theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_R5, INS_OPTS_SCALABLE_D, + INS_SCALABLE_OPTS_LSL_N); // LDFF1SW {.D }, /Z, [{, , LSL #2}] // REG_ZR can be used due to the optional {, , LSL #2}} of the format, though it still requires passing // INS_SCALABLE_OPTS_LSL_N with it. theEmitter->emitIns_R_R_R_R(INS_sve_ldff1sw, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_ZR, INS_OPTS_SCALABLE_D, @@ -6788,30 +6788,30 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_st3d, EA_SCALABLE, REG_V2, REG_P3, REG_R4, REG_R6, INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // ST3D {.D, .D, .D }, , [, , // LSL - // #3] + // #3] theEmitter->emitIns_R_R_R_R(INS_sve_st3h, EA_SCALABLE, REG_V1, REG_P0, REG_R3, REG_R8, INS_OPTS_SCALABLE_H, INS_SCALABLE_OPTS_LSL_N); // ST3H {.H, .H, .H }, , [, , // LSL - // #1] + // #1] theEmitter->emitIns_R_R_R_R(INS_sve_st3w, EA_SCALABLE, REG_V0, REG_P1, REG_R2, REG_R3, INS_OPTS_SCALABLE_S, INS_SCALABLE_OPTS_LSL_N); // ST3W {.S, .S, .S }, , [, , // LSL - // #2] + // #2] theEmitter->emitIns_R_R_R_R(INS_sve_st4b, EA_SCALABLE, REG_V0, REG_P6, REG_R5, REG_R4, INS_OPTS_SCALABLE_B); // ST4B {.B, .B, .B, .B }, , [, // ] theEmitter->emitIns_R_R_R_R(INS_sve_st4d, EA_SCALABLE, REG_V5, REG_P2, REG_R1, REG_R0, INS_OPTS_SCALABLE_D, INS_SCALABLE_OPTS_LSL_N); // ST4D {.D, .D, .D, .D }, , // [, - // , LSL #3] + // , LSL #3] theEmitter->emitIns_R_R_R_R(INS_sve_st4h, EA_SCALABLE, REG_V1, REG_P0, REG_R9, REG_R8, INS_OPTS_SCALABLE_H, INS_SCALABLE_OPTS_LSL_N); // ST4H {.H, .H, .H, .H }, , // [, - // , LSL #1] + // , LSL #1] theEmitter->emitIns_R_R_R_R(INS_sve_st4w, EA_SCALABLE, REG_V0, REG_P1, REG_R4, REG_R5, INS_OPTS_SCALABLE_S, INS_SCALABLE_OPTS_LSL_N); // ST4W {.S, .S, .S, .S }, , // [, - // , LSL #2] + // , LSL #2] // IF_SVE_JD_4C theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V1, REG_P4, REG_R5, REG_R6, INS_OPTS_SCALABLE_D, @@ 
-6829,11 +6829,11 @@ void CodeGen::genArm64EmitterUnitTestsSve() theEmitter->emitIns_R_R_R_R(INS_sve_st3q, EA_SCALABLE, REG_V1, REG_P4, REG_R2, REG_R8, INS_OPTS_SCALABLE_Q, INS_SCALABLE_OPTS_LSL_N); // ST3Q {.Q, .Q, .Q }, , [, , // LSL - // #4] + // #4] theEmitter->emitIns_R_R_R_R(INS_sve_st4q, EA_SCALABLE, REG_V4, REG_P1, REG_R8, REG_R2, INS_OPTS_SCALABLE_Q, INS_SCALABLE_OPTS_LSL_N); // ST4Q {.Q, .Q, .Q, .Q }, , // [, - // , LSL #4] + // , LSL #4] // IF_SVE_JJ_4B theEmitter->emitIns_R_R_R_R(INS_sve_st1d, EA_SCALABLE, REG_V0, REG_P3, REG_R2, REG_V1, INS_OPTS_SCALABLE_D, diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 595e3d3b9df52b..49ff10255b5ba1 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1830,7 +1830,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isScalableVectorSize(elemsize)); break; - case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) elemsize = id->idOpSize(); assert(id->idInsOpt() == INS_OPTS_SCALABLE_D); assert(isVectorRegister(id->idReg1())); // ttttt @@ -1962,9 +1962,9 @@ void emitter::emitInsSanityCheck(instrDesc* id) case IF_SVE_IN_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal load (scalar plus scalar) case IF_SVE_IP_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load and broadcast quadword (scalar plus scalar) case IF_SVE_IT_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE load multiple structures (scalar plus scalar) - case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus - // scalar) - case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) + case 
IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) elemsize = id->idOpSize(); assert(insOptsScalableStandard(id->idInsOpt())); assert(isVectorRegister(id->idReg1())); // ttttt @@ -16385,7 +16385,7 @@ void emitter::emitIns_Call(EmitCallType callType, case IF_SVE_IP_4A: switch (ins) - { + { case INS_sve_ld1roh: case INS_sve_ld1row: case INS_sve_ld1rod: @@ -17285,7 +17285,10 @@ void emitter::emitIns_Call(EmitCallType callType, * for the 'dtype' field. */ -/*static*/ emitter::code_t emitter::insEncodeSveElemsize_dtype_ld1w(instruction ins, insFormat fmt, emitAttr size, code_t code) +/*static*/ emitter::code_t emitter::insEncodeSveElemsize_dtype_ld1w(instruction ins, + insFormat fmt, + emitAttr size, + code_t code) { assert(canEncodeSveElemsize_dtype(ins)); assert(ins == INS_sve_ld1w); @@ -23475,8 +23478,8 @@ void emitter::emitDispInsHelp( case IF_SVE_HW_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 32-bit gather load (scalar plus 32-bit unscaled // offsets) // {.S }, /Z, [.S{, }] - case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus - // scalar) + case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) // {.D }, /Z, [.D{, }] case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus // scalar) @@ -23506,13 +23509,13 @@ void emitter::emitDispInsHelp( case IF_SVE_IG_4A_G: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous first-fault load (scalar plus // scalar) // {.D }, /Z, [, , LSL #3] - case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) + case IF_SVE_II_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) // {.Q }, /Z, [, , LSL #3] case IF_SVE_II_4A_B: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar 
plus scalar) // {.D }, /Z, [, , LSL #2] case IF_SVE_II_4A_H: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (quadwords, scalar plus scalar) // {.D }, /Z, [, , LSL #2 - case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) + case IF_SVE_IK_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous load (scalar plus scalar) // {.H }, /Z, [, ] // {.S }, /Z, [, ] // {.D }, /Z, [, ] @@ -23568,27 +23571,27 @@ void emitter::emitDispInsHelp( case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) // {.Q }, /Z, [.D{, }] - case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) + case IF_SVE_IW_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit gather load (vector plus scalar) // {.D }, /Z, [.D{, }] - case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus - // scalar) + case IF_SVE_IX_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit gather non-temporal load (vector plus + // scalar) // {.Q }, , [.D{, }] - case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) + case IF_SVE_IY_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 128-bit scatter store (vector plus scalar) // {.S }, , [.S{, }] - case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus - // scalar) + case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) // {.D }, , [.D{, }] case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) // {.D }, , [.D{, }] - case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus - // scalar) + case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt 
-- SVE2 64-bit scatter non-temporal store (vector plus + // scalar) // {.B }, , [, ] // {.H }, , [, , LSL #1] // {.S }, , [, , LSL #2] // {.D }, , [, , LSL #3] - case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus - // scalar) + case IF_SVE_JB_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous non-temporal store (scalar plus + // scalar) // {.B, .B }, , [, ] // {.H, .H }, , [, , LSL #1] // {.S, .S }, , [, , LSL #2] @@ -23601,10 +23604,10 @@ void emitter::emitDispInsHelp( // {.H, .H, .H, .H }, , [, , LSL #1] // {.S, .S, .S, .S }, , [, , LSL #2] // {.D, .D, .D, .D }, , [, , LSL #3] - case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) + case IF_SVE_JC_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE store multiple structures (scalar plus scalar) // {.Q }, , [, , LSL #2] // {.D }, , [, , LSL #3] - case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) + case IF_SVE_JD_4C: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) // {.Q }, , [, , LSL #3] case IF_SVE_JD_4C_A: // ...........mmmmm ...gggnnnnnttttt -- SVE contiguous store (scalar plus scalar) // {.Q, .Q }, , [, , LSL #4] @@ -26765,7 +26768,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency = PERFSCORE_LATENCY_9C; break; - case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus scalar) + case IF_SVE_IF_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus + // scalar) case IF_SVE_IF_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit gather non-temporal load (vector plus // scalar) result.insThroughput = PERFSCORE_THROUGHPUT_2X; @@ -26921,8 +26925,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins } break; - case IF_SVE_IU_4B: // 
...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked - // scaled offsets) + case IF_SVE_IU_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked + // scaled offsets) case IF_SVE_IU_4B_B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked // scaled offsets) case IF_SVE_IU_4B_D: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit gather load (scalar plus 32-bit unpacked @@ -26965,8 +26969,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins } break; - case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus - // scalar) + case IF_SVE_IZ_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus + // scalar) case IF_SVE_IZ_4A_A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 32-bit scatter non-temporal store (vector plus // scalar) case IF_SVE_JA_4A: // ...........mmmmm ...gggnnnnnttttt -- SVE2 64-bit scatter non-temporal store (vector plus @@ -27060,8 +27064,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins } break; - case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled - // offsets) + case IF_SVE_JJ_4B: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled + // offsets) case IF_SVE_JJ_4B_C: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled // offsets) case IF_SVE_JJ_4B_E: // ...........mmmmm ...gggnnnnnttttt -- SVE 64-bit scatter store (scalar plus 64-bit scaled