diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 965b6bbd..6a73f426 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -2017,12 +2017,12 @@ void put64() void putAMX_TILE() { - puts("void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }"); - puts("void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }"); - puts("void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }"); - puts("void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66 | T_0F38 | T_W0, 0x4b); }"); + puts("void ldtilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_0F38|T_W0, 0x49); }"); + puts("void sttilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_66|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_66|T_0F38 | T_W0, 0x49); }"); + puts("void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4B); }"); + puts("void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66|T_0F38|T_W0, 0x4B); }"); puts("void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); }"); - puts("void tilestored(const Address& addr, const Tmm& tm) { opVex(tm, &tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }"); + puts("void tilestored(const Address& addr, const Tmm& tm) { if (opROO(Reg(), addr, tm, T_APX|T_F3|T_0F38|T_W0, 0x4B)) return; opVex(tm, &tmm0, addr, T_F3|T_0F38|T_W0, 0x4B); }"); puts("void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }"); } void putAMX_INT8() diff --git a/test/apx.cpp b/test/apx.cpp index d184e6c3..f03b032c 100644 --- a/test/apx.cpp +++ b/test/apx.cpp @@ -1753,3 +1753,25 @@ CYBOZU_TEST_AUTO(kmov) CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + +CYBOZU_TEST_AUTO(amx) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + ldtilecfg(ptr [r30+r29*4+0x12]); + sttilecfg(ptr [r30+r29*4+0x12]); + tileloadd(tmm1, ptr [r30+r29*4+0x12]); + tileloaddt1(tmm3, ptr [r30+r29*4+0x12]); + tilestored(ptr [r30+r29*4+0x12], tmm5); + } + } c; + const uint8_t tbl[] = { + 0x62, 0x9a, 0x78, 0x08, 0x49, 0x44, 0xae, 0x12, 0x62, 0x9a, 0x79, 0x08, 0x49, 0x44, 0xae, 0x12, + 0x62, 0x9a, 0x7b, 0x08, 0x4b, 0x4c, 0xae, 0x12, 0x62, 0x9a, 0x79, 0x08, 0x4b, 0x5c, 0xae, 0x12, + 0x62, 0x9a, 0x7a, 0x08, 0x4b, 0x6c, 0xae, 0x12, + }; + const size_t n = sizeof(tbl); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 5d03da8e..181d6ed2 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2708,6 +2708,7 @@ class CodeGenerator : public CodeArray { Address addr2 = addr.cloneNoOptimize(); const RegExp exp = addr2.getRegExp(); if (exp.getBase().getBit() == 0 || exp.getIndex().getBit() == 0) XBYAK_THROW(ERR_NOT_SUPPORTED) + if (opROO(Reg(), addr2, t1, T_APX|type, code)) return; opVex(t1, &tmm0, addr2, type, code); } #endif diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 5b072f3e..882ec025 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1926,12 +1926,12 @@ void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r1, r2, addr, T_APX|T_66|T_0F38, 0xEA); } void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r1, r2, addr, T_APX|T_66|T_0F38, 0xE8); } void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opRRO(r1, r2, addr, T_APX|T_66|T_0F38, 0xE4); } -void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); } -void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); } -void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); } -void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66 | T_0F38 | T_W0, 0x4b); } +void ldtilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_0F38|T_W0, 0x49); } +void sttilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_66|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_66|T_0F38 | T_W0, 0x49); } +void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4B); } +void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66|T_0F38|T_W0, 0x4B); } void tilerelease() { db(0xc4); db(0xe2); db(0x78); db(0x49); db(0xc0); } -void tilestored(const Address& addr, const Tmm& tm) { opVex(tm, &tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); } +void tilestored(const Address& addr, const Tmm& tm) { if (opROO(Reg(), addr, tm, T_APX|T_F3|T_0F38|T_W0, 0x4B)) return; opVex(tm, &tmm0, addr, T_F3|T_0F38|T_W0, 0x4B); } void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); } void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); } void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }