[SYCL][JM] Add Panther Lake (PTL) support to joint matrix query and aspect (#16885)

dkhaldi · web-flow · commit a13d57a7f99a · 2025-02-05T13:53:13.000-08:00
Also, as part of this PR, I added the missing references to the recently
added architectures (BMG, LNL).
diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc
@@ -490,7 +490,9 @@ with the machine learning types, `T` should be the element type
 ==== Appendix: Restrictions Per Hardware
 ===== Intel XMX
 The checked APIs are currently available in devices with the architecture
-`architecture::intel_gpu_pvc`. The following restrictions apply to
+`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`, or
+`architecture::intel_gpu_ptl_u`. The following restrictions apply to
 these checked APIs:
 
 - The `stride` parameter has the following restrictions:
diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
@@ -57,8 +57,8 @@ optional kernel features as defined in section 5.7 of the core SYCL
 specification.  Each device supports only certain values for the `M`,
 `N`, and `K` template parameters and only certain types for the `Ta`,
 `Tb`, and `Tc` template parameters. Applications can use the query API
-in `matrix_params` or 
-`get_info<ext::oneapi::experimental::info::device::matrix_combinations>` 
+in `matrix_params` or
+`get_info<ext::oneapi::experimental::info::device::matrix_combinations>`
 to determine the set of legal parameters for each device.  If the
 application submits a kernel using an unsupported `joint_matrix` type
 or calls `joint_matrix_mad` with an unsupported combination, the
@@ -269,7 +269,7 @@ The two last overloads of `joint_matrix_load` take
 `sycl::ext::oneapi::experimental::annotated_ptr` as argument instead
 of `sycl::multi_ptr`. The property list associated with the
 `annotated_ptr` argument represents the compile-time constant
-properties for cache control included in the SYCL extenion
+properties for cache control included in the SYCL extension
 link:../../proposed/sycl_ext_intel_cache_controls.asciidoc[sycl_ext_intel_cache_controls]
 as illustrated in the example below.
 
@@ -1109,43 +1109,49 @@ This is currently available in devices with the architecture
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
 `architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_dg2_g10`,
 `architecture::intel_gpu_dg2_g11`, `architecture::intel_gpu_dg2_g12`,
-and `architecture::intel_gpu_arl_h`.
+`architecture::intel_gpu_arl_h`, `architecture::intel_gpu_ptl_h`, and
+`architecture::intel_gpu_ptl_u`.
 
 [frame="none",options="header"]
 |======================
 | A type | B type | C type | D type | M | N | K | device
 .2+| `matrix_type::uint8`  .2+| `matrix_type::uint8` .2+|
 `matrix_type::sint32` .2+| `matrix_type::sint32`  .2+|  +<=+ 8 |  16 .2+|  32
 |`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 |8|`architecture::intel_gpu_dg2_g10,
 architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`,
 `architecture::intel_gpu_arl_h`
 .2+| `matrix_type::uint8`  .2+| `matrix_type::sint8` .2+|
 `matrix_type::sint32` .2+|`matrix_type::sint32`  .2+|  +<=+ 8 |  16 .2+|  32 |
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 |8|`architecture::intel_gpu_dg2_g10,
 architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`,
 `architecture::intel_gpu_arl_h`
 .2+| `matrix_type::sint8`  .2+| `matrix_type::uint8` .2+|
 `matrix_type::sint32` .2+|`matrix_type::sint32`  .2+|  +<=+ 8 |  16 .2+|  32 |
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 |8|`architecture::intel_gpu_dg2_g10,
 architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`,
 `architecture::intel_gpu_arl_h`
 .2+| `matrix_type::sint8`  .2+| `matrix_type::sint8` .2+|
 `matrix_type::sint32` .2+| `matrix_type::sint32`  .2+|  +<=+ 8 |  16 .2+|  32 |
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 |8|`architecture::intel_gpu_dg2_g10,
 architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`,
 `architecture::intel_gpu_arl_h`
 .8+|`matrix_type::fp16`       .8+|  `matrix_type::fp16`   .8+|
 `matrix_type::fp32` .8+|`matrix_type::fp32`  .1+| 16 .1+| 16 | 16
 .6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 .2+| 1 .2+| 64 | 16 |32
 .2+| 32 .2+| 64 | 16 |32
 .2+|  +<=+ 8 |  16   .2+|  16
@@ -1156,24 +1162,28 @@ architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`,
 .6+|`matrix_type::fp16`       .6+|  `matrix_type::fp16`   .6+|
 `matrix_type::fp16` .6+|`matrix_type::fp32`  .1+|  +<=+ 8 |  16   .1+|  16
 .6+| `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 | 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32
 .2+| 32 .2+| 64 | 16 | 32
 .6+|`matrix_type::fp16`       .6+|  `matrix_type::fp16`   .6+|
 `matrix_type::fp32` .6+|`matrix_type::fp16`  .1+|  +<=+ 8 |  16    .1+|  16
 .6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 | 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32
 .2+| 32 .2+| 64 |16 | 32
 .6+|`matrix_type::fp16`       .6+|  `matrix_type::fp16`   .6+|
 `matrix_type::fp16` .6+|`matrix_type::fp16`  .1+|  +<=+ 8 |  16   .1+|  16
 .6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 | 16 | 16 | 16 .2+| 1 .2+| 64 | 16 |32 .2+| 32 .2+| 64 | 16 | 32
 .8+|  `matrix_type::bf16`       .8+|  `matrix_type::bf16`   .8+|
 `matrix_type::fp32` .8+| `matrix_type::fp32`  | 16 | 16 | 16
 .6+|`architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 .2+| 1 .2+| 64 | 16 | 32
 .2+| 32 .2+| 64 | 16 |32
 .2+|  +<=+ 8 |  16   .2+|  16
@@ -1184,28 +1194,34 @@ architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`,
 .6+|`matrix_type::bf16`       .6+|  `matrix_type::bf16`   .6+|
 `matrix_type::bf16` .6+|`matrix_type::fp32`  .1+|  +<=+ 8 |  16   .1+|  16 .6+|
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 | 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32
 .2+| 32 .2+| 64 |16 | 32
 .6+|`matrix_type::bf16`       .6+|  `matrix_type::bf16`   .6+|
 `matrix_type::fp32` .6+|`matrix_type::bf16`  .1+|  +<=+ 8 |  16   .1+|  16 .6+|
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 | 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32
 .2+| 32 .2+| 64 |16 | 32
 .6+|`matrix_type::bf16`       .6+|  `matrix_type::bf16`   .6+|
 `matrix_type::bf16` .6+|`matrix_type::bf16`  .1+|  +<=+ 8 |  16   .1+|  16 .6+|
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 | 16 | 16 | 16 .2+| 1 .2+| 64 | 16 | 32
 .2+| 32 .2+| 64 |16 | 32
 |  `matrix_type::tf32`       |  `matrix_type::tf32`   |
 `matrix_type::fp32` .2+| `matrix_type::fp32`   |  +<=+ 8 |  16   |  8 |
 `architecture::intel_gpu_pvc`, `architecture::intel_gpu_bmg_g21`,
-`architecture::intel_gpu_lnl_m`
+`architecture::intel_gpu_lnl_m`, `architecture::intel_gpu_ptl_h`,
+`architecture::intel_gpu_ptl_u`
 |======================
 
-===== Restrictions on `architecture::intel_gpu_pvc`
+===== Restrictions on `architecture::intel_gpu_pvc`,
+`architecture::intel_gpu_bmg_g21`, `architecture::intel_gpu_lnl_m`,
+`architecture::intel_gpu_ptl_h`, and `architecture::intel_gpu_ptl_u`
 
 - The `stride` parameter to `joint_matrix_load` and
   `joint_matrix_store` has the following restrictions:
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
@@ -684,6 +684,7 @@ bool device_impl::has(aspect Aspect) const {
         arch::intel_gpu_dg2_g10, arch::intel_gpu_dg2_g11,
         arch::intel_gpu_dg2_g12, arch::intel_gpu_bmg_g21,
         arch::intel_gpu_lnl_m,   arch::intel_gpu_arl_h,
+        arch::intel_gpu_ptl_h,   arch::intel_gpu_ptl_u,
     };
     try {
       return std::any_of(
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
@@ -868,7 +868,9 @@ struct get_device_info_impl<
       };
     else if ((architecture::intel_gpu_pvc == DeviceArch) ||
              (architecture::intel_gpu_bmg_g21 == DeviceArch) ||
-             (architecture::intel_gpu_lnl_m == DeviceArch)) {
+             (architecture::intel_gpu_lnl_m == DeviceArch) ||
+             (architecture::intel_gpu_ptl_h == DeviceArch) ||
+             (architecture::intel_gpu_ptl_u == DeviceArch)) {
       std::vector<ext::oneapi::experimental::matrix::combination> pvc_combs = {
           {8, 0, 0, 0, 16, 32, matrix_type::uint8, matrix_type::uint8,
            matrix_type::sint32, matrix_type::sint32},
diff --git a/sycl/test-e2e/matrix_aspect.cpp b/sycl/test-e2e/matrix_aspect.cpp
@@ -2,11 +2,10 @@
 // RUN: %{run-unfiltered-devices} %t.out
 //
 
-//==--------------- AMX_aspect.cpp - SYCL device test
-//------------------------==//
+//==--------------- matrix_aspect.cpp - SYCL device test--------------------==//
 //
 // Checks that the has(aspect) method on a device returns the correct answer
-// when queried about ext_intel_matrix AMX aspect.
+// when queried about the ext_intel_matrix joint matrix aspect.
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -21,8 +20,10 @@ using namespace sycl;
 using arch = sycl::ext::oneapi::experimental::architecture;
 int main() {
   const std::vector<arch> supported_archs = {
-      arch::intel_cpu_spr, arch::intel_gpu_pvc, arch::intel_gpu_dg2_g10,
-      arch::intel_gpu_dg2_g11, arch::intel_gpu_dg2_g12};
+      arch::intel_cpu_spr,     arch::intel_cpu_gnr,     arch::intel_cpu_dmr,
+      arch::intel_gpu_pvc,     arch::intel_gpu_dg2_g10, arch::intel_gpu_dg2_g11,
+      arch::intel_gpu_dg2_g12, arch::intel_gpu_bmg_g21, arch::intel_gpu_lnl_m,
+      arch::intel_gpu_arl_h,   arch::intel_gpu_ptl_h,   arch::intel_gpu_ptl_u};
   for (const auto &plt : platform::get_platforms()) {
     for (auto &dev : plt.get_devices()) {
       try {