From 914763dc88395d5f0840de5c1f13d5ade08a4262 Mon Sep 17 00:00:00 2001 From: Nishant Patel Date: Thu, 28 Sep 2023 10:02:09 -0700 Subject: [PATCH] Add spirv test cases to showcase the use of raw_send --- .../IntelVectorExtension/DPAS_raw_send.mlir | 307 ++++++++++++++++++ .../Load_1d_raw_send.mlir | 185 +++++++++++ .../IntelVectorExtension/Load_1d_slm.mlir | 150 +++++++++ .../Load_2d_raw_send.mlir | 201 ++++++++++++ .../Store2d_raw_send.mlir | 192 +++++++++++ 5 files changed, 1035 insertions(+) create mode 100644 test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir create mode 100644 test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir create mode 100644 test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir create mode 100644 test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir create mode 100644 test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir diff --git a/test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir b/test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir new file mode 100644 index 000000000..bb11319b5 --- /dev/null +++ b/test/SPIRV/IntelVectorExtension/DPAS_raw_send.mlir @@ -0,0 +1,307 @@ +// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck + + +/// A simple Matrix Multiplication using DPAS instruction +/// A and B are in bf16, while the result C is f32 +/// C[4x16] = A[4x16] x B[16x16] +/// This example uses 2d block load/store + +module attributes {gpu.container_module} { + + // function to setup the launch and launch the 
kernel + // args: size_t systolic_depth, size_t repeat_cnt, size_t N + func.func @dpas_gpu(%arg_sys_dpth: index, %arg_rpt_cnt: index, %arg_N: index, %arg_C : memref, %arg_B : memref, %arg_A : memref) { + %c1 = arith.constant 1 : index + + // Since we are using only one DPAS instruction we are launching, + // 1 workgroup and, 1 thread per workgroup + gpu.launch_func @dpas_module::@dpas_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%arg_C : memref, %arg_B : memref, %arg_A : memref) + return + } + + // SPIR-V DPAS module, it holds the DPAS kernel + spirv.module @__spv__dpas_module Physical64 OpenCL requires #spirv.vce attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // DPAS kernel + spirv.func @dpas_kernel(%arg0: !spirv.ptr, %arg1: !spirv.ptr, %arg2: !spirv.ptr) "DontInline" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64, VectorComputeFunctionINTEL} { + %true = spirv.Constant true + + %uchar_0 = spirv.Constant 0 : i8 + %uchar_1 = spirv.Constant 1 : i8 + %uchar_2 = spirv.Constant 2 : i8 + %uchar_3 = spirv.Constant 3 : i8 + %uchar_4 = spirv.Constant 4 : i8 + %uchar_7 = spirv.Constant 7 : i8 + %uchar_8 = spirv.Constant 8 : i8 + %uchar_9 = spirv.Constant 9 : i8 + %uchar_10 = spirv.Constant 10 : i8 + %uchar_15 = spirv.Constant 15 : i8 + %uchar_16 = spirv.Constant 16 : i8 + + %ushort_1 = spirv.Constant 1 : i16 + + %uint_0 = spirv.Constant 0 : i32 + %uint_1 = spirv.Constant 1 : i32 + %uint_2 = spirv.Constant 2 : i32 + %uint_3 = spirv.Constant 3 : i32 + %uint_4 = spirv.Constant 4 : i32 + %uint_5 = spirv.Constant 5 : i32 + %uint_6 = spirv.Constant 6 : i32 + %uint_7 = spirv.Constant 7 : i32 + %uint_8 = spirv.Constant 8 : i32 + %uint_9 = spirv.Constant 9 : i32 + %uint_15 = spirv.Constant 15 : i32 + %uint_16 = spirv.Constant 16 : i32 + %uint_31 = spirv.Constant 31 : i32 + %uint_32 = spirv.Constant 32 : i32 + 
%uint_63 = spirv.Constant 63 : i32 + %uint_64 = spirv.Constant 64 : i32 + %uint_783 = spirv.Constant 783 : i32 + %ulong_4294967295 = spirv.Constant 4294967295 : i64 // 0xFFFFFFFF + + // Load Message descriptor : 0 00 0001 00100 000 0 0 000 010 0 0 0 000011 + // https://gfxspecs.intel.com/Predator/Home/Index/53680 + %uint_37749763 = spirv.Constant 37749763 : i32 + + // Store Message descriptor : 0 0001 00000 000 00000 010 000 000111 + // https://gfxspecs.intel.com/Predator/Home/Index/53530 + %uint_33555463 = spirv.Constant 33555463 : i32 + %zero_vector = spirv.Constant dense<0.0> : vector<64xf32> + %ulong_32 = spirv.Constant 32 : i64 + + %addr_payload_vector_store = spirv.Constant dense<[0,0,0,0,0,0,0,0]> : vector<8xi32> + + // Cast the uchar pointers (i8 ptr) to ulongs (i64) + %arg_0 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg0) : (!spirv.ptr) -> i64 + %arg_1 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg1) : (!spirv.ptr) -> i64 + %arg_2 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg2) : (!spirv.ptr) -> i64 + + // --------------- LOAD/STORE OF C USING RAW SEND ------------------- + // Extract the lower (LSB) and upper (MSB) halves of the 64-bit address and convert them to 32 bits + %addr_payload_msb_32_or = spirv.ShiftRightLogical %arg_0, %ulong_32 : i64, i64 + %addr_payload_msb_32 = spirv.UConvert %addr_payload_msb_32_or : i64 to i32 + %addr_payload_lsb_and = spirv.BitwiseAnd %arg_0, %ulong_4294967295 : i64 + %addr_payload_lsb_32 = spirv.UConvert %addr_payload_lsb_and : i64 to i32 + + // https://gfxspecs.intel.com/Predator/Home/Index/53567 + // For loading/storing C, a 4x16 block of f32 (vector<64xf32>) + // %addr_payload = vector<8xi32> + // vector[0] = %addr_payload_lsb_32 + // vector[1] = %addr_payload_msb_32 + // vector[2] = 63: width in bytes - 1 (16 elements * 32 bits = 512 bits / 8 = 64 bytes; 64 - 1 = 63) + // vector[3] = 3: height in number of rows - 1 (4 - 1 = 3) + // vector[4] = pitch = 63.
distance between two rows in number of bytes + // vector[5] = block start X = 0; + // vector[6] = block start Y = 0; + // vector[7] = block width 224:231 (15), block height 232:239 (3), array_length 240:243 (0) = 0000 00000011 00001111 = 783 + %0 = spirv.VectorInsertDynamic %addr_payload_lsb_32, %addr_payload_vector_store[%uint_0] : vector<8xi32>, i32 + %1 = spirv.VectorInsertDynamic %addr_payload_msb_32, %0[%uint_1] : vector<8xi32>, i32 + %2 = spirv.VectorInsertDynamic %uint_63, %1[%uint_2] : vector<8xi32>, i32 + %3 = spirv.VectorInsertDynamic %uint_3, %2[%uint_3] : vector<8xi32>, i32 + %4 = spirv.VectorInsertDynamic %uint_63, %3[%uint_4] : vector<8xi32>, i32 + %5 = spirv.VectorInsertDynamic %uint_0, %4[%uint_5] : vector<8xi32>, i32 + %6 = spirv.VectorInsertDynamic %uint_0, %5[%uint_6] : vector<8xi32>, i32 + %7 = spirv.VectorInsertDynamic %uint_783, %6[%uint_7] : vector<8xi32>, i32 + + + // Load vector C using raw_send2 + %C = spirv.FunctionCall @llvm_genx_raw_send2_v64f32_i1_v8i32(%uchar_0, %uchar_1, %true, %uchar_1, %uchar_4, %uchar_15, %uint_0, %uint_37749763, %7, %zero_vector) : (i8, i8, i1, i8, i8, i8, i32, i32, vector<8xi32>, vector<64xf32>) -> vector<64xf32> + + // Load vector B using load stateless; this load uses the internal VNNI transformation while loading + %B = spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v128i32_i1_i64(%true, %uchar_0, %uchar_0, %uchar_2, %uchar_1, %uchar_1, %uint_16, %uint_16, %uchar_1, %arg_1, %uint_31, %uint_15, %uint_31, %uint_0, %uint_0) : (i1, i8, i8, i8, i8, i8, i32, i32, i8, i64, i32, i32, i32, i32, i32) -> vector<128xi32> + + // Load Vector A + + %A = spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v32i32_i1_i64(%true, %uchar_0, %uchar_0, %uchar_2, %uchar_1, %uchar_1, %uint_16, %uint_4, %uchar_0, %arg_2, %uint_31, %uint_3, %uint_31, %uint_0, %uint_0) : (i1, i8, i8, i8, i8, i8, i32, i32, i8, i64, i32, i32, i32, i32, i32) -> vector<32xi32> + + // Call dpas2 + %dpas_result = spirv.FunctionCall
@llvm_genx_dpas2_v64f32_v64f32_v128i32_v32i32(%C, %B, %A, %uint_9, %uint_9, %uint_8, %uint_4, %uint_0, %uint_0): (vector<64 x f32>, vector<128 x i32>, vector<32 x i32>, i32, i32, i32, i32, i32, i32) -> vector<64 x f32> + + + //store using raw_send2_no_result + spirv.FunctionCall @llvm_genx_raw_sends2_noresult_i1_v8i32_v64f32(%uchar_0, %uchar_0, %true, %uchar_1, %uchar_4, %uchar_15, %uint_0, %uint_33555463, %7, %dpas_result) : (i8, i8, i1, i8, i8, i8, i32, i32, vector<8xi32>, vector<64xf32>) -> () + + spirv.Return + } + spirv.EntryPoint "Kernel" @dpas_kernel + spirv.ExecutionMode @dpas_kernel "ContractionOff" + spirv.ExecutionMode @dpas_kernel "SharedLocalMemorySizeINTEL", 0 + // Utility function declarations (Intel vc-intrinsics) + spirv.func @llvm_genx_address_convert_i64_p1i8(%arg: !spirv.ptr) -> i64 "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.address.convert.i64.p1i8", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_load2d_stateless_v128i32_i1_i64(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : i8, %arg9 : i64, %arg10 : i32, %arg11 : i32, %arg12 : i32, %arg13 : i32, %arg14 : i32) -> vector<128xi32> "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.load2d.stateless.v128i32.i1.i64", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_load2d_stateless_v64f32_i1_i64(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : i8, %arg9 : i64, %arg10 : i32, %arg11 : i32, %arg12 : i32, %arg13 : i32, %arg14 : i32) -> vector<64xf32> "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.load2d.stateless.v64f32.i1.i64", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_load2d_stateless_v32i32_i1_i64(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 
: i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : i8, %arg9 : i64, %arg10 : i32, %arg11 : i32, %arg12 : i32, %arg13 : i32, %arg14 : i32) -> vector<32xi32> "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.load2d.stateless.v32i32.i1.i64", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_store2d_stateless_i1_i64_v64f32(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : i8, %arg9 : i64, %arg10 : i32, %arg11 : i32, %arg12 : i32, %arg13 : i32, %arg14 : i32, %arg15 : vector<64xf32>) "None" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.store2d.stateless.i1.i64.v64f32", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_dpas2_v64f32_v64f32_v128i32_v32i32(%arg0 : vector<64 x f32>, %arg1 : vector<128 x i32>, %arg2 : vector<32 x i32>, %arg3 : i32, %arg4 : i32, %arg5 : i32, %arg6 : i32, %arg7 : i32, %arg8 : i32) -> vector<64 x f32> "Pure" attributes{ + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.dpas2.v64f32.v64f32.v128i32.v32i32", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_raw_send2_v64f32_i1_v8i32(%arg0 : i8, %arg1 : i8, %arg2 : i1, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : vector<8xi32>, %arg9 : vector<64xf32>) -> vector<64xf32> "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.raw.send2.v64f32.i1.v8i32", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_raw_sends2_noresult_i1_v8i32_v64f32(%arg0 : i8, %arg1 : i8, %arg2 : i1, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : vector<8xi32>, %arg9 : vector<64xf32>) "None" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.raw.sends2.noresult.i1.v8i32.v64f32", + linkage_type= + >, + 
VectorComputeFunctionINTEL} + } + + + // GPU module, almost same as the SPIR-V module but without 'spirv' dialect specific properties + gpu.module @dpas_module { + gpu.func @dpas_kernel(%arg0: memref, %arg1: memref, %arg2: memref) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.return + } + } + + func.func @dpas_ref(%arg_sys_dpth: index, %arg_rpt_cnt: index, %arg_N: index, %arg_C : memref, %arg_B : memref, %arg_A : memref){ + return + } + + func.func @dpas_test(%arg_sys_dpth: index, %arg_rpt_cnt: index, %arg_N: index){ + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.100000e+00 : f32 + %cst_2 = arith.constant 2.200000e+00 : f32 + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + + // Allocate vectors to be passed to function + + // Setting up Vector C + %C_size = arith.muli %arg_rpt_cnt, %arg_N : index + %C_size_i8 = arith.muli %C_size, %c4 : index + + %memref_C_i8 = gpu.alloc host_shared (%C_size_i8) : memref + %memref_C = memref.view %memref_C_i8[%c0][%C_size] : memref to memref + // Initialize C to 0 + call @fillResource1DF32(%memref_C, %cst_0) : (memref, f32) -> () + + // Setting up the Vector B & A + // B and A is setup slightly differently than other vectors, since B is + // expected to be bf16 by the dpas instruction, but can not be passed + // in SPIR-V (SPIR-V does not support bf16), we first allocate B + // as i8 and then change the type (create view) to bf16. We use the bf16 + // view to initialize the vectors. We finally pass the i8 pointer to the + // kernel and load bf16 from that using the intel vc-intrinsic + + // Alternative ways:, we could also create a i16 view and pass that. 
+ // This way, both views point to the same vector, but are accessed + // differently based on which view is used + // Since, in our case, the vector is essentially bf16, but needed to + // have a view of i16 just to be passed in SPIR-V and inside DPAS + // reinterpreted back as bf16, we can safely use this approach + // / bf16 (initialization) \ + // B = i8 - -> + // \ i16 (passed to SPIR-V kernel) / + + %tmp_sys_dpth = arith.muli %arg_sys_dpth, %c2 : index + %B_size = arith.muli %tmp_sys_dpth, %arg_N : index + + // Since, we are allocating bf16 as i8, %B_size * 2 is used + // for allocation size + %B_size_i8 = arith.muli %B_size, %c2 : index + + %memref_B = gpu.alloc host_shared (%B_size_i8) : memref + + // Create a view of bf16 vector + %memref_B_bf16 = memref.view %memref_B[%c0][%B_size] : memref to memref + + // Initialize it to 1.1 as bf16, since that's the original data type for B + call @fillResource1DBF16(%memref_B_bf16, %cst_1) : (memref, f32) -> () + + // Setting up the Vector A + %A_size = arith.muli %tmp_sys_dpth, %arg_rpt_cnt : index + + // Since, we are allocating bf16 as i8, %A_size * 2 is used + // for allocation size + %A_size_i8 = arith.muli %A_size, %c2 : index + + %memref_A = gpu.alloc host_shared (%A_size_i8) : memref + // Create a view of bf16 vector + %memref_A_bf16 = memref.view %memref_A[%c0][%A_size] : memref to memref + + // SPIR-V does not support bf16, hence B and A are passed as i8; the kernel loads bf16 from these buffers using the Intel vc-intrinsics + + // Initialize it to 2.2 as bf16, since that's the original data type for A + call @fillResource1DBF16(%memref_A_bf16, %cst_2) : (memref, f32) -> () + + // Calling the reference function/CPU version + call @dpas_ref(%arg_sys_dpth, %arg_rpt_cnt, %arg_N, %memref_C, %memref_B_bf16, %memref_A_bf16) : (index, index, index, memref, memref, memref) -> () + + // Calling the GPU version, passing the underlying i8 buffers of C, B and A + call @dpas_gpu(%arg_sys_dpth, %arg_rpt_cnt, %arg_N, %memref_C_i8,
%memref_B, %memref_A) : (index, index, index, memref, memref, memref) -> () + + // Print the result + %result = memref.cast %memref_C : memref to memref<*xf32> + call @printMemrefF32(%result) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}} + // CHECK-COUNT-64: 38.8301 + return + } + + // main function + func.func @main() { + %cst_sys_dpth = arith.constant 8 : index + %cst_rpt_cnt = arith.constant 4 : index + %cst_N = arith.constant 16 : index + + call @dpas_test(%cst_sys_dpth, %cst_rpt_cnt, %cst_N) : (index, index, index) -> () + return + } + + // Helper functions + func.func private @fillResource1DBF16(memref, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DF16(memref, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DF32(memref, f32) attributes {llvm.emit_c_interface} + func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} + +} diff --git a/test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir b/test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir new file mode 100644 index 000000000..5bc7b5600 --- /dev/null +++ b/test/SPIRV/IntelVectorExtension/Load_1d_raw_send.mlir @@ -0,0 +1,185 @@ +// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck + +// A 
simple test case showing how to use raw_send2 VC intrinsics for doing a load1d + +module attributes {gpu.container_module} { + + // function to setup the launch and launch the kernel + func.func @load_1d_raw_send_gpu(%arg_A : memref<256xi8>, %arg_B : memref<256xi8>) { + %c1 = arith.constant 1 : index + + // 1 workgroup and, 1 thread per workgroup + gpu.launch_func @load_1d_raw_send_module::@load_1d_raw_send_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%arg_A : memref<256xi8>, %arg_B : memref<256xi8>) + return + } + + // SPIR-V load_1d_raw_send module, it holds the load_1d_raw_send kernel + spirv.module @__spv__load_1d_raw_send_module Physical64 OpenCL requires #spirv.vce attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // load_1d_raw_send kernel + spirv.func @load_1d_raw_send_kernel(%arg1: !spirv.ptr, %arg2: !spirv.ptr) "DontInline" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64, VectorComputeFunctionINTEL} { + %ushort_1 = spirv.Constant 1 : i16 + %true = spirv.Constant true + + %uchar_0 = spirv.Constant 0 : i8 + %uchar_1 = spirv.Constant 1 : i8 + %uchar_2 = spirv.Constant 2 : i8 + %uchar_3 = spirv.Constant 3 : i8 + %uchar_4 = spirv.Constant 4 : i8 + %uchar_7 = spirv.Constant 7 : i8 + %uchar_8 = spirv.Constant 8 : i8 + %uchar_9 = spirv.Constant 9 : i8 + %uchar_10 = spirv.Constant 10 : i8 + %uchar_15 = spirv.Constant 15 : i8 + %uchar_16 = spirv.Constant 16 : i8 + + %uint_0 = spirv.Constant 0 : i32 + %uint_1 = spirv.Constant 1 : i32 + %uint_2 = spirv.Constant 2 : i32 + %uint_3 = spirv.Constant 3 : i32 + %uint_4 = spirv.Constant 4 : i32 + %uint_5 = spirv.Constant 5 : i32 + %uint_6 = spirv.Constant 6 : i32 + %uint_7 = spirv.Constant 7 : i32 + %uint_8 = spirv.Constant 8 : i32 + %uint_9 = spirv.Constant 9 : i32 + %uint_15 = spirv.Constant 15 : i32 + %uint_16 = spirv.Constant 16 : i32 + %uint_31 = 
spirv.Constant 31 : i32 + %uint_32 = spirv.Constant 32 : i32 + %uint_63 = spirv.Constant 63 : i32 + %uint_64 = spirv.Constant 64 : i32 + %uint_255 = spirv.Constant 255 : i32 + %uint_256 = spirv.Constant 256 : i32 + %uint_3855 = spirv.Constant 3855 : i32 + + + // You can refer to : + // https://gfxspecs.intel.com/Predator/Home/Index/53523 + // bit[0: 5] : opcode for instruction: + // https://gfxspecs.intel.com/Predator/Home/Index/68015 + // bit[7: 8] Address Size + // https://gfxspecs.intel.com/Predator/Home/Index/53558 + // bit[9: 11] Data Size + // https://gfxspecs.intel.com/Predator/Home/Index/53563 + // bit[12: 14] Vector Size + // https://gfxspecs.intel.com/Predator/Home/Index/53566 + // bit[15] Transpose + // https://gfxspecs.intel.com/Predator/Home/Index/53565 + // https://github.com/intel-innersource/drivers.gpu.compute.vc-intrinsics/blob/cmc_experimental/GenXIntrinsics/include/llvm/GenXIntrinsics/GenXIntrinsics.h#L85 + // bit[17: 19] cacheHint + // https://gfxspecs.intel.com/Predator/Home/Index/53560 + // https://github.com/intel-innersource/drivers.gpu.compute.vc-intrinsics/blob/cmc_experimental/GenXIntrinsics/include/llvm/GenXIntrinsics/Intrinsic_definitions.py#L1953 + // bit[20: 24] Dest Length + // bit[25: 28] Src Length + // https://gfxspecs.intel.com/Predator/Home/Index/53680 + %uint_38335872 = spirv.Constant 38335872 : i32 + + %ulong_32 = spirv.Constant 32 : i64 + %zero_vector = spirv.Constant dense<0.0> : vector<64xf32> + %addr_payload_vector = spirv.Constant dense<[0,0,0,0]> : vector<4xi64> + + // Cast the uchar pointers (i8 ptr) to ulongs (i64) + %arg_1 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg1) : (!spirv.ptr) -> i64 + %arg_2 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg2) : (!spirv.ptr) -> i64 + + + // For load_1d we only need to put addr into the payload + // In the generated assembly, a mov instruction will be generated to create the payload. 
+ %0 = spirv.VectorInsertDynamic %arg_1, %addr_payload_vector[%uint_0] : vector<4xi64>, i32 + + // Load A from global using raw_send2 + %load_from_global = spirv.FunctionCall @llvm_genx_raw_send2_v64f32_i1_v4i64(%uchar_0, %uchar_0, %true, %uchar_1, %uchar_16, %uchar_15, %uint_0,%uint_38335872, %0, %zero_vector) : (i8, i8, i1, i8, i8, i8, i32, i32, vector<4xi64>, vector<64xf32>) -> vector<64xf32> + + // store A in B (global) + spirv.FunctionCall @llvm_genx_lsc_store_stateless_i1_i64_v64f32(%true, %uchar_4, %uchar_0, %uchar_0, %ushort_1, %uint_0, %uchar_3, %uchar_8, %uchar_2, %uchar_0, %arg_2, %load_from_global, %uint_0) : (i1, i8, i8, i8, i16, i32, i8, i8, i8, i8, i64, vector<64 x f32>, i32) -> () // -> mlir::NoneType + spirv.Return + } + + spirv.EntryPoint "Kernel" @load_1d_raw_send_kernel + spirv.ExecutionMode @load_1d_raw_send_kernel "ContractionOff" + spirv.ExecutionMode @load_1d_raw_send_kernel "SharedLocalMemorySizeINTEL", 0 + // Utility function declarations (Intel vc-intrinsics) + + spirv.func @llvm_genx_address_convert_i64_p1i8(%arg: !spirv.ptr) -> i64 "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.address.convert.i64.p1i8", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_store_stateless_i1_i64_v64f32(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i16, %arg5 : i32, %arg6 : i8, %arg7 : i8, %arg8 : i8, %arg9 : i8, %arg10 : i64, %arg11 : vector<64 x f32>, %arg12 : i32) "None" attributes{ + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.store.stateless.i1.i64.v64f32", + linkage_type= + >, + VectorComputeFunctionINTEL} + + + spirv.func @llvm_genx_raw_send2_v64f32_i1_v4i64(%arg0 : i8, %arg1 : i8, %arg2 : i1, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : vector<4xi64>, %arg9 : vector<64xf32>) -> vector<64xf32> "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + 
linkage_name="llvm.genx.raw.send2.v64f32.i1.v4i64", + linkage_type= + >, + VectorComputeFunctionINTEL} + } + + // GPU module, almost same as the SPIR-V module but without 'spirv' dialect specific properties + gpu.module @load_1d_raw_send_module { + gpu.func @load_1d_raw_send_kernel(%arg1: memref<256xi8>, %arg2: memref<256xi8>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.return + } + } + + func.func @load_1d_raw_send_test(%arg_sys_dpth: index, %arg_rpt_cnt: index, %arg_N: index){ + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.100000e+00 : f32 + %cst_2 = arith.constant 2.200000e+00 : f32 + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %memref_A = gpu.alloc host_shared () :memref<256xi8> + %memref_A_i8 = memref.view %memref_A[%c0][] : memref<256xi8> to memref<64xf32> + %memref_A_i8_1D = memref.cast %memref_A_i8 : memref<64xf32> to memref + // Initialize it to 2.2 + call @fillResource1DF32(%memref_A_i8_1D, %cst_2) : (memref, f32) -> () + + %memref_B_i8 = gpu.alloc host_shared () : memref<256xi8> + %memref_B = memref.view %memref_B_i8[%c0][] : memref<256xi8> to memref<64xf32> + %memref_B_1D = memref.cast %memref_B : memref<64xf32> to memref + call @fillResource1DF32(%memref_B_1D, %cst_1) : (memref, f32) -> () + + call @load_1d_raw_send_gpu(%memref_A, %memref_B_i8) : (memref<256xi8>, memref<256xi8>) -> () + + // Print the result + %result = memref.cast %memref_B : memref<64xf32> to memref<*xf32> + call @printMemrefF32(%result) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}} + // CHECK-COUNT-64: 2.2 + return + } + + // main function + func.func @main() { + %cst_sys_dpth = arith.constant 8 : index + %cst_rpt_cnt = arith.constant 4 : index + %cst_N = arith.constant 16 : index + + call @load_1d_raw_send_test(%cst_sys_dpth, %cst_rpt_cnt, %cst_N) : (index, index, index) -> () + return + } + + // Helper functions + func.func private @fillResource1DF16(memref, f32) 
attributes {llvm.emit_c_interface} + func.func private @fillResource1DF32(memref, f32) attributes {llvm.emit_c_interface} + func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} + +} diff --git a/test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir b/test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir new file mode 100644 index 000000000..a444f7d21 --- /dev/null +++ b/test/SPIRV/IntelVectorExtension/Load_1d_slm.mlir @@ -0,0 +1,150 @@ +// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck + + +module attributes {gpu.container_module} { + + // function to setup the launch and launch the kernel + func.func @lsc_load_1d_slm_gpu(%arg_A : memref<256xi8>, %arg_B : memref<256xi8>) { + %c1 = arith.constant 1 : index + + // 1 workgroup and, 1 thread per workgroup + gpu.launch_func @lsc_load_1d_slm_module::@lsc_load_1d_slm_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%arg_A : memref<256xi8>, %arg_B : memref<256xi8>) + return + } + + spirv.module @__spv__lsc_load_1d_slm_module Physical64 OpenCL requires #spirv.vce attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + + spirv.func @lsc_load_1d_slm_kernel(%arg1: !spirv.ptr, %arg2: !spirv.ptr) "DontInline" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, 
workgroup_attributions = 0 : i64, VectorComputeFunctionINTEL} { + %uchar_0 = spirv.Constant 0 : i8 + %ushort_1 = spirv.Constant 1 : i16 + %uint_0 = spirv.Constant 0 : i32 + %uchar_3 = spirv.Constant 3 : i8 + %uchar_8 = spirv.Constant 8 : i8 + %uchar_2 = spirv.Constant 2 : i8 + %uchar_4 = spirv.Constant 4 : i8 + %uchar_7 = spirv.Constant 7 : i8 + %uint_9 = spirv.Constant 9 : i32 + %uint_8 = spirv.Constant 8 : i32 + %uint_4 = spirv.Constant 4 : i32 + %true = spirv.Constant true + + // Cast the uchar pointers (i8 ptr) to ulongs (i64) + %arg_1 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg1) : (!spirv.ptr) -> i64 + %arg_2 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg2) : (!spirv.ptr) -> i64 + + // Load A from global + %load_from_global = spirv.FunctionCall @llvm_genx_lsc_load_stateless_v64f32_i1_i64(%true, %uchar_0, %uchar_0, %uchar_0, %ushort_1, %uint_0, %uchar_3, %uchar_8, %uchar_2, %uchar_0, %arg_1, %uint_0) : (i1, i8, i8, i8, i16, i32, i8, i8, i8, i8, i64, i32) -> vector<64 x f32> + + // store A in (slm) + spirv.FunctionCall @llvm_genx_lsc_store_slm_i1_i64_v64f32(%true, %uchar_4, %uchar_0, %uchar_0, %ushort_1, %uint_0, %uchar_3, %uchar_8, %uchar_2, %uchar_0, %uint_0, %load_from_global, %uint_0) : (i1, i8, i8, i8, i16, i32, i8, i8, i8, i8, i32, vector<64 x f32>, i32) -> () // -> mlir::NoneType + // For SLM we need to only pass offset. We dont need to pass pointer. Need to double check. 
+ + // Load A from slm + %load_from_slm = spirv.FunctionCall @llvm_genx_lsc_load_slm_v64f32_i1_i64(%true, %uchar_0, %uchar_0, %uchar_0, %ushort_1, %uint_0, %uchar_3, %uchar_8, %uchar_2, %uchar_0, %uint_0, %uint_0) : (i1, i8, i8, i8, i16, i32, i8, i8, i8, i8, i32, i32) -> vector<64xf32> + + // store A in B (global) + spirv.FunctionCall @llvm_genx_lsc_store_stateless_i1_i64_v64f32(%true, %uchar_4, %uchar_0, %uchar_0, %ushort_1, %uint_0, %uchar_3, %uchar_8, %uchar_2, %uchar_0, %arg_2, %load_from_slm, %uint_0) : (i1, i8, i8, i8, i16, i32, i8, i8, i8, i8, i64, vector<64 x f32>, i32) -> () // -> mlir::NoneType + spirv.Return + } + + spirv.EntryPoint "Kernel" @lsc_load_1d_slm_kernel + spirv.ExecutionMode @lsc_load_1d_slm_kernel "ContractionOff" + spirv.ExecutionMode @lsc_load_1d_slm_kernel "SharedLocalMemorySizeINTEL", 2048 + // Utility function declarations (Intel vc-intrinsics) + + spirv.func @llvm_genx_address_convert_i64_p1i8(%arg: !spirv.ptr) -> i64 "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.address.convert.i64.p1i8", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_load_slm_v64f32_i1_i64(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i16, %arg5 : i32, %arg6 : i8, %arg7 : i8, %arg8 : i8, %arg9 : i8, %arg10 : i32, %arg11 : i32) -> vector<64 x f32> "Const" attributes{ + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.load.slm.v64f32.i1.i64", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_load_stateless_v64f32_i1_i64(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i16, %arg5 : i32, %arg6 : i8, %arg7 : i8, %arg8 : i8, %arg9 : i8, %arg10 : i64, %arg11 : i32) -> vector<64 x f32> "Const" attributes{ + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.load.stateless.v64f32.i1.i64", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_store_slm_i1_i64_v64f32(%arg0 
: i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i16, %arg5 : i32, %arg6 : i8, %arg7 : i8, %arg8 : i8, %arg9 : i8, %arg10 : i32, %arg11 : vector<64 x f32>, %arg12 : i32) "None" attributes{ + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.store.slm.i1.i64.v64f32", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_store_stateless_i1_i64_v64f32(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i16, %arg5 : i32, %arg6 : i8, %arg7 : i8, %arg8 : i8, %arg9 : i8, %arg10 : i64, %arg11 : vector<64 x f32>, %arg12 : i32) "None" attributes{ + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.store.stateless.i1.i64.v64f32", + linkage_type= + >, + VectorComputeFunctionINTEL} + } + + // GPU module, almost same as the SPIR-V module but without 'spirv' dialect specific properties + gpu.module @lsc_load_1d_slm_module { + gpu.func @lsc_load_1d_slm_kernel(%arg1: memref<256xi8>, %arg2: memref<256xi8>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.return + } + } + + func.func @lsc_load_1d_slm_test(%arg_sys_dpth: index, %arg_rpt_cnt: index, %arg_N: index){ + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.100000e+00 : f32 + %cst_2 = arith.constant 2.200000e+00 : f32 + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + %memref_A = gpu.alloc host_shared () :memref<256xi8> + %memref_A_i8 = memref.view %memref_A[%c0][] : memref<256xi8> to memref<64xf32> + %memref_A_i8_1D = memref.cast %memref_A_i8 : memref<64xf32> to memref + // Initialize it to 2.2 + call @fillResource1DF32(%memref_A_i8_1D, %cst_2) : (memref, f32) -> () + + %memref_B_i8 = gpu.alloc host_shared () : memref<256xi8> + %memref_B = memref.view %memref_B_i8[%c0][] : memref<256xi8> to memref<64xf32> + %memref_B_1D = memref.cast %memref_B : memref<64xf32> to memref + call @fillResource1DF32(%memref_B_1D, %cst_1) : (memref, f32) -> () + + call 
@lsc_load_1d_slm_gpu(%memref_A, %memref_B_i8) : (memref<256xi8>, memref<256xi8>) -> () + + // Print the result + %result = memref.cast %memref_B : memref<64xf32> to memref<*xf32> + call @printMemrefF32(%result) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = {{(0x)?[0-9a-f]*}} + // CHECK-COUNT-64: 2.2 + return + } + + // main function + func.func @main() { + %cst_sys_dpth = arith.constant 8 : index + %cst_rpt_cnt = arith.constant 4 : index + %cst_N = arith.constant 16 : index + + call @lsc_load_1d_slm_test(%cst_sys_dpth, %cst_rpt_cnt, %cst_N) : (index, index, index) -> () + return + } + + // Helper functions + func.func private @fillResource1DF16(memref, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DF32(memref, f32) attributes {llvm.emit_c_interface} + func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} + +} diff --git a/test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir b/test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir new file mode 100644 index 000000000..1d00deb06 --- /dev/null +++ b/test/SPIRV/IntelVectorExtension/Load_2d_raw_send.mlir @@ -0,0 +1,201 @@ +// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck + +/// A simple load2d/store2d example +/// This example loads and stores 16x16xf32 elements using raw_send2/store2d + +module attributes 
{gpu.container_module} { + + func.func @load_store_2d_gpu(%arg_In : memref<1024xi8>, %arg_Out : memref<1024xi8>) { + %c1 = arith.constant 1 : index + + // 1 workgroup and, 1 thread per workgroup + gpu.launch_func @load_store_2d_module::@load_store_2d_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%arg_In : memref<1024xi8>, %arg_Out : memref<1024xi8>) + return + } + + // SPIR-V module, it holds the kernel + spirv.module @__spv__load_store_2d_module Physical64 OpenCL requires #spirv.vce attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // load_store_2d kernel + spirv.func @load_store_2d_kernel(%arg0: !spirv.ptr, %arg1: !spirv.ptr) "DontInline" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64, VectorComputeFunctionINTEL} { + + %true = spirv.Constant true + + %uchar_0 = spirv.Constant 0 : i8 + %uchar_1 = spirv.Constant 1 : i8 + %uchar_2 = spirv.Constant 2 : i8 + %uchar_3 = spirv.Constant 3 : i8 + %uchar_4 = spirv.Constant 4 : i8 + %uchar_7 = spirv.Constant 7 : i8 + %uchar_8 = spirv.Constant 8 : i8 + %uchar_9 = spirv.Constant 9 : i8 + %uchar_10 = spirv.Constant 10 : i8 + %uchar_15 = spirv.Constant 15 : i8 + %uchar_16 = spirv.Constant 16 : i8 + + %uint_0 = spirv.Constant 0 : i32 + %uint_1 = spirv.Constant 1 : i32 + %uint_2 = spirv.Constant 2 : i32 + %uint_3 = spirv.Constant 3 : i32 + %uint_4 = spirv.Constant 4 : i32 + %uint_5 = spirv.Constant 5 : i32 + %uint_6 = spirv.Constant 6 : i32 + %uint_7 = spirv.Constant 7 : i32 + %uint_8 = spirv.Constant 8 : i32 + %uint_9 = spirv.Constant 9 : i32 + %uint_15 = spirv.Constant 15 : i32 + %uint_16 = spirv.Constant 16 : i32 + %uint_31 = spirv.Constant 31 : i32 + %uint_32 = spirv.Constant 32 : i32 + %uint_63 = spirv.Constant 63 : i32 + %uint_64 = spirv.Constant 64 : i32 + %uint_255 = spirv.Constant 255 : i32 + %uint_256 = spirv.Constant 256 : i32 + %uint_3855 = 
spirv.Constant 3855 : i32 + + // 0xFFFFFFFF (i64 mask selecting the low 32 bits of an address) + %ulong_4294967295 = spirv.Constant 4294967295 : i64 + + + // Load Message descriptor : 0 00 0001 10000 000 0 0 000 010 0 0 0 000011 + // https://gfxspecs.intel.com/Predator/Home/Index/53680 + %uint_50332675 = spirv.Constant 50332675 : i32 + + %ulong_32 = spirv.Constant 32 : i64 + %zero_vector = spirv.Constant dense<0.0> : vector<256xf32> + %addr_payload_vector = spirv.Constant dense<[0,0,0,0,0,0,0,0]> : vector<8xi32> + + + // Cast the uchar pointers (i8 ptr) to ulongs (i64) + %arg_0 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg0) : (!spirv.ptr) -> i64 + %arg_1 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg1) : (!spirv.ptr) -> i64 + + + // --------------- LOAD USING RAW SEND ------------------- + %addr_payload_msb_32_or = spirv.ShiftRightLogical %arg_0, %ulong_32 : i64, i64 + %addr_payload_msb_32 = spirv.UConvert %addr_payload_msb_32_or : i64 to i32 + %addr_payload_lsb_and = spirv.BitwiseAnd %arg_0, %ulong_4294967295 : i64 + %addr_payload_lsb_32 = spirv.UConvert %addr_payload_lsb_and : i64 to i32 + + + %0 = spirv.VectorInsertDynamic %addr_payload_lsb_32, %addr_payload_vector[%uint_0] : vector<8xi32>, i32 + %1 = spirv.VectorInsertDynamic %addr_payload_msb_32, %0[%uint_1] : vector<8xi32>, i32 + %2 = spirv.VectorInsertDynamic %uint_63, %1[%uint_2] : vector<8xi32>, i32 + %3 = spirv.VectorInsertDynamic %uint_15, %2[%uint_3] : vector<8xi32>, i32 + %4 = spirv.VectorInsertDynamic %uint_63, %3[%uint_4] : vector<8xi32>, i32 + %5 = spirv.VectorInsertDynamic %uint_0, %4[%uint_5] : vector<8xi32>, i32 + %6 = spirv.VectorInsertDynamic %uint_0, %5[%uint_6] : vector<8xi32>, i32 + %7 = spirv.VectorInsertDynamic %uint_3855, %6[%uint_7] : vector<8xi32>, i32 + + + // https://gfxspecs.intel.com/Predator/Home/Index/53567 + // %addr_payload = vector<8xi32> : spirv.vector<8xi32> + // vector[0] = %addr_payload_lsb_32 + // vector[1] = %addr_payload_msb_32 + // vector[2] = 63 (bytes) ..width is 16*32 = 512/8 = 64 - 1 = 
63 + // vector[3] = 16 (height in number of elements) - 1 = 15 + // vector[4] = pitch = 63. distance between two rows in number of bytes. + // vector[5] = block start X = 0; + // vector[6] = block start Y = 0; + // vector[7] = block width 224:231 (15), block height 232:239 (15), array_length 240:243 (0) = 0000 00001111 00001111 = 3855 + + + // Load data from the input using raw_send2 + %input = spirv.FunctionCall @llvm_genx_raw_send2_v256f32_i1_v8i32(%uchar_0, %uchar_1, %true, %uchar_1, %uchar_16, %uchar_15, %uint_0, %uint_50332675, %7, %zero_vector) : (i8, i8, i1, i8, i8, i8, i32, i32, vector<8xi32>, vector<256xf32>) -> vector<256xf32> + + // Store the result + spirv.FunctionCall @llvm_genx_lsc_store2d_stateless_i1_i64_v256f32(%true, %uchar_0, %uchar_0, %uchar_3, %uchar_1, %uchar_1, %uint_16, %uint_16, %uchar_0, %arg_1, %uint_63, %uint_15, %uint_63, %uint_0, %uint_0, %input) : (i1, i8, i8, i8, i8, i8, i32, i32, i8, i64, i32, i32, i32, i32, i32, vector<256xf32>) -> () + + spirv.Return + } + spirv.EntryPoint "Kernel" @load_store_2d_kernel + spirv.ExecutionMode @load_store_2d_kernel "ContractionOff" + spirv.ExecutionMode @load_store_2d_kernel "SharedLocalMemorySizeINTEL", 0 + // Utility function declarations (Intel vc-intrinsics) + spirv.func @llvm_genx_address_convert_i64_p1i8(%arg: !spirv.ptr) -> i64 "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.address.convert.i64.p1i8", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_raw_send2_v256f32_i1_v8i32(%arg0 : i8, %arg1 : i8, %arg2 : i1, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : vector<8xi32>, %arg9 : vector<256xf32>) -> vector<256xf32> "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.raw.send2.v256f32.i1.v8i32", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_store2d_stateless_i1_i64_v256f32(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, 
%arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : i8, %arg9 : i64, %arg10 : i32, %arg11 : i32, %arg12 : i32, %arg13 : i32, %arg14 : i32, %arg15 : vector<256xf32>) "None" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.store2d.stateless.i1.i64.v256f32", + linkage_type= + >, + VectorComputeFunctionINTEL} + + } + + // GPU module, almost same as the SPIR-V module but without 'spirv' dialect specific properties + gpu.module @load_store_2d_module { + gpu.func @load_store_2d_kernel(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.return + } + } + + func.func @load_store_2d_test(){ + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.100000e+00 : f32 + %cst_2 = arith.constant 2.200000e+00 : f32 + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + + // Allocate Inputs and Outputs to be passed to function + + %memref_In_i8 = gpu.alloc host_shared () : memref<1024xi8> + %memref_In = memref.view %memref_In_i8[%c0][] : memref<1024xi8> to memref<256xf32> + // Initialize Input + %memref_In_1D = memref.cast %memref_In : memref<256xf32> to memref + call @fillResource1DF32(%memref_In_1D, %cst_1) : (memref, f32) -> () + + // Output + %memref_Out_i8 = gpu.alloc host_shared () : memref<1024xi8> + %memref_Out = memref.view %memref_Out_i8[%c0][] : memref<1024xi8> to memref<256xf32> + // Initialize Out to 0 + %memref_Out_1D = memref.cast %memref_Out : memref<256xf32> to memref + call @fillResource1DF32(%memref_Out_1D, %cst_0) : (memref, f32) -> () + + // Calling the GPU version of load and store + call @load_store_2d_gpu(%memref_In_i8, %memref_Out_i8) : (memref<1024xi8>, memref<1024xi8>) -> () + + // Print the result + %result = memref.cast %memref_Out_1D : memref to memref<*xf32> + call @printMemrefF32(%result) : (memref<*xf32>) -> () + // CHECK: Unranked Memref 
base@ = {{(0x)?[0-9a-f]*}} + // CHECK-COUNT-256: 1.1 + return + } + + // main function + func.func @main() { + call @load_store_2d_test() : () -> () + return + } + + // Helper functions + func.func private @fillResource1DBF16(memref, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DF16(memref, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DF32(memref, f32) attributes {llvm.emit_c_interface} + func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} + +} diff --git a/test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir b/test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir new file mode 100644 index 000000000..370c4e7fc --- /dev/null +++ b/test/SPIRV/IntelVectorExtension/Store2d_raw_send.mlir @@ -0,0 +1,192 @@ +// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck +// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/spirv-to-llvm.pp \ +// RUN: --runner imex-cpu-runner -e main \ +// RUN: --entry-point-result=void \ +// RUN: --shared-libs=%mlir_runner_utils,%irunner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck + +/// A simple load2d/store2d example +/// This example loads and stores 16x16xf32 elements using load2d/raw_sends2 + +module attributes {gpu.container_module} { + + func.func @load_store_2d_gpu(%arg_In : memref<1024xi8>, %arg_Out : memref<1024xi8>) { + %c1 = arith.constant 1 : index + + // 1 workgroup and, 1 thread per workgroup + gpu.launch_func @load_store_2d_module::@load_store_2d_kernel blocks in (%c1, %c1, %c1) 
threads in (%c1, %c1, %c1) args(%arg_In : memref<1024xi8>, %arg_Out : memref<1024xi8>) + return + } + + // SPIR-V module, it holds the kernel + spirv.module @__spv__load_store_2d_module Physical64 OpenCL requires #spirv.vce attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // load_store_2d kernel + spirv.func @load_store_2d_kernel(%arg0: !spirv.ptr, %arg1: !spirv.ptr) "DontInline" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64, VectorComputeFunctionINTEL} { + + %true = spirv.Constant true + + %uchar_0 = spirv.Constant 0 : i8 + %uchar_1 = spirv.Constant 1 : i8 + %uchar_2 = spirv.Constant 2 : i8 + %uchar_3 = spirv.Constant 3 : i8 + %uchar_4 = spirv.Constant 4 : i8 + %uchar_7 = spirv.Constant 7 : i8 + %uchar_8 = spirv.Constant 8 : i8 + %uchar_9 = spirv.Constant 9 : i8 + %uchar_10 = spirv.Constant 10 : i8 + %uchar_15 = spirv.Constant 15 : i8 + %uchar_16 = spirv.Constant 16 : i8 + + %uint_0 = spirv.Constant 0 : i32 + %uint_1 = spirv.Constant 1 : i32 + %uint_2 = spirv.Constant 2 : i32 + %uint_3 = spirv.Constant 3 : i32 + %uint_4 = spirv.Constant 4 : i32 + %uint_5 = spirv.Constant 5 : i32 + %uint_6 = spirv.Constant 6 : i32 + %uint_7 = spirv.Constant 7 : i32 + %uint_8 = spirv.Constant 8 : i32 + %uint_9 = spirv.Constant 9 : i32 + %uint_15 = spirv.Constant 15 : i32 + %uint_16 = spirv.Constant 16 : i32 + %uint_31 = spirv.Constant 31 : i32 + %uint_32 = spirv.Constant 32 : i32 + %uint_63 = spirv.Constant 63 : i32 + %uint_64 = spirv.Constant 64 : i32 + %uint_255 = spirv.Constant 255 : i32 + %uint_256 = spirv.Constant 256 : i32 + %uint_3855 = spirv.Constant 3855 : i32 + %ulong_4294967295 = spirv.Constant 4294967295 : i64 // 0xFFFFFFFF + + // Store Message descriptor : 0 0001 00000 000 00000 010 000 000111 + // https://gfxspecs.intel.com/Predator/Home/Index/53530 + %uint_33555463 = spirv.Constant 33555463 : i32 + %zero_vector = 
spirv.Constant dense<0.0> : vector<256xf32> + %ulong_32 = spirv.Constant 32 : i64 + + %addr_payload_vector_store = spirv.Constant dense<[0,0,0,0,0,0,0,0]> : vector<8xi32> + + // Cast the uchar pointers (i8 ptr) to ulongs (i64) + %arg_0 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg0) : (!spirv.ptr) -> i64 + %arg_1 = spirv.FunctionCall @llvm_genx_address_convert_i64_p1i8(%arg1) : (!spirv.ptr) -> i64 + + // --------------- STORE USING RAW SEND ------------------- + // STORE: Extract the LSB and MSB of the address and convert them to 32 bits + %addr_payload_msb_32_or = spirv.ShiftRightLogical %arg_1, %ulong_32 : i64, i64 + %addr_payload_msb_32 = spirv.UConvert %addr_payload_msb_32_or : i64 to i32 + %addr_payload_lsb_and = spirv.BitwiseAnd %arg_1, %ulong_4294967295 : i64 + %addr_payload_lsb_32 = spirv.UConvert %addr_payload_lsb_and : i64 to i32 + + // https://gfxspecs.intel.com/Predator/Home/Index/53567 + // For storing a vector<16x16> f32 + // %addr_payload = vector<8xi32> + // vector[0] = %addr_payload_lsb_32 + // vector[1] = %addr_payload_msb_32 + // vector[2] = 63 (bytes) ..width is 16*32 = 512/8 = 64 - 1 = 63 + // vector[3] = 16 (height in number of elements) - 1 = 15 + // vector[4] = pitch = 63. 
distance between two rows in number of bytes + // vector[5] = block start X = 0; + // vector[6] = block start Y = 0; + // vector[7] = block width 224:231 (15), block height 232:239 (15), array_length 240:243 (0) = 0000 00001111 00001111 = 3855 + %0 = spirv.VectorInsertDynamic %addr_payload_lsb_32, %addr_payload_vector_store[%uint_0] : vector<8xi32>, i32 + %1 = spirv.VectorInsertDynamic %addr_payload_msb_32, %0[%uint_1] : vector<8xi32>, i32 + %2 = spirv.VectorInsertDynamic %uint_63, %1[%uint_2] : vector<8xi32>, i32 + %3 = spirv.VectorInsertDynamic %uint_15, %2[%uint_3] : vector<8xi32>, i32 + %4 = spirv.VectorInsertDynamic %uint_63, %3[%uint_4] : vector<8xi32>, i32 + %5 = spirv.VectorInsertDynamic %uint_0, %4[%uint_5] : vector<8xi32>, i32 + %6 = spirv.VectorInsertDynamic %uint_0, %5[%uint_6] : vector<8xi32>, i32 + %7 = spirv.VectorInsertDynamic %uint_3855, %6[%uint_7] : vector<8xi32>, i32 + + // Load data from the input + %input = spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v256f32_i1_i64(%true, %uchar_2, %uchar_2, %uchar_3, %uchar_1, %uchar_1, %uint_16, %uint_16, %uchar_0, %arg_0, %uint_63, %uint_15, %uint_63, %uint_0, %uint_0) : (i1, i8, i8, i8, i8, i8, i32, i32, i8, i64, i32, i32, i32, i32, i32) -> vector<256xf32> + + // Store using raw_sends2_noresult + spirv.FunctionCall @llvm_genx_raw_sends2_noresult_i1_v8i32_v256f32(%uchar_0, %uchar_0, %true, %uchar_1, %uchar_16, %uchar_15, %uint_0, %uint_33555463, %7, %input) : (i8, i8, i1, i8, i8, i8, i32, i32, vector<8xi32>, vector<256xf32>) -> () + spirv.Return + } + spirv.EntryPoint "Kernel" @load_store_2d_kernel + spirv.ExecutionMode @load_store_2d_kernel "ContractionOff" + spirv.ExecutionMode @load_store_2d_kernel "SharedLocalMemorySizeINTEL", 0 + // Utility function declarations (Intel vc-intrinsics) + spirv.func @llvm_genx_address_convert_i64_p1i8(%arg: !spirv.ptr) -> i64 "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.address.convert.i64.p1i8", + linkage_type= + >, 
+ VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_lsc_load2d_stateless_v256f32_i1_i64(%arg0 : i1, %arg1 : i8, %arg2 : i8, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : i8, %arg9 : i64, %arg10 : i32, %arg11 : i32, %arg12 : i32, %arg13 : i32, %arg14 : i32) -> vector<256xf32> "Pure" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.lsc.load2d.stateless.v256f32.i1.i64", + linkage_type= + >, + VectorComputeFunctionINTEL} + + spirv.func @llvm_genx_raw_sends2_noresult_i1_v8i32_v256f32(%arg0 : i8, %arg1 : i8, %arg2 : i1, %arg3 : i8, %arg4 : i8, %arg5 : i8, %arg6 : i32, %arg7 : i32, %arg8 : vector<8xi32>, %arg9 : vector<256xf32>) "None" attributes { + linkage_attributes=#spirv.linkage_attributes< + linkage_name="llvm.genx.raw.sends2.noresult.i1.v8i32.v256f32", + linkage_type= + >, + VectorComputeFunctionINTEL} + } + + // GPU module, almost same as the SPIR-V module but without 'spirv' dialect specific properties + gpu.module @load_store_2d_module { + gpu.func @load_store_2d_kernel(%arg0: memref<1024xi8>, %arg1: memref<1024xi8>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.return + } + } + + func.func @load_store_2d_test(){ + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.100000e+00 : f32 + %cst_2 = arith.constant 2.200000e+00 : f32 + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + + // Allocate Inputs and Outputs to be passed to function + + %memref_In_i8 = gpu.alloc host_shared () : memref<1024xi8> + %memref_In = memref.view %memref_In_i8[%c0][] : memref<1024xi8> to memref<256xf32> + // Initialize Input + %memref_In_1D = memref.cast %memref_In : memref<256xf32> to memref + call @fillResource1DF32(%memref_In_1D, %cst_1) : (memref, f32) -> () + + // Output + %memref_Out_i8 = gpu.alloc host_shared () : memref<1024xi8> + %memref_Out = memref.view %memref_Out_i8[%c0][] : 
memref<1024xi8> to memref<256xf32> + // Initialize Out to 0 + %memref_Out_1D = memref.cast %memref_Out : memref<256xf32> to memref + call @fillResource1DF32(%memref_Out_1D, %cst_0) : (memref, f32) -> () + + // Calling the GPU version of load and store + call @load_store_2d_gpu(%memref_In_i8, %memref_Out_i8) : (memref<1024xi8>, memref<1024xi8>) -> () + + // Print the result + %result = memref.cast %memref_Out_1D : memref to memref<*xf32> + call @printMemrefF32(%result) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = {{(0x)?[0-9a-f]*}} + // CHECK-COUNT-256: 1.1 + return + } + + // main function + func.func @main() { + call @load_store_2d_test() : () -> () + return + } + + // Helper functions + func.func private @fillResource1DBF16(memref, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DF16(memref, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DF32(memref, f32) attributes {llvm.emit_c_interface} + func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} + func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} + +}