From 66acb2694655321b37a1ee3ff19207a111756562 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 7 Jan 2025 12:01:31 +0200 Subject: [PATCH] [clang][CodeGen][SPIRV] Translate `amdgpu_flat_work_group_size` into `max_work_group_size`. (#116820) HIPAMD relies on the `amdgpu_flat_work_group_size` attribute to implement key functionality such as the `__launch_bounds__` `__global__` function annotation. This attribute is not available / directly translatable to SPIR-V, hence as it is AMDGCN flavoured SPIR-V suffers from information loss. This patch addresses that limitation by converting the unsupported attribute into the `max_work_group_size` attribute which maps to [`MaxWorkgroupSizeINTEL`](https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_kernel_attributes.asciidoc), which is available in / handled by SPIR-V. When reverse translating from SPIR-V to AMDGCN LLVMIR we invert the map and add the original AMDGPU attribute. --- clang/lib/CodeGen/Targets/SPIR.cpp | 37 ++++++++++++++++++ .../amdgpu-kernel-arg-pointer-type.cu | 38 ++++++++++--------- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu | 7 ++++ 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index a48fe9d5f1ee9c..5c75e985e953de 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -64,6 +64,8 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; LangAS getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const override; + void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, + CodeGen::CodeGenModule &M) const override; llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts, SyncScope Scope, llvm::AtomicOrdering Ordering, @@ -245,6 +247,41 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, return DefaultGlobalAS; } +void SPIRVTargetCodeGenInfo::setTargetAttributes( + const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { + if (!M.getLangOpts().HIP || + M.getTarget().getTriple().getVendor() != llvm::Triple::AMD) + return; + if (GV->isDeclaration()) + return; + + auto F = dyn_cast(GV); + if (!F) + return; + + auto FD = dyn_cast_or_null(D); + if (!FD) + return; + if (!FD->hasAttr()) + return; + + unsigned N = M.getLangOpts().GPUMaxThreadsPerBlock; + if (auto FlatWGS = FD->getAttr()) + N = FlatWGS->getMax()->EvaluateKnownConstInt(M.getContext()).getExtValue(); + + // We encode the maximum flat WG size in the first component of the 3D + // max_work_group_size attribute, which will get reverse translated into the + // original AMDGPU attribute when targeting AMDGPU. + auto Int32Ty = llvm::IntegerType::getInt32Ty(M.getLLVMContext()); + llvm::Metadata *AttrMDArgs[] = { + llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, N)), + llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)), + llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))}; + + F->setMetadata("max_work_group_size", + llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs)); +} + llvm::SyncScope::ID SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope, llvm::AtomicOrdering, diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu index fab87aac18310a..19730e3925515e 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -33,7 +33,7 @@ // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi( -// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 @@ -58,7 +58,7 @@ // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi( -// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -102,7 +102,7 @@ __global__ void kernel1(int *x) { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri( -// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 @@ -126,7 +126,7 @@ __global__ void kernel1(int *x) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri( -// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -171,7 +171,7 @@ __global__ void kernel2(int &x) { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i( -// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8 // CHECK-SPIRV-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -195,7 +195,7 @@ __global__ void kernel2(int &x) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i( -// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] { +// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4 // OPT-SPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4 @@ -302,7 +302,7 @@ struct S { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S( -// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8 // CHECK-SPIRV-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4) @@ -343,7 +343,7 @@ struct S { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S( -// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1 @@ -406,7 +406,7 @@ __global__ void kernel4(struct S s) { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S( -// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[S:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8 @@ -432,7 +432,7 @@ __global__ void kernel4(struct S s) { // CHECK-SPIRV-NEXT: ret void // // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S( -// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8 // OPT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -446,7 +446,7 @@ __global__ void kernel4(struct S s) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S( -// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -511,7 +511,7 @@ struct T { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T( -// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 8 // CHECK-SPIRV-NEXT: [[T1:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4) @@ -551,7 +551,7 @@ struct T { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T( -// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0 // OPT-SPIRV-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x ptr addrspace(4)] [[TMP0]], 0 @@ -606,7 +606,7 @@ __global__ void kernel6(struct T t) { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi( -// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 @@ -631,7 +631,7 @@ __global__ void kernel6(struct T t) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi( -// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -677,7 +677,7 @@ struct SS { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS( -// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[A:%.*]] = alloca [[STRUCT_SS]], align 8 // CHECK-SPIRV-NEXT: [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) @@ -700,7 +700,7 @@ struct SS { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS( -// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4 @@ -727,5 +727,9 @@ __global__ void kernel8(struct SS a) { *a.x += 3.f; } //. +// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1} +//. // OPT: [[META4]] = !{} //. +// OPT-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1} +//. diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu index 11a133fd1351d2..253ac0898f5462 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -4,6 +4,9 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \ // RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa --gpu-max-threads-per-block=1024 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=CHECK-SPIRV,MAX1024-SPIRV %s // RUN: %clang_cc1 -triple nvptx \ // RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \ // RUN: -check-prefix=NAMD @@ -21,12 +24,14 @@ __global__ void flat_work_group_size_default() { // CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]] +// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z28flat_work_group_size_defaultv(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_DEFAULT:![0-9]+]] // NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]] } __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics __global__ void flat_work_group_size_32_64() { // CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]] +// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z26flat_work_group_size_32_64v(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_64:![0-9]+]] } __attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics __global__ void waves_per_eu_2() { @@ -82,7 +87,9 @@ template __global__ void template_32_4_a_max_num_work_groups<2>(); // DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true" // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" +// MAX1024-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_DEFAULT]] = !{i32 1024, i32 1, i32 1} // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" +// CHECK-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_64]] = !{i32 64, i32 1, i32 1} // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32" // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"