Skip to content

Commit

Permalink
[clang][CodeGen][SPIRV] Translate amdgpu_flat_work_group_size into …
Browse files Browse the repository at this point in the history
…`max_work_group_size`. (llvm#116820)

HIPAMD relies on the `amdgpu_flat_work_group_size` attribute to
implement key functionality such as the `__launch_bounds__` `__global__`
function annotation. This attribute is not available / directly
translatable to SPIR-V, hence as it is AMDGCN flavoured SPIR-V suffers
from information loss.

This patch addresses that limitation by converting the unsupported
attribute into the `max_work_group_size` attribute which maps to
[`MaxWorkgroupSizeINTEL`](https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_kernel_attributes.asciidoc),
which is available in / handled by SPIR-V. When reverse translating from
SPIR-V to AMDGCN LLVMIR we invert the map and add the original AMDGPU
attribute.
  • Loading branch information
AlexVlx authored Jan 7, 2025
1 parent 7ce15f3 commit 66acb26
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 17 deletions.
37 changes: 37 additions & 0 deletions clang/lib/CodeGen/Targets/SPIR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
const VarDecl *D) const override;
void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
CodeGen::CodeGenModule &M) const override;
llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
SyncScope Scope,
llvm::AtomicOrdering Ordering,
Expand Down Expand Up @@ -245,6 +247,41 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
return DefaultGlobalAS;
}

void SPIRVTargetCodeGenInfo::setTargetAttributes(
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
if (!M.getLangOpts().HIP ||
M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
return;
if (GV->isDeclaration())
return;

auto F = dyn_cast<llvm::Function>(GV);
if (!F)
return;

auto FD = dyn_cast_or_null<FunctionDecl>(D);
if (!FD)
return;
if (!FD->hasAttr<CUDAGlobalAttr>())
return;

unsigned N = M.getLangOpts().GPUMaxThreadsPerBlock;
if (auto FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>())
N = FlatWGS->getMax()->EvaluateKnownConstInt(M.getContext()).getExtValue();

// We encode the maximum flat WG size in the first component of the 3D
// max_work_group_size attribute, which will get reverse translated into the
// original AMDGPU attribute when targeting AMDGPU.
auto Int32Ty = llvm::IntegerType::getInt32Ty(M.getLLVMContext());
llvm::Metadata *AttrMDArgs[] = {
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, N)),
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)),
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))};

F->setMetadata("max_work_group_size",
llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs));
}

llvm::SyncScope::ID
SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope,
llvm::AtomicOrdering,
Expand Down
38 changes: 21 additions & 17 deletions clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] {
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
Expand All @@ -58,7 +58,7 @@
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] {
// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
Expand Down Expand Up @@ -102,7 +102,7 @@ __global__ void kernel1(int *x) {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
Expand All @@ -126,7 +126,7 @@ __global__ void kernel1(int *x) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
Expand Down Expand Up @@ -171,7 +171,7 @@ __global__ void kernel2(int &x) {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] {
// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8
// CHECK-SPIRV-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8
Expand All @@ -195,7 +195,7 @@ __global__ void kernel2(int &x) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] {
// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4
// OPT-SPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4
Expand Down Expand Up @@ -302,7 +302,7 @@ struct S {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8
// CHECK-SPIRV-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4)
Expand Down Expand Up @@ -343,7 +343,7 @@ struct S {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1
Expand Down Expand Up @@ -406,7 +406,7 @@ __global__ void kernel4(struct S s) {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8
Expand All @@ -432,7 +432,7 @@ __global__ void kernel4(struct S s) {
// CHECK-SPIRV-NEXT: ret void
//
// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S(
// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
// OPT-NEXT: [[ENTRY:.*:]]
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8
// OPT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
Expand All @@ -446,7 +446,7 @@ __global__ void kernel4(struct S s) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
Expand Down Expand Up @@ -511,7 +511,7 @@ struct T {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 8
// CHECK-SPIRV-NEXT: [[T1:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4)
Expand Down Expand Up @@ -551,7 +551,7 @@ struct T {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0
// OPT-SPIRV-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x ptr addrspace(4)] [[TMP0]], 0
Expand Down Expand Up @@ -606,7 +606,7 @@ __global__ void kernel6(struct T t) {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
Expand All @@ -631,7 +631,7 @@ __global__ void kernel6(struct T t) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
Expand Down Expand Up @@ -677,7 +677,7 @@ struct SS {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[A:%.*]] = alloca [[STRUCT_SS]], align 8
// CHECK-SPIRV-NEXT: [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
Expand All @@ -700,7 +700,7 @@ struct SS {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4
Expand All @@ -727,5 +727,9 @@ __global__ void kernel8(struct SS a) {
*a.x += 3.f;
}
//.
// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
//.
// OPT: [[META4]] = !{}
//.
// OPT-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
//.
7 changes: 7 additions & 0 deletions clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
// RUN: %clang_cc1 -triple spirv64-amd-amdhsa --gpu-max-threads-per-block=1024 \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=CHECK-SPIRV,MAX1024-SPIRV %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
// RUN: -check-prefix=NAMD
Expand All @@ -21,12 +24,14 @@

__global__ void flat_work_group_size_default() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z28flat_work_group_size_defaultv(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_DEFAULT:![0-9]+]]
// NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]]
}

__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
__global__ void flat_work_group_size_32_64() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z26flat_work_group_size_32_64v(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_64:![0-9]+]]
}
__attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics
__global__ void waves_per_eu_2() {
Expand Down Expand Up @@ -82,7 +87,9 @@ template __global__ void template_32_4_a_max_num_work_groups<2>();

// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
// MAX1024-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_DEFAULT]] = !{i32 1024, i32 1, i32 1}
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
// CHECK-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_64]] = !{i32 64, i32 1, i32 1}
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
Expand Down

0 comments on commit 66acb26

Please sign in to comment.