From 72a794ab179c588b54ef9a42d335ea7776b7bea5 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Thu, 6 Mar 2025 14:17:27 +0000
Subject: [PATCH] [LV] Fix runtime-VF logic when generating RT-checks

GeneratedRTChecks::create has a bug when calling addDiffRuntimeChecks:
the GetVF lambda persists the value of RuntimeVF from a previous call,
which results in a smaller runtime VF being returned in some cases. Fix
the bug, stripping a FIXME in a test.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 26 +++++++--------
 .../sve-runtime-check-size-based-threshold.ll | 27 ++++++++++------
 ...ize-force-tail-with-evl-call-intrinsics.ll | 32 ++++++++++++-------
 3 files changed, 48 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb860a472d8f7..5fe6551c3f8e2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1924,21 +1924,17 @@ class GeneratedRTChecks {
                      "vector.memcheck");
 
     auto DiffChecks = RtPtrChecking.getDiffChecks();
-    if (DiffChecks) {
-      Value *RuntimeVF = nullptr;
-      MemRuntimeCheckCond = addDiffRuntimeChecks(
-          MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
-          [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
-            if (!RuntimeVF)
-              RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
-            return RuntimeVF;
-          },
-          IC);
-    } else {
-      MemRuntimeCheckCond = addRuntimeChecks(
-          MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
-          MemCheckExp, VectorizerParams::HoistRuntimeChecks);
-    }
+    MemRuntimeCheckCond =
+        DiffChecks
+            ? addDiffRuntimeChecks(
+                  MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
+                  [VF](IRBuilderBase &B, unsigned Bits) {
+                    return getRuntimeVF(B, B.getIntNTy(Bits), VF);
+                  },
+                  IC)
+            : addRuntimeChecks(MemCheckBlock->getTerminator(), L,
+                               RtPtrChecking.getChecks(), MemCheckExp,
+                               VectorizerParams::HoistRuntimeChecks);
     assert(MemRuntimeCheckCond &&
            "no RT checks generated although RtPtrChecking "
            "claimed checks are required");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
index feb27caf305a2..f1b54e6569afe 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
@@ -5,7 +5,6 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; Test case where the minimum profitable trip count due to runtime checks
 ; exceeds VF.getKnownMinValue() * UF.
-; FIXME: The code currently incorrectly is missing a umax(VF * UF, 28).
 define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr %src.1, ptr %src.2, i64 %n) {
 ; CHECK-LABEL: @min_trip_count_due_to_runtime_checks_1(
 ; CHECK-NEXT: entry:
@@ -16,7 +15,7 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 20, i64 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 28, i64 [[TMP1]])
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP2]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
@@ -25,21 +24,29 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[DST_21]], [[DST_12]]
 ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP18]], 16
 ; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[DST_12]], [[SRC_13]]
-; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP9]]
 ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP22]], 16
 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[DST_12]], [[SRC_25]]
-; CHECK-NEXT: [[DIFF_CHECK6:%.*]] = icmp ult i64 [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[DIFF_CHECK6:%.*]] = icmp ult i64 [[TMP10]], [[TMP13]]
 ; CHECK-NEXT: [[CONFLICT_RDX7:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK6]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP24]], 2
+; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP26]], 16
 ; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[DST_21]], [[SRC_13]]
-; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP12]], [[TMP38]]
 ; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX7]], [[DIFF_CHECK8]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
 ; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[DST_21]], [[SRC_25]]
-; CHECK-NEXT: [[DIFF_CHECK10:%.*]] = icmp ult i64 [[TMP14]], [[TMP13]]
+; CHECK-NEXT: [[DIFF_CHECK10:%.*]] = icmp ult i64 [[TMP14]], [[TMP21]]
 ; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX9]], [[DIFF_CHECK10]]
 ; CHECK-NEXT: br i1 [[CONFLICT_RDX11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
index f19e581d1c028..bd4d973046ae3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
@@ -19,7 +19,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP2]])
 ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL: [[VECTOR_MEMCHECK]]:
@@ -28,9 +28,11 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP15]], 4
+; IF-EVL-NEXT: [[TMP30:%.*]] = mul i64 [[TMP25]], 4
 ; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
-; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
+; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP30]]
 ; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
 ; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL: [[VECTOR_PH]]:
@@ -134,7 +136,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP2]])
 ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL: [[VECTOR_MEMCHECK]]:
@@ -143,9 +145,11 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP15]], 4
+; IF-EVL-NEXT: [[TMP30:%.*]] = mul i64 [[TMP25]], 4
 ; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
-; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
+; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP30]]
 ; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
 ; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL: [[VECTOR_PH]]:
@@ -249,7 +253,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP2]])
 ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL: [[VECTOR_MEMCHECK]]:
@@ -258,9 +262,11 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP15]], 4
+; IF-EVL-NEXT: [[TMP30:%.*]] = mul i64 [[TMP25]], 4
 ; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
-; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
+; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP30]]
 ; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
 ; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL: [[VECTOR_PH]]:
@@ -364,7 +370,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP2]])
 ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL: [[VECTOR_MEMCHECK]]:
@@ -373,9 +379,11 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP15]], 4
+; IF-EVL-NEXT: [[TMP30:%.*]] = mul i64 [[TMP25]], 4
 ; IF-EVL-NEXT: [[TMP26:%.*]] = sub i64 [[A1]], [[C3]]
-; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP25]]
+; IF-EVL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP26]], [[TMP30]]
 ; IF-EVL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
 ; IF-EVL-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL: [[VECTOR_PH]]:
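
Note on the pitfall being fixed: the sketch below is a minimal standalone C++ program, not LLVM code; the names, the vscale and VF values, and the bit widths are made up for illustration, and the IR Value returned by the real GetVF callback is modelled with a plain integer. It shows why caching the result of a GetVF-style callback across calls is unsafe when different calls may request different bit widths, and why recomputing per call, as the patch now does, avoids handing back a stale (and here smaller) value.

// runtime_vf_caching_sketch.cpp -- illustrative only; not LLVM code.
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for getRuntimeVF(B, B.getIntNTy(Bits), VF): the value
// vscale * VF.KnownMin is materialized in an integer type of width Bits, so a
// result computed for one width is not valid for another.
static uint64_t computeRuntimeVF(unsigned Bits, uint64_t VScale, uint64_t VFMin) {
  uint64_t Product = VScale * VFMin;
  if (Bits >= 64)
    return Product;
  return Product & ((uint64_t{1} << Bits) - 1); // wraps in a narrower type
}

int main() {
  const uint64_t VScale = 8, VFMin = 64; // vscale * VF.KnownMin = 512

  // Buggy pattern (analogous to the removed [VF, &RuntimeVF] lambda): cache
  // the first result and return it for every later request, even when a
  // different width is asked for.
  uint64_t Cached = 0;
  bool HaveCached = false;
  auto BuggyGetVF = [&](unsigned Bits) {
    if (!HaveCached) {
      Cached = computeRuntimeVF(Bits, VScale, VFMin);
      HaveCached = true;
    }
    return Cached;
  };

  // Fixed pattern (analogous to the new [VF] lambda): recompute on every
  // call, so each runtime check gets a value for the width it requested.
  auto FixedGetVF = [&](unsigned Bits) {
    return computeRuntimeVF(Bits, VScale, VFMin);
  };

  // If the first request happens to be narrow, the cached value poisons the
  // later 64-bit request and a smaller-than-intended bound is produced.
  std::cout << "buggy: " << BuggyGetVF(8) << ", " << BuggyGetVF(64) << "\n"; // 0, 0
  std::cout << "fixed: " << FixedGetVF(8) << ", " << FixedGetVF(64) << "\n"; // 0, 512
}

In the patch, removing the cached RuntimeVF corresponds to switching from BuggyGetVF to FixedGetVF: each diff check materializes its own vscale-based bound, which is what the updated CHECK lines above reflect.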