Skip to content

Commit 80bdfcd

Browse files
authored
[LoopUtils] Don't wrap in getLoopEstimatedTripCount (#129080)
getLoopEstimatedTripCount returns the trip count based on profiling data, and its documentation says that it could return 0 when the trip count is zero, but this is not the case: a valid trip count can never be zero, and it returns 0 when the unsigned ExitCount is incremented by 1 and wraps. Some callers are careful about checking for this zero value in an std::optional, but it makes for an API with footguns, as a std::optional return value indicates that a non-nullopt value would be a valid trip count. Fix this by explicitly returning std::nullopt when the return value would wrap, and strip additional checks in callers. This also fixes a minor bug in LoopVectorize.
1 parent 23a30e6 commit 80bdfcd

File tree

6 files changed

+70
-40
lines changed

6 files changed

+70
-40
lines changed

llvm/include/llvm/Transforms/Utils/LoopUtils.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,9 @@ void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
318318
/// Returns a loop's estimated trip count based on branch weight metadata.
319319
/// In addition if \p EstimatedLoopInvocationWeight is not null it is
320320
/// initialized with weight of loop's latch leading to the exit.
321-
/// Returns 0 when the count is estimated to be 0, or std::nullopt when a
322-
/// meaningful estimate can not be made.
321+
/// Returns a valid positive trip count, or std::nullopt when a meaningful
322+
/// estimate cannot be made (including when the trip count wouldn't fit in the
323+
/// result type).
323324
std::optional<unsigned>
324325
getLoopEstimatedTripCount(Loop *L,
325326
unsigned *EstimatedLoopInvocationWeight = nullptr);

llvm/lib/Transforms/Utils/LoopPeel.cpp

+11-13
Original file line numberDiff line numberDiff line change
@@ -640,20 +640,18 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
640640
LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
641641
<< *EstimatedTripCount << "\n");
642642

643-
if (*EstimatedTripCount) {
644-
if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
645-
unsigned PeelCount = *EstimatedTripCount;
646-
LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
647-
PP.PeelCount = PeelCount;
648-
return;
649-
}
650-
LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
651-
LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
652-
LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
653-
LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
654-
LLVM_DEBUG(dbgs() << "Max peel count by cost: "
655-
<< (Threshold / LoopSize - 1) << "\n");
643+
if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
644+
unsigned PeelCount = *EstimatedTripCount;
645+
LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
646+
PP.PeelCount = PeelCount;
647+
return;
656648
}
649+
LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
650+
LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
651+
LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
652+
LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
653+
LLVM_DEBUG(dbgs() << "Max peel count by cost: "
654+
<< (Threshold / LoopSize - 1) << "\n");
657655
}
658656
}
659657

llvm/lib/Transforms/Utils/LoopUtils.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,7 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
820820

821821
/// Return the estimated trip count for any exiting branch which dominates
822822
/// the loop latch.
823-
static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
823+
static std::optional<unsigned> getEstimatedTripCount(BranchInst *ExitingBranch,
824824
Loop *L,
825825
uint64_t &OrigExitWeight) {
826826
// To estimate the number of times the loop body was executed, we want to
@@ -842,6 +842,11 @@ static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
842842
// Estimated exit count is a ratio of the loop weight by the weight of the
843843
// edge exiting the loop, rounded to nearest.
844844
uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight);
845+
846+
// When ExitCount + 1 would wrap in unsigned, return std::nullopt instead.
847+
if (ExitCount >= std::numeric_limits<unsigned>::max())
848+
return std::nullopt;
849+
845850
// Estimated trip count is one plus estimated exit count.
846851
return ExitCount + 1;
847852
}

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -422,8 +422,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
422422
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
423423
}
424424

425-
/// Returns "best known" trip count for the specified loop \p L as defined by
426-
/// the following procedure:
425+
/// Returns "best known" trip count, which is either a valid positive trip count
426+
/// or std::nullopt when an estimate cannot be made (including when the trip
427+
/// count would overflow), for the specified loop \p L as defined by the
428+
/// following procedure:
427429
/// 1) Returns exact trip count if it is known.
428430
/// 2) Returns expected trip count according to profile data if any.
429431
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
@@ -2031,7 +2033,6 @@ class GeneratedRTChecks {
20312033
PSE, OuterLoop, /* CanUseConstantMax = */ false))
20322034
BestTripCount = *EstimatedTC;
20332035

2034-
BestTripCount = std::max(BestTripCount, 1U);
20352036
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
20362037

20372038
// Let's ensure the cost is always at least 1.
@@ -5036,7 +5037,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
50365037
if (TailTripCountUB == TailTripCountLB)
50375038
MaxInterleaveCount = InterleaveCountUB;
50385039
}
5039-
} else if (BestKnownTC && *BestKnownTC > 0) {
5040+
} else if (BestKnownTC) {
50405041
// At least one iteration must be scalar when this constraint holds. So the
50415042
// maximum available iterations for interleaving is one less.
50425043
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

+43-18
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ target triple = "aarch64-linux-gnu"
55

66
%pair = type { i8, i8 }
77

8-
; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
8+
; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
99
; it should conservatively choose IC 1 so that the vector loop runs twice at least
1010
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
1111
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -29,7 +29,7 @@ for.end:
2929
ret void
3030
}
3131

32-
; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
32+
; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
3333
; it should conservatively choose IC 1 so that the vector loop runs twice at least
3434
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
3535
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -53,7 +53,7 @@ for.end:
5353
ret void
5454
}
5555

56-
; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
56+
; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
5757
; it should conservatively choose IC 1 so that the vector loop runs twice at least
5858
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
5959
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -77,7 +77,7 @@ for.end:
7777
ret void
7878
}
7979

80-
; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
80+
; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
8181
; it should conservatively choose IC 1 so that the vector loop runs twice at least
8282
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
8383
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -101,7 +101,7 @@ for.end:
101101
ret void
102102
}
103103

104-
; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
104+
; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
105105
; it should choose conservatively IC 2 so that the vector loop runs twice at least
106106
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
107107
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -125,10 +125,10 @@ for.end:
125125
ret void
126126
}
127127

128-
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
129-
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
128+
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
129+
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
130130
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
131-
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
131+
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
132132
; remainder than IC 2
133133
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
134134
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -149,7 +149,7 @@ for.end:
149149
ret void
150150
}
151151

152-
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
152+
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
153153
; it should choose conservatively IC 2 so that the vector loop runs twice at least
154154
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
155155
define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -173,7 +173,7 @@ for.end:
173173
ret void
174174
}
175175

176-
; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
176+
; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
177177
; it should choose conservatively IC 4 so that the vector loop runs twice at least
178178
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
179179
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -206,11 +206,11 @@ for.end:
206206
ret void
207207
}
208208

209-
; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
210-
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
211-
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
209+
; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
210+
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
211+
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
212212
; interleaving.
213-
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
213+
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
214214
; remainder than IC 4
215215
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
216216
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -240,7 +240,7 @@ for.end:
240240
ret void
241241
}
242242

243-
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
243+
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
244244
; it should choose conservatively IC 4 so that the vector loop runs twice at least
245245
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
246246
define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -264,7 +264,7 @@ for.end:
264264
ret void
265265
}
266266

267-
; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
267+
; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
268268
; it should choose conservatively IC 4 so that the vector loop runs twice at least
269269
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
270270
define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -288,7 +288,7 @@ for.end:
288288
ret void
289289
}
290290

291-
; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
291+
; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
292292
; it should choose conservatively IC 4 so that the vector loop runs twice at least
293293
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
294294
define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -312,7 +312,7 @@ for.end:
312312
ret void
313313
}
314314

315-
; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
315+
; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
316316
; the IC will be capped by the target-specific maximum interleave count
317317
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
318318
define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -336,6 +336,30 @@ for.end:
336336
ret void
337337
}
338338

339+
; When the loop weight is UINT_MAX, and the exit count is 1, the trip count
340+
; computation could wrap.
341+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
342+
define void @loop_with_profile_wrap(ptr noalias %p, ptr noalias %q, i64 %n) {
343+
entry:
344+
br label %for.body
345+
346+
for.body:
347+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
348+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
349+
%tmp1 = load i8, ptr %tmp0, align 1
350+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
351+
%tmp3 = load i8, ptr %tmp2, align 1
352+
%add = add i8 %tmp1, %tmp3
353+
%qi = getelementptr i8, ptr %q, i64 %i
354+
store i8 %add, ptr %qi, align 1
355+
%i.next = add nuw nsw i64 %i, 1
356+
%cond = icmp eq i64 %i.next, %n
357+
br i1 %cond, label %for.end, label %for.body, !prof !11
358+
359+
for.end:
360+
ret void
361+
}
362+
339363
!0 = !{!"branch_weights", i32 1, i32 31}
340364
!1 = !{!"branch_weights", i32 1, i32 32}
341365
!2 = !{!"branch_weights", i32 1, i32 47}
@@ -347,3 +371,4 @@ for.end:
347371
!8 = !{!"branch_weights", i32 1, i32 179}
348372
!9 = !{!"branch_weights", i32 1, i32 192}
349373
!10 = !{!"branch_weights", i32 1, i32 999}
374+
!11 = !{!"branch_weights", i32 1, i32 -1}

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,8 @@ outer.exit:
180180
define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
181181
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_minus1'
182182
; CHECK: Calculating cost of runtime checks:
183-
; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced
184-
; CHECK: Total cost of runtime checks: 6
183+
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
184+
; CHECK: Total cost of runtime checks: 3
185185
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
186186
entry:
187187
br label %outer.loop

0 commit comments

Comments
 (0)