[LoopUtils] Don't wrap in getLoopEstimatedTripCount (#129080)

artagnon · web-flow · commit 80bdfcd411cd · 2025-03-04T08:43:08.000Z
getLoopEstimatedTripCount returns the trip count based on profiling
data, and its documentation says that it could return 0 when the trip
count is zero, but this is not the case: a valid trip count can never be
zero, and it returns 0 when the unsigned ExitCount is incremented by 1
and wraps. Some callers are careful about checking for this zero value
in an std::optional, but it makes for an API with footguns, as a
std::optional return value indicates that a non-nullopt value would be a
valid trip count. Fix this by explicitly returning std::nullopt when the
return value would wrap, and strip additional checks in callers. This
also fixes a minor bug in LoopVectorize.
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -318,8 +318,9 @@ void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
 /// Returns a loop's estimated trip count based on branch weight metadata.
 /// In addition if \p EstimatedLoopInvocationWeight is not null it is
 /// initialized with weight of loop's latch leading to the exit.
-/// Returns 0 when the count is estimated to be 0, or std::nullopt when a
-/// meaningful estimate can not be made.
+/// Returns a valid positive trip count, or std::nullopt when a meaningful
+/// estimate cannot be made (including when the trip count wouldn't fit in the
+/// result type).
 std::optional<unsigned>
 getLoopEstimatedTripCount(Loop *L,
                           unsigned *EstimatedLoopInvocationWeight = nullptr);
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -640,20 +640,18 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
     LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
                       << *EstimatedTripCount << "\n");
 
-    if (*EstimatedTripCount) {
-      if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
-        unsigned PeelCount = *EstimatedTripCount;
-        LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
-        PP.PeelCount = PeelCount;
-        return;
-      }
-      LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
-      LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
-      LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
-      LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
-      LLVM_DEBUG(dbgs() << "Max peel count by cost: "
-                        << (Threshold / LoopSize - 1) << "\n");
+    if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
+      unsigned PeelCount = *EstimatedTripCount;
+      LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
+      PP.PeelCount = PeelCount;
+      return;
     }
+    LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
+    LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
+    LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
+    LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
+    LLVM_DEBUG(dbgs() << "Max peel count by cost: "
+                      << (Threshold / LoopSize - 1) << "\n");
   }
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -820,7 +820,7 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
 
 /// Return the estimated trip count for any exiting branch which dominates
 /// the loop latch.
-static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
+static std::optional<unsigned> getEstimatedTripCount(BranchInst *ExitingBranch,
                                                      Loop *L,
                                                      uint64_t &OrigExitWeight) {
   // To estimate the number of times the loop body was executed, we want to
@@ -842,6 +842,11 @@ static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
   // Estimated exit count is a ratio of the loop weight by the weight of the
   // edge exiting the loop, rounded to nearest.
   uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight);
+
+  // When ExitCount + 1 would wrap in unsigned, return std::nullopt instead.
+  if (ExitCount >= std::numeric_limits<unsigned>::max())
+    return std::nullopt;
+
   // Estimated trip count is one plus estimated exit count.
   return ExitCount + 1;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -422,8 +422,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
 }
 
-/// Returns "best known" trip count for the specified loop \p L as defined by
-/// the following procedure:
+/// Returns "best known" trip count, which is either a valid positive trip count
+/// or std::nullopt when an estimate cannot be made (including when the trip
+/// count would overflow), for the specified loop \p L as defined by the
+/// following procedure:
 ///   1) Returns exact trip count if it is known.
 ///   2) Returns expected trip count according to profile data if any.
 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
@@ -2031,7 +2033,6 @@ class GeneratedRTChecks {
                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
             BestTripCount = *EstimatedTC;
 
-          BestTripCount = std::max(BestTripCount, 1U);
           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
 
           // Let's ensure the cost is always at least 1.
@@ -5036,7 +5037,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
       if (TailTripCountUB == TailTripCountLB)
         MaxInterleaveCount = InterleaveCountUB;
     }
-  } else if (BestKnownTC && *BestKnownTC > 0) {
+  } else if (BestKnownTC) {
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -5,7 +5,7 @@ target triple = "aarch64-linux-gnu"
 
 %pair = type { i8, i8 }
 
-; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
 ; it should conservatively choose IC 1 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -29,7 +29,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
 ; it should conservatively choose IC 1 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -53,7 +53,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
 ; it should conservatively choose IC 1 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -77,7 +77,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
 ; it should conservatively choose IC 1 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -101,7 +101,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
 ; it should choose conservatively IC 2 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -125,10 +125,10 @@ for.end:
   ret void
 }
 
-; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the 
-; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar 
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
+; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
 ; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
-; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar 
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
 ; remainder than IC 2
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -149,7 +149,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
 ; it should choose conservatively IC 2 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -173,7 +173,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
 define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -206,11 +206,11 @@ for.end:
   ret void
 }
 
-; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since 
-; the resulting interleaved group in this case may access memory out-of-bounds, it requires 
-; a scalar epilogue iteration for correctness, making at most 127 iterations available for 
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
+; the resulting interleaved group in this case may access memory out-of-bounds, it requires
+; a scalar epilogue iteration for correctness, making at most 127 iterations available for
 ; interleaving.
-; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
 ; remainder than IC 4
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -240,7 +240,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
 define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -264,7 +264,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
 define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -288,7 +288,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
 define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -312,7 +312,7 @@ for.end:
   ret void
 }
 
-; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16, 
+; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
 ; the IC will be capped by the target-specific maximum interleave count
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
 define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -336,6 +336,30 @@ for.end:
   ret void
 }
 
+; When the loop weight is UINT_MAX, and the exit count is 1, the trip count
+; computation could wrap.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_wrap(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !11
+
+for.end:
+  ret void
+}
+
 !0 = !{!"branch_weights", i32 1, i32 31}
 !1 = !{!"branch_weights", i32 1, i32 32}
 !2 = !{!"branch_weights", i32 1, i32 47}
@@ -347,3 +371,4 @@ for.end:
 !8 = !{!"branch_weights", i32 1, i32 179}
 !9 = !{!"branch_weights", i32 1, i32 192}
 !10 = !{!"branch_weights", i32 1, i32 999}
+!11 = !{!"branch_weights", i32 1, i32 -1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -180,8 +180,8 @@ outer.exit:
 define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_minus1'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK-NOT:  We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
+; CHECK:      Total cost of runtime checks: 3
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop