@@ -5,7 +5,7 @@ target triple = "aarch64-linux-gnu"
5
5
6
6
%pair = type { i8 , i8 }
7
7
8
- ; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
8
+ ; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
9
9
; it should conservatively choose IC 1 so that the vector loop runs at least twice
10
10
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
11
11
define void @loop_with_profile_tc_32 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -29,7 +29,7 @@ for.end:
29
29
ret void
30
30
}
31
31
32
- ; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
32
+ ; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
33
33
; it should conservatively choose IC 1 so that the vector loop runs at least twice
34
34
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
35
35
define void @loop_with_profile_tc_33 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -53,7 +53,7 @@ for.end:
53
53
ret void
54
54
}
55
55
56
- ; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
56
+ ; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
57
57
; it should conservatively choose IC 1 so that the vector loop runs at least twice
58
58
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
59
59
define void @loop_with_profile_tc_48 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -77,7 +77,7 @@ for.end:
77
77
ret void
78
78
}
79
79
80
- ; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
80
+ ; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
81
81
; it should conservatively choose IC 1 so that the vector loop runs at least twice
82
82
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
83
83
define void @loop_with_profile_tc_63 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -101,7 +101,7 @@ for.end:
101
101
ret void
102
102
}
103
103
104
- ; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
104
+ ; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
105
105
; it should conservatively choose IC 2 so that the vector loop runs at least twice
106
106
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
107
107
define void @loop_with_profile_tc_64 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -125,10 +125,10 @@ for.end:
125
125
ret void
126
126
}
127
127
128
- ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
129
- ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
128
+ ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
129
+ ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
130
130
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
131
- ; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
131
+ ; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
132
132
; remainder than IC 2
133
133
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
134
134
define void @loop_with_profile_tc_64_scalar_epilogue_reqd (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -149,7 +149,7 @@ for.end:
149
149
ret void
150
150
}
151
151
152
- ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
152
+ ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
153
153
; it should conservatively choose IC 2 so that the vector loop runs at least twice
154
154
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
155
155
define void @loop_with_profile_tc_100 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -173,7 +173,7 @@ for.end:
173
173
ret void
174
174
}
175
175
176
- ; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
176
+ ; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
177
177
; it should conservatively choose IC 4 so that the vector loop runs at least twice
178
178
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
179
179
define void @loop_with_profile_tc_128 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -206,11 +206,11 @@ for.end:
206
206
ret void
207
207
}
208
208
209
- ; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
210
- ; the resulting interleaved group in this case may access memory out-of-bounds, it requires
211
- ; a scalar epilogue iteration for correctness, making at most 127 iterations available for
209
+ ; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
210
+ ; the resulting interleaved group in this case may access memory out-of-bounds, it requires
211
+ ; a scalar epilogue iteration for correctness, making at most 127 iterations available for
212
212
; interleaving.
213
- ; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
213
+ ; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
214
214
; remainder than IC 4
215
215
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
216
216
define void @loop_with_profile_tc_128_scalar_epilogue_reqd (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -240,7 +240,7 @@ for.end:
240
240
ret void
241
241
}
242
242
243
- ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
243
+ ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
244
244
; it should conservatively choose IC 4 so that the vector loop runs at least twice
245
245
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
246
246
define void @loop_with_profile_tc_129 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -264,7 +264,7 @@ for.end:
264
264
ret void
265
265
}
266
266
267
- ; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
267
+ ; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
268
268
; it should conservatively choose IC 4 so that the vector loop runs at least twice
269
269
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
270
270
define void @loop_with_profile_tc_180 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -288,7 +288,7 @@ for.end:
288
288
ret void
289
289
}
290
290
291
- ; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
291
+ ; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
292
292
; it should conservatively choose IC 4 so that the vector loop runs at least twice
293
293
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
294
294
define void @loop_with_profile_tc_193 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -312,7 +312,7 @@ for.end:
312
312
ret void
313
313
}
314
314
315
- ; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
315
+ ; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
316
316
; the IC will be capped by the target-specific maximum interleave count
317
317
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
318
318
define void @loop_with_profile_tc_1000 (ptr noalias %p , ptr noalias %q , i64 %n ) {
@@ -336,6 +336,30 @@ for.end:
336
336
ret void
337
337
}
338
338
339
+ ; When the loop (backedge) weight is UINT_MAX, written as i32 -1 in !11, and
340
+ ; the exit weight is 1, the trip count computation could wrap.
341
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
342
+ define void @loop_with_profile_wrap (ptr noalias %p , ptr noalias %q , i64 %n ) {
343
+ entry:
344
+ br label %for.body
345
+
346
+ for.body:
347
+ ; %i starts at 0 on entry and takes %i.next on every backedge.
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
348
+ ; Load both i8 fields (index 0 and index 1) of the %pair element %i of %p.
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
349
+ %tmp1 = load i8 , ptr %tmp0 , align 1
350
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
351
+ %tmp3 = load i8 , ptr %tmp2 , align 1
352
+ ; Sum the two fields and store the result to byte %i of %q.
+ %add = add i8 %tmp1 , %tmp3
353
+ %qi = getelementptr i8 , ptr %q , i64 %i
354
+ store i8 %add , ptr %qi , align 1
355
+ ; Advance the induction variable; the loop exits once %i.next equals %n.
+ %i.next = add nuw nsw i64 %i , 1
356
+ %cond = icmp eq i64 %i.next , %n
357
+ ; !prof !11 supplies branch weights 1 (taken to %for.end) and -1 (taken to %for.body).
+ br i1 %cond , label %for.end , label %for.body , !prof !11
358
+
359
+ for.end:
360
+ ret void
361
+ }
362
+
339
363
!0 = !{!"branch_weights" , i32 1 , i32 31 }
340
364
!1 = !{!"branch_weights" , i32 1 , i32 32 }
341
365
!2 = !{!"branch_weights" , i32 1 , i32 47 }
@@ -347,3 +371,4 @@ for.end:
347
371
!8 = !{!"branch_weights" , i32 1 , i32 179 }
348
372
!9 = !{!"branch_weights" , i32 1 , i32 192 }
349
373
!10 = !{!"branch_weights" , i32 1 , i32 999 }
374
+ !11 = !{!"branch_weights" , i32 1 , i32 -1 }
0 commit comments