@@ -196,22 +196,115 @@ pub fn utf8CountCodepoints(s: []const u8) !usize {
196
196
return len ;
197
197
}
198
198
199
/// Returns true if `input` is entirely well-formed UTF-8.
///
/// The empty slice is valid. The input is rejected when it contains:
///  * a byte that cannot start a sequence (0x80...0xBF, 0xC0, 0xC1 or
///    0xF5...0xFF),
///  * a multi-byte sequence cut short by the end of the slice, or
///  * a continuation byte outside the range allowed for its lead byte.
/// The per-lead-byte ranges exclude overlong encodings, the surrogate range
/// U+D800...U+DFFF and anything above U+10FFFF.
pub fn utf8ValidateSlice(input: []const u8) bool {
    var remaining = input;

    const V_len = comptime std.simd.suggestVectorSize(usize) orelse 1;
    const V = @Vector(V_len, usize);
    const u8s_in_vector = @sizeOf(usize) * V_len;

    // 0x80 in every byte of a usize: the bit that is set in any non-ASCII byte.
    const high_bits: usize = @bitCast([1]u8{0x80} ** @sizeOf(usize));

    // Fast path. Check for and skip ASCII characters at the start of the
    // input, one vector-sized chunk at a time.
    while (remaining.len >= u8s_in_vector) {
        const chunk: V = @bitCast(remaining[0..u8s_in_vector].*);
        // OR-ing the lanes together keeps the high bit of every byte that had
        // it set. The mask is identical in every byte position, so the result
        // does not depend on byte order and no endian conversion is needed.
        const reduced = @reduce(.Or, chunk);
        if ((reduced & high_bits) != 0) {
            // Found a non-ASCII byte; the scalar loop below takes over from
            // the start of this chunk.
            break;
        }
        remaining = remaining[u8s_in_vector..];
    }

    // Default lowest and highest continuation byte.
    const lo_cb = 0b10000000;
    const hi_cb = 0b10111111;

    const min_non_ascii_codepoint = 0x80;

    // Each table entry packs two fields: the high nibble selects which
    // continuation-byte range to accept for the second byte of the sequence,
    // and the low nibble is the sequence size in bytes.
    const xx = 0xF1; // invalid lead byte
    const as = 0xF0; // ASCII; never looked up, ASCII is handled before the table
    const s1 = 0x02; // accept 0, size 2
    const s2 = 0x13; // accept 1, size 3
    const s3 = 0x03; // accept 0, size 3
    const s4 = 0x23; // accept 2, size 3
    const s5 = 0x34; // accept 3, size 4
    const s6 = 0x04; // accept 0, size 4
    const s7 = 0x44; // accept 4, size 4

    // Information about the first byte in a UTF-8 sequence, indexed by that
    // byte: 0x00...0x7F ASCII, 0x80...0xBF invalid, then one row per 16 values
    // starting at 0xC0.
    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
    };

    const n = remaining.len;
    var i: usize = 0;
    while (i < n) {
        const first_byte = remaining[i];
        if (first_byte < min_non_ascii_codepoint) {
            i += 1;
            continue;
        }

        const info = first[first_byte];
        if (info == xx) {
            return false; // Illegal starter byte.
        }

        const size = info & 7;
        if (i + size > n) {
            return false; // Short or invalid.
        }

        // Figure out the acceptable low and high values for the second byte,
        // starting with the defaults and narrowing per the accept class.
        var accept_lo: u8 = lo_cb;
        var accept_hi: u8 = hi_cb;

        switch (info >> 4) {
            0 => {},
            1 => accept_lo = 0xA0, // lead 0xE0: excludes overlong 3-byte forms
            2 => accept_hi = 0x9F, // lead 0xED: excludes U+D800...U+DFFF
            3 => accept_lo = 0x90, // lead 0xF0: excludes overlong 4-byte forms
            4 => accept_hi = 0x8F, // lead 0xF4: excludes values above U+10FFFF
            else => unreachable, // `xx` was rejected above; no other class exists
        }

        const c1 = remaining[i + 1];
        if (c1 < accept_lo or accept_hi < c1) {
            return false;
        }

        // Any remaining bytes of the sequence only need to be ordinary
        // continuation bytes.
        switch (size) {
            2 => i += 2,
            3 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                i += 3;
            },
            4 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                const c3 = remaining[i + 3];
                if (c3 < lo_cb or hi_cb < c3) {
                    return false;
                }
                i += 4;
            },
            else => unreachable, // every non-`xx` multi-byte entry has size 2, 3 or 4
        }
    }

    return true;
}
217
310
@@ -502,15 +595,44 @@ fn testUtf8ViewOk() !void {
502
595
try testing .expect (it2 .nextCodepoint () == null );
503
596
}
504
597
505
test "validate slice" {
    try comptime testValidateSlice();
    try testValidateSlice();

    // The validator skips leading ASCII in chunks whose size depends on the
    // recommended vector size. Start the same invalid input at every offset
    // so the bad byte is met at each position relative to a chunk boundary.
    const padded = [_]u8{'a'} ** 550 ++ "\xc0 ";
    var start: usize = 0;
    while (start < padded.len - 3) : (start += 1) {
        const is_valid = utf8ValidateSlice(padded[start..]);
        try testing.expect(!is_valid);
    }
}
509
/// Checks `utf8ValidateSlice` against fixed inputs that must be accepted and
/// fixed inputs that must be rejected. Contains nothing runtime-only, so it
/// can be called both at comptime and at runtime.
fn testValidateSlice() !void {
    // Accepted: empty and ASCII-only input.
    try testing.expect(utf8ValidateSlice(""));
    try testing.expect(utf8ValidateSlice("a"));
    try testing.expect(utf8ValidateSlice("abc"));

    // Accepted: well-formed multi-byte sequences.
    try testing.expect(utf8ValidateSlice("abc\xdf\xbf "));
    try testing.expect(utf8ValidateSlice("Ж"));
    try testing.expect(utf8ValidateSlice("ЖЖ"));
    try testing.expect(utf8ValidateSlice("брэд-ЛГТМ"));
    try testing.expect(utf8ValidateSlice("☺☻☹"));
    try testing.expect(utf8ValidateSlice("a\u{fffdb} "));
    try testing.expect(utf8ValidateSlice("\xf4\x8f\xbf\xbf ")); // U+10FFFF, the highest code point

    // Rejected: 0xC0 can never start a sequence.
    try testing.expect(!utf8ValidateSlice("abc\xc0 "));
    try testing.expect(!utf8ValidateSlice("abc\xc0 abc"));
    try testing.expect(!utf8ValidateSlice("\xc0\x80 "));

    // Rejected: 3-byte lead without enough bytes after it.
    try testing.expect(!utf8ValidateSlice("aa\xe2 "));

    // Rejected: lead bytes above 0xF4.
    try testing.expect(!utf8ValidateSlice("\x42\xfa "));
    try testing.expect(!utf8ValidateSlice("\x42\xfa\x43 "));
    try testing.expect(!utf8ValidateSlice("\xf7\xbf\xbf\xbf "));
    try testing.expect(!utf8ValidateSlice("\xfb\xbf\xbf\xbf\xbf "));

    // Rejected: second byte outside the range allowed after 0xF4
    // (would encode a value above U+10FFFF).
    try testing.expect(!utf8ValidateSlice("\xf4\x90\x80\x80 "));

    // Rejected: second byte outside the range allowed after 0xED
    // (would encode a surrogate, U+D800...U+DFFF).
    try testing.expect(!utf8ValidateSlice("\xed\xa0\x80 "));
    try testing.expect(!utf8ValidateSlice("\xed\xbf\xbf "));
}
515
637
516
638
test "valid utf8" {
0 commit comments