Skip to content

Commit d68f39b

Browse files
authored
std.unicode.utf8ValidateSlice: optimize implementation (#17329)
Originally inspired by Go's `utf8.Valid` function. Includes some test cases from Go's test suite. Further optimized to be faster in all tested cases (short/long ascii/UTF8), in all release modes. Takes advantage of SIMD for the ASCII fast path.
1 parent 5a4a587 commit d68f39b

File tree

1 file changed

+138
-16
lines changed

1 file changed

+138
-16
lines changed

lib/std/unicode.zig

Lines changed: 138 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -196,22 +196,115 @@ pub fn utf8CountCodepoints(s: []const u8) !usize {
196196
return len;
197197
}
198198

199-
pub fn utf8ValidateSlice(s: []const u8) bool {
199+
/// Returns true if the input consists entirely of valid UTF-8 encoded codepoints.
200+
pub fn utf8ValidateSlice(input: []const u8) bool {
201+
var remaining = input;
202+
203+
const V_len = comptime std.simd.suggestVectorSize(usize) orelse 1;
204+
const V = @Vector(V_len, usize);
205+
const u8s_in_vector = @sizeOf(usize) * V_len;
206+
207+
// Fast path. Check for and skip ASCII characters at the start of the input.
208+
while (remaining.len >= u8s_in_vector) {
209+
const chunk: V = @bitCast(remaining[0..u8s_in_vector].*);
210+
const swapped = mem.littleToNative(V, chunk);
211+
const reduced = @reduce(.Or, swapped);
212+
const mask: usize = @bitCast([1]u8{0x80} ** @sizeOf(usize));
213+
if (reduced & mask != 0) {
214+
// Found a non-ASCII byte in this chunk; leave the rest to the scalar loop below.
215+
break;
216+
}
217+
remaining = remaining[u8s_in_vector..];
218+
}
219+
220+
// default lowest and highest continuation byte
221+
const lo_cb = 0b10000000;
222+
const hi_cb = 0b10111111;
223+
224+
const min_non_ascii_codepoint = 0x80;
225+
226+
// The high nibble selects the range accepted for the first continuation
227+
// byte. The low nibble is the size of the sequence in bytes.
228+
const xx = 0xF1; // invalid: size 1
229+
const as = 0xF0; // ASCII: size 1
230+
const s1 = 0x02; // accept 0, size 2
231+
const s2 = 0x13; // accept 1, size 3
232+
const s3 = 0x03; // accept 0, size 3
233+
const s4 = 0x23; // accept 2, size 3
234+
const s5 = 0x34; // accept 3, size 4
235+
const s6 = 0x04; // accept 0, size 4
236+
const s7 = 0x44; // accept 4, size 4
237+
238+
// Information about the first byte in a UTF-8 sequence.
239+
const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
240+
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
241+
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
242+
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
243+
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
244+
};
245+
246+
var n = remaining.len;
200247
var i: usize = 0;
201-
while (i < s.len) {
202-
if (utf8ByteSequenceLength(s[i])) |cp_len| {
203-
if (i + cp_len > s.len) {
204-
return false;
205-
}
248+
while (i < n) {
249+
const first_byte = remaining[i];
250+
if (first_byte < min_non_ascii_codepoint) {
251+
i += 1;
252+
continue;
253+
}
206254

207-
if (std.meta.isError(utf8Decode(s[i .. i + cp_len]))) {
208-
return false;
209-
}
210-
i += cp_len;
211-
} else |_| {
255+
const info = first[first_byte];
256+
if (info == xx) {
257+
return false; // Illegal starter byte.
258+
}
259+
260+
const size = info & 7;
261+
if (i + size > n) {
262+
return false; // Short or invalid.
263+
}
264+
265+
// Figure out the acceptable low and high bounds for the first continuation
266+
// byte, starting with our defaults.
267+
var accept_lo: u8 = lo_cb;
268+
var accept_hi: u8 = hi_cb;
269+
270+
switch (info >> 4) {
271+
0 => {},
272+
1 => accept_lo = 0xA0,
273+
2 => accept_hi = 0x9F,
274+
3 => accept_lo = 0x90,
275+
4 => accept_hi = 0x8F,
276+
else => unreachable,
277+
}
278+
279+
const c1 = remaining[i + 1];
280+
if (c1 < accept_lo or accept_hi < c1) {
212281
return false;
213282
}
283+
284+
switch (size) {
285+
2 => i += 2,
286+
3 => {
287+
const c2 = remaining[i + 2];
288+
if (c2 < lo_cb or hi_cb < c2) {
289+
return false;
290+
}
291+
i += 3;
292+
},
293+
4 => {
294+
const c2 = remaining[i + 2];
295+
if (c2 < lo_cb or hi_cb < c2) {
296+
return false;
297+
}
298+
const c3 = remaining[i + 3];
299+
if (c3 < lo_cb or hi_cb < c3) {
300+
return false;
301+
}
302+
i += 4;
303+
},
304+
else => unreachable,
305+
}
214306
}
307+
215308
return true;
216309
}
217310

@@ -502,15 +595,44 @@ fn testUtf8ViewOk() !void {
502595
try testing.expect(it2.nextCodepoint() == null);
503596
}
504597

505-
test "bad utf8 slice" {
506-
try comptime testBadUtf8Slice();
507-
try testBadUtf8Slice();
598+
test "validate slice" {
599+
try comptime testValidateSlice();
600+
try testValidateSlice();
601+
602+
// The ASCII fast path skips chunks whose size depends on the recommended
603+
// vector size. Let's make sure we're chunking correctly at every offset.
604+
const str = [_]u8{'a'} ** 550 ++ "\xc0";
605+
for (0..str.len - 3) |i| {
606+
try testing.expect(!utf8ValidateSlice(str[i..]));
607+
}
508608
}
509-
fn testBadUtf8Slice() !void {
609+
fn testValidateSlice() !void {
510610
try testing.expect(utf8ValidateSlice("abc"));
611+
try testing.expect(utf8ValidateSlice("abc\xdf\xbf"));
612+
try testing.expect(utf8ValidateSlice(""));
613+
try testing.expect(utf8ValidateSlice("a"));
614+
try testing.expect(utf8ValidateSlice("abc"));
615+
try testing.expect(utf8ValidateSlice("Ж"));
616+
try testing.expect(utf8ValidateSlice("ЖЖ"));
617+
try testing.expect(utf8ValidateSlice("брэд-ЛГТМ"));
618+
try testing.expect(utf8ValidateSlice("☺☻☹"));
619+
try testing.expect(utf8ValidateSlice("a\u{fffdb}"));
620+
try testing.expect(utf8ValidateSlice("\xf4\x8f\xbf\xbf"));
621+
try testing.expect(utf8ValidateSlice("abc\xdf\xbf"));
622+
511623
try testing.expect(!utf8ValidateSlice("abc\xc0"));
512624
try testing.expect(!utf8ValidateSlice("abc\xc0abc"));
513-
try testing.expect(utf8ValidateSlice("abc\xdf\xbf"));
625+
try testing.expect(!utf8ValidateSlice("aa\xe2"));
626+
try testing.expect(!utf8ValidateSlice("\x42\xfa"));
627+
try testing.expect(!utf8ValidateSlice("\x42\xfa\x43"));
628+
try testing.expect(!utf8ValidateSlice("abc\xc0"));
629+
try testing.expect(!utf8ValidateSlice("abc\xc0abc"));
630+
try testing.expect(!utf8ValidateSlice("\xf4\x90\x80\x80"));
631+
try testing.expect(!utf8ValidateSlice("\xf7\xbf\xbf\xbf"));
632+
try testing.expect(!utf8ValidateSlice("\xfb\xbf\xbf\xbf\xbf"));
633+
try testing.expect(!utf8ValidateSlice("\xc0\x80"));
634+
try testing.expect(!utf8ValidateSlice("\xed\xa0\x80"));
635+
try testing.expect(!utf8ValidateSlice("\xed\xbf\xbf"));
514636
}
515637

516638
test "valid utf8" {

0 commit comments

Comments
 (0)