Skip to content

Commit 0e16dd2

Browse files
committed
simplify/optimize
1 parent c26a718 commit 0e16dd2

File tree

1 file changed

+34
-43
lines changed

1 file changed

+34
-43
lines changed

portable/src/implementation/fallback.rs

+34-43
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ pub fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error
2626
/// available.
2727
#[cfg(feature = "public_imp")]
2828
pub struct Utf8ValidatorImp {
29-
incomplete_data: [u8; 4],
30-
incomplete_len: u8,
29+
expected_cont_bytes: u8,
3130
err: bool,
3231
}
3332

@@ -39,65 +38,59 @@ pub use Utf8ValidatorImp as ChunkedUtf8ValidatorImp;
3938
#[cfg(feature = "public_imp")]
4039
impl Utf8ValidatorImp {
4140
#[inline]
42-
#[expect(clippy::cast_possible_truncation)]
4341
fn update(&mut self, mut input: &[u8]) {
4442
if self.err {
4543
return;
4644
}
47-
if self.incomplete_len > 0 {
48-
let total_bytes_needed: usize = match self.incomplete_data[0] {
49-
0..0b1000_0000 => {
50-
panic!("ASCII data should never be incomplete");
51-
}
52-
0b1000_0000..0b1100_0000 => {
53-
// first byte cannot be a continuation byte
45+
if self.expected_cont_bytes > 0 {
46+
let to_check = (self.expected_cont_bytes as usize).min(input.len());
47+
for b in &input[..to_check] {
48+
if b & 0b1100_0000 != 0b1000_0000 {
49+
// not a continuation byte
5450
self.err = true;
5551
return;
5652
}
57-
0b1100_0000..0b1110_0000 => 2,
58-
0b1110_0000..0b1111_0000 => 3,
59-
0b1111_0000..0b1111_1000 => 4,
53+
self.expected_cont_bytes -= 1;
54+
}
55+
if self.expected_cont_bytes > 0 {
56+
// not enough continuation bytes
57+
return;
58+
}
59+
input = &input[to_check..];
60+
}
61+
if let Err(e) = core::str::from_utf8(input) {
62+
// cannot wrap, since there is at least one byte left which is not valid UTF-8
63+
// by itself
64+
self.expected_cont_bytes = match input[e.valid_up_to()] {
65+
0b1100_0000..0b1110_0000 => 1,
66+
0b1110_0000..0b1111_0000 => 2,
67+
0b1111_0000..0b1111_1000 => 3,
6068
_ => {
6169
// invalid byte for starting sequence
6270
self.err = true;
6371
return;
6472
}
6573
};
66-
if self.incomplete_len as usize >= total_bytes_needed {
67-
// actually errored on previous update
68-
self.err = true;
69-
return;
70-
}
71-
let bytes_needed = total_bytes_needed - self.incomplete_len as usize;
72-
let to_copy = core::cmp::min(bytes_needed, input.len());
73-
self.incomplete_data
74-
[self.incomplete_len as usize..self.incomplete_len as usize + to_copy]
75-
.copy_from_slice(&input[..to_copy]);
76-
if to_copy < bytes_needed {
77-
self.incomplete_len += to_copy as u8;
78-
return;
79-
}
80-
if core::str::from_utf8(&self.incomplete_data[..total_bytes_needed]).is_err() {
74+
let rem_input = input.len() - e.valid_up_to() - 1;
75+
if rem_input >= self.expected_cont_bytes as usize {
76+
// too many continuation bytes so they are not valid
8177
self.err = true;
8278
return;
8379
}
84-
self.incomplete_len = 0;
85-
input = &input[to_copy..];
86-
}
87-
if let Err(e) = core::str::from_utf8(input) {
88-
if input.len() - e.valid_up_to() > 3 {
89-
self.err = true;
90-
return;
80+
for i in 0..rem_input {
81+
if input[e.valid_up_to() + i + 1] & 0b1100_0000 != 0b1000_0000 {
82+
// not a continuation byte
83+
self.err = true;
84+
return;
85+
}
86+
self.expected_cont_bytes -= 1;
9187
}
92-
self.incomplete_len = (input.len() - e.valid_up_to()) as u8;
93-
self.incomplete_data[..self.incomplete_len as usize]
94-
.copy_from_slice(&input[e.valid_up_to()..]);
9588
}
9689
}
9790

9891
#[inline]
9992
const fn finalize(self) -> core::result::Result<(), crate::basic::Utf8Error> {
100-
if self.err || self.incomplete_len > 0 {
93+
if self.err || self.expected_cont_bytes > 0 {
10194
Err(crate::basic::Utf8Error {})
10295
} else {
10396
Ok(())
@@ -111,8 +104,7 @@ impl crate::basic::imp::Utf8Validator for Utf8ValidatorImp {
111104
#[must_use]
112105
fn new() -> Self {
113106
Self {
114-
incomplete_data: [0; 4],
115-
incomplete_len: 0,
107+
expected_cont_bytes: 0,
116108
err: false,
117109
}
118110
}
@@ -137,8 +129,7 @@ impl crate::basic::imp::ChunkedUtf8Validator for Utf8ValidatorImp {
137129
#[must_use]
138130
fn new() -> Self {
139131
Self {
140-
incomplete_data: [0; 4],
141-
incomplete_len: 0,
132+
expected_cont_bytes: 0,
142133
err: false,
143134
}
144135
}

0 commit comments

Comments
 (0)