Skip to content

Commit c26a718

Browse files
committed
fallback, etc.
1 parent 5a42598 commit c26a718

File tree

7 files changed

+279
-67
lines changed

7 files changed

+279
-67
lines changed

portable/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,9 @@ std = []
1010

1111
# expose SIMD implementations in basic::imp::* and compat::imp::*
1212
public_imp = []
13+
force_fallback = []
14+
force_simd128 = []
15+
force_simd256 = []
16+
17+
[dependencies]
18+
cfg-if = "1.0.0"

portable/src/basic.rs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,25 @@ pub mod imp {
193193
) -> core::result::Result<(), basic::Utf8Error>;
194194
}
195195

196-
/// Best for current target
196+
/// Best for current target as defined by compile-time arch and target features. If no fast
197+
/// SIMD implementation is available, the scalar implementation from the standard library is
198+
/// used as a fallback.
199+
///
200+
/// However, the crate feature `force_fallback` forces the fallback implementation, `force_simd128`
201+
/// forces the 128-bit SIMD implementation and `force_simd256` forces the 256-bit SIMD implementation,
202+
/// in order of precedence.
203+
///
197204
pub mod auto {
198-
pub use crate::implementation::simd::auto::validate_utf8_basic as validate_utf8;
199-
pub use crate::implementation::simd::auto::ChunkedUtf8ValidatorImp;
200-
pub use crate::implementation::simd::auto::Utf8ValidatorImp;
205+
pub use crate::implementation::auto::validate_utf8_basic as validate_utf8;
206+
pub use crate::implementation::auto::ChunkedUtf8ValidatorImp;
207+
pub use crate::implementation::auto::Utf8ValidatorImp;
208+
}
209+
210+
/// Includes the scalar fallback implementation using the standard library.
211+
pub mod fallback {
212+
pub use crate::implementation::fallback::validate_utf8_basic as validate_utf8;
213+
pub use crate::implementation::fallback::ChunkedUtf8ValidatorImp;
214+
pub use crate::implementation::fallback::Utf8ValidatorImp;
201215
}
202216

203217
/// Includes the validation implementation using 128-bit portable SIMD.

portable/src/compat.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,22 @@ pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {
101101
/// Allows direct access to the platform-specific unsafe validation implementations.
102102
#[cfg(feature = "public_imp")]
103103
pub mod imp {
104-
/// Best for current target FIXME: 256-bit support
105-
pub use v128 as auto;
104+
/// Best for current target as defined by compile-time arch and target features. If no fast
105+
/// SIMD implementation is available, the scalar implementation from the standard library is
106+
/// used as a fallback.
107+
///
108+
/// However, the crate feature `force_fallback` forces the fallback implementation, `force_simd128`
109+
/// forces the 128-bit SIMD implementation and `force_simd256` forces the 256-bit SIMD implementation,
110+
/// in order of precedence.
111+
///
112+
pub mod auto {
113+
pub use crate::implementation::auto::validate_utf8_compat as validate_utf8;
114+
}
115+
116+
/// Includes the scalar fallback implementation using the standard library.
117+
pub mod fallback {
118+
pub use crate::implementation::fallback::validate_utf8_compat as validate_utf8;
119+
}
106120

107121
/// Includes the validation implementation for 128-bit portable SIMD.
108122
pub mod v128 {
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/// Fallback implementation using the standard library.
2+
///
3+
/// # Errors
4+
/// Returns the zero-sized [`basic::Utf8Error`] on failure.
5+
#[inline]
6+
pub const fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> {
7+
match core::str::from_utf8(input) {
8+
Ok(_) => Ok(()),
9+
Err(_) => Err(crate::basic::Utf8Error {}),
10+
}
11+
}
12+
13+
/// Fallback implementation using the standard library.
14+
///
15+
/// # Errors
16+
/// Returns [`compat::Utf8Error`] with detailed error information on failure.
17+
#[inline]
18+
pub fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error> {
19+
super::validate_utf8_at_offset(input, 0)
20+
}
21+
22+
/// Low-level implementation of the [`basic::imp::Utf8Validator`] trait.
23+
///
24+
/// This implementation relies on `core::str::from_utf8` instead of SIMD instructions.
25+
/// It does not require any particular CPU features to be
26+
/// available.
27+
#[cfg(feature = "public_imp")]
28+
pub struct Utf8ValidatorImp {
29+
incomplete_data: [u8; 4],
30+
incomplete_len: u8,
31+
err: bool,
32+
}
33+
34+
use core::panic;
35+
36+
#[cfg(feature = "public_imp")]
37+
pub use Utf8ValidatorImp as ChunkedUtf8ValidatorImp;
38+
39+
#[cfg(feature = "public_imp")]
40+
impl Utf8ValidatorImp {
41+
#[inline]
42+
#[expect(clippy::cast_possible_truncation)]
43+
fn update(&mut self, mut input: &[u8]) {
44+
if self.err {
45+
return;
46+
}
47+
if self.incomplete_len > 0 {
48+
let total_bytes_needed: usize = match self.incomplete_data[0] {
49+
0..0b1000_0000 => {
50+
panic!("ASCII data should never be incomplete");
51+
}
52+
0b1000_0000..0b1100_0000 => {
53+
// first byte cannot be a continuation byte
54+
self.err = true;
55+
return;
56+
}
57+
0b1100_0000..0b1110_0000 => 2,
58+
0b1110_0000..0b1111_0000 => 3,
59+
0b1111_0000..0b1111_1000 => 4,
60+
_ => {
61+
// invalid byte for starting sequence
62+
self.err = true;
63+
return;
64+
}
65+
};
66+
if self.incomplete_len as usize >= total_bytes_needed {
67+
// actually errored on previous update
68+
self.err = true;
69+
return;
70+
}
71+
let bytes_needed = total_bytes_needed - self.incomplete_len as usize;
72+
let to_copy = core::cmp::min(bytes_needed, input.len());
73+
self.incomplete_data
74+
[self.incomplete_len as usize..self.incomplete_len as usize + to_copy]
75+
.copy_from_slice(&input[..to_copy]);
76+
if to_copy < bytes_needed {
77+
self.incomplete_len += to_copy as u8;
78+
return;
79+
}
80+
if core::str::from_utf8(&self.incomplete_data[..total_bytes_needed]).is_err() {
81+
self.err = true;
82+
return;
83+
}
84+
self.incomplete_len = 0;
85+
input = &input[to_copy..];
86+
}
87+
if let Err(e) = core::str::from_utf8(input) {
88+
if input.len() - e.valid_up_to() > 3 {
89+
self.err = true;
90+
return;
91+
}
92+
self.incomplete_len = (input.len() - e.valid_up_to()) as u8;
93+
self.incomplete_data[..self.incomplete_len as usize]
94+
.copy_from_slice(&input[e.valid_up_to()..]);
95+
}
96+
}
97+
98+
#[inline]
99+
const fn finalize(self) -> core::result::Result<(), crate::basic::Utf8Error> {
100+
if self.err || self.incomplete_len > 0 {
101+
Err(crate::basic::Utf8Error {})
102+
} else {
103+
Ok(())
104+
}
105+
}
106+
}
107+
108+
#[cfg(feature = "public_imp")]
109+
impl crate::basic::imp::Utf8Validator for Utf8ValidatorImp {
110+
#[inline]
111+
#[must_use]
112+
fn new() -> Self {
113+
Self {
114+
incomplete_data: [0; 4],
115+
incomplete_len: 0,
116+
err: false,
117+
}
118+
}
119+
120+
#[inline]
121+
fn update(&mut self, input: &[u8]) {
122+
if input.is_empty() {
123+
return;
124+
}
125+
self.update(input);
126+
}
127+
128+
#[inline]
129+
fn finalize(self) -> core::result::Result<(), crate::basic::Utf8Error> {
130+
self.finalize()
131+
}
132+
}
133+
134+
#[cfg(feature = "public_imp")]
135+
impl crate::basic::imp::ChunkedUtf8Validator for Utf8ValidatorImp {
136+
#[inline]
137+
#[must_use]
138+
fn new() -> Self {
139+
Self {
140+
incomplete_data: [0; 4],
141+
incomplete_len: 0,
142+
err: false,
143+
}
144+
}
145+
146+
#[inline]
147+
fn update_from_chunks(&mut self, input: &[u8]) {
148+
self.update(input);
149+
}
150+
151+
#[inline]
152+
fn finalize(
153+
mut self,
154+
remaining_input: core::option::Option<&[u8]>,
155+
) -> core::result::Result<(), crate::basic::Utf8Error> {
156+
if let Some(remaining_input) = remaining_input {
157+
self.update(remaining_input);
158+
}
159+
self.finalize()
160+
}
161+
}

portable/src/implementation/mod.rs

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,49 +2,53 @@
22
33
#![forbid(unsafe_code)]
44

5+
pub(crate) mod fallback;
56
pub(crate) mod simd;
67

8+
cfg_if::cfg_if! {
9+
if #[cfg(feature = "force_fallback")] {
10+
pub(crate) use fallback as auto;
11+
} else if #[cfg(feature = "force_simd128")] {
12+
pub(crate) use simd::v128 as auto;
13+
} else if #[cfg(feature = "force_simd256")] {
14+
pub(crate) use simd::v256 as auto;
15+
16+
// known good configurations
17+
} else if #[cfg(all(
18+
any(target_arch = "x86_64", target_arch = "x86"),
19+
target_feature = "avx2"
20+
))] {
21+
pub(crate) use simd::v256 as auto;
22+
} else {
23+
pub(crate) use fallback as auto;
24+
}
25+
}
26+
727
#[inline]
8-
pub(crate) fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> {
28+
pub(crate) const fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> {
929
if input.len() < simd::SIMD_CHUNK_SIZE {
10-
return validate_utf8_basic_fallback(input);
30+
return fallback::validate_utf8_basic(input);
1131
}
1232

1333
validate_utf8_basic_simd(input)
1434
}
1535

1636
#[inline(never)]
17-
fn validate_utf8_basic_simd(input: &[u8]) -> Result<(), crate::basic::Utf8Error> {
18-
simd::auto::validate_utf8_basic(input)
37+
const fn validate_utf8_basic_simd(input: &[u8]) -> Result<(), crate::basic::Utf8Error> {
38+
auto::validate_utf8_basic(input)
1939
}
2040

2141
#[inline]
2242
pub(crate) fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error> {
2343
if input.len() < simd::SIMD_CHUNK_SIZE {
24-
return validate_utf8_compat_fallback(input);
44+
return fallback::validate_utf8_compat(input);
2545
}
2646

2747
validate_utf8_compat_simd(input)
2848
}
2949

3050
fn validate_utf8_compat_simd(input: &[u8]) -> Result<(), crate::compat::Utf8Error> {
31-
simd::auto::validate_utf8_compat(input)
32-
}
33-
34-
// fallback method implementations
35-
#[inline]
36-
pub(crate) const fn validate_utf8_basic_fallback(
37-
input: &[u8],
38-
) -> Result<(), crate::basic::Utf8Error> {
39-
match core::str::from_utf8(input) {
40-
Ok(_) => Ok(()),
41-
Err(_) => Err(crate::basic::Utf8Error {}),
42-
}
43-
}
44-
45-
#[inline]
46-
pub(crate) fn validate_utf8_compat_fallback(input: &[u8]) -> Result<(), crate::compat::Utf8Error> {
47-
validate_utf8_at_offset(input, 0)
51+
auto::validate_utf8_compat(input)
4852
}
4953

5054
type Utf8ErrorCompat = crate::compat::Utf8Error;
@@ -63,25 +67,3 @@ pub(crate) fn validate_utf8_at_offset(input: &[u8], offset: usize) -> Result<(),
6367
}),
6468
}
6569
}
66-
67-
#[cold]
68-
#[expect(clippy::unwrap_used)]
69-
#[allow(dead_code)] // only used if there is a SIMD implementation
70-
pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8ErrorCompat {
71-
let offset = if failing_block_pos == 0 {
72-
// Error must be in this block since it is the first.
73-
0
74-
} else {
75-
// The previous block is OK except for a possible continuation over the block boundary.
76-
// We go backwards over the last three bytes of the previous block and find the
77-
// last non-continuation byte as a starting point for an std validation. If the last
78-
// three bytes are all continuation bytes then the previous block ends with a four byte
79-
// UTF-8 codepoint, is thus complete and valid UTF-8. We start the check with the
80-
// current block in that case.
81-
(1..=3)
82-
.find(|i| input[failing_block_pos - i] >> 6 != 0b10)
83-
.map_or(failing_block_pos, |i| failing_block_pos - i)
84-
};
85-
// UNWRAP: safe because the SIMD UTF-8 validation found an error
86-
validate_utf8_at_offset(input, offset).unwrap_err()
87-
}

0 commit comments

Comments
 (0)