Skip to content

Commit

Permalink
perf: use simdutf8 to validate UTF-8 when reading files (#237)
Browse files Browse the repository at this point in the history
  • Loading branch information
Boshen authored Aug 26, 2024
1 parent 6c4297b commit 587bdab
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 4 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,14 @@ thiserror = "1.0.61"
json-strip-comments = "1.0.2"
indexmap = { version = "2.2.6", features = ["serde"] }
cfg-if = "1.0"
simdutf8 = { version = "0.1.4", features = ["aarch64_neon"] }

pnp = { version = "0.9.0", optional = true }

document-features = { version = "0.2.8", optional = true }

[dev-dependencies]
vfs = "0.12.0" # for testing with in memory file system
vfs = "0.12.0" # for testing with in memory file system
rayon = { version = "1.10.0" }
criterion2 = { version = "1.0.0", default-features = false }
normalize-path = { version = "0.2.1" }
Expand Down
20 changes: 17 additions & 3 deletions src/file_system.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,20 @@ impl Default for FileSystemOs {
}
}

/// Reads the entire file at `path` into a `String`, validating it as UTF-8 with `simdutf8`.
///
/// # Errors
///
/// Propagates any error returned by `std::fs::read`. If the file's bytes are not valid
/// UTF-8, returns an `io::ErrorKind::InvalidData` error carrying the same message that
/// `fs::read_to_string` produces.
fn read_to_string(path: &Path) -> io::Result<String> {
    let contents = std::fs::read(path)?;
    // Validate with `simdutf8` instead of `std::str::from_utf8` (which
    // `fs::read_to_string` uses internally) because it is faster.
    let is_utf8 = simdutf8::basic::from_utf8(&contents).is_ok();
    if is_utf8 {
        // SAFETY: `simdutf8` has just confirmed that `contents` is valid UTF-8.
        Ok(unsafe { String::from_utf8_unchecked(contents) })
    } else {
        // Same error as `fs::read_to_string` produces (`io::Error::INVALID_UTF8`).
        Err(io::Error::new(io::ErrorKind::InvalidData, "stream did not contain valid UTF-8"))
    }
}

impl FileSystem for FileSystemOs {
fn read_to_string(&self, path: &Path) -> io::Result<String> {
cfg_if! {
Expand All @@ -113,11 +127,11 @@ impl FileSystem for FileSystemOs {
VPath::Zip(info) => {
self.pnp_lru.read_to_string(info.physical_base_path(), info.zip_path)
}
VPath::Virtual(info) => fs::read_to_string(info.physical_base_path()),
VPath::Native(path) => fs::read_to_string(path),
VPath::Virtual(info) => read_to_string(&info.physical_base_path()),
VPath::Native(path) => read_to_string(&path),
}
} else {
fs::read_to_string(path)
read_to_string(path)
}
}
}
Expand Down

0 comments on commit 587bdab

Please sign in to comment.