From 4d037a000c7e22b7dd6b14a1f8c61c91ca4558a4 Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz <lukasza@chromium.org>
Date: Thu, 24 Oct 2024 18:29:49 +0000
Subject: [PATCH] Add fuzzer for `utf8::validate`.

---
 .github/workflows/fuzz.yml         | 24 +++++++++++++
 .gitignore                         |  2 +-
 fuzz/.gitignore                    |  4 +++
 fuzz/Cargo.toml                    | 23 +++++++++++++
 fuzz/fuzz_targets/utf8_validate.rs | 55 ++++++++++++++++++++++++++++++
 5 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/fuzz.yml
 create mode 100644 fuzz/.gitignore
 create mode 100644 fuzz/Cargo.toml
 create mode 100644 fuzz/fuzz_targets/utf8_validate.rs

diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
new file mode 100644
index 0000000..771492d
--- /dev/null
+++ b/.github/workflows/fuzz.yml
@@ -0,0 +1,24 @@
+name: Fuzzing
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  fuzzing:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install Rust
+      # cargo-fuzz needs nightly; rustup is preinstalled on GitHub-hosted runners.
+      run: |
+        rustup toolchain install nightly --profile minimal
+        rustup default nightly
+    - name: Install cargo-fuzz
+      run: |
+        cargo install cargo-fuzz
+    - name: Run Fuzzing
+      run: |
+        cargo fuzz run utf8_validate -- -max_total_time=180
diff --git a/.gitignore b/.gitignore
index 42cacb3..ca54338 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 .*.swp
 tags
 target
-/Cargo.lock
+Cargo.lock
diff --git a/fuzz/.gitignore b/fuzz/.gitignore
new file mode 100644
index 0000000..1a45eee
--- /dev/null
+++ b/fuzz/.gitignore
@@ -0,0 +1,4 @@
+target
+corpus
+artifacts
+coverage
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
new file mode 100644
index 0000000..1f85670
--- /dev/null
+++ b/fuzz/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "bstr-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2018"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+[dependencies.bstr]
+path = ".."
+
+[[bin]]
+name = "utf8_validate"
+path = "fuzz_targets/utf8_validate.rs"
+test = false
+doc = false
+bench = false
+
+[workspace]
diff --git a/fuzz/fuzz_targets/utf8_validate.rs b/fuzz/fuzz_targets/utf8_validate.rs
new file mode 100644
index 0000000..86cb214
--- /dev/null
+++ b/fuzz/fuzz_targets/utf8_validate.rs
@@ -0,0 +1,55 @@
+//! This fuzzer attempts to test the functional correctness of the `bstr::utf8::validate` function.
+//! This coverage is desirable, because some `unsafe` blocks in the `bstr` crate depend on the
+//! guarantees made by `utf8::validate` - e.g. the soundness of `bstr::ByteSlice::to_str` depends
+//! on these guarantees.
+//!
+//! The `utf8::validate` function is in a non-public module, which means that we can't test it
+//! directly.  Therefore we test via `bstr::ByteSlice::to_str` instead.
+//!
+//! We use the following [test oracle](https://en.wikipedia.org/wiki/Test_oracle) to validate
+//! results returned by `utf8::validate`:
+//!
+//! * A standard library implementation (`std::str::from_utf8` is analogous to
+//!   `bstr::ByteSlice::to_str` and `run_utf8_validation` in `core/str/validations.rs` is analogous
+//!   to `bstr::utf8::validate`).
+//!   <https://github.com/BurntSushi/bstr/issues/25#issuecomment-543835601> explains
+//!   why `bstr` doesn't reuse the standard library's implementation.
+//! * TODO: Consider also adding a manual, simple (and therefore hopefully "obviously correct")
+//!   implementation as another test oracle.
+
+#![no_main]
+
+use bstr::ByteSlice;
+use libfuzzer_sys::fuzz_target;
+
+/// Cross-checks `bstr::ByteSlice::to_str` against `std::str::from_utf8` on a single input.
+///
+/// Panics if the two disagree about validity, about the pointer/length of the returned
+/// string, or about the `error_len` / `valid_up_to` of the reported error.
+fn validate(data: &[u8]) {
+    // `bstr` is the implementation under test; `std` serves as the test oracle.
+    match (data.to_str(), std::str::from_utf8(data)) {
+        (Ok(from_bstr), Ok(from_std)) => {
+            // Both strings must alias the whole input: same start pointer, same length.
+            assert_eq!(data.as_ptr(), from_bstr.as_ptr());
+            assert_eq!(data.as_ptr(), from_std.as_ptr());
+            assert_eq!(data.len(), from_bstr.len());
+            assert_eq!(data.len(), from_std.len());
+        }
+        (Err(bstr_failure), Err(std_failure)) => {
+            // The failure must be reported identically: same `error_len`, same `valid_up_to`.
+            assert_eq!(bstr_failure.error_len(), std_failure.error_len());
+            assert_eq!(bstr_failure.valid_up_to(), std_failure.valid_up_to());
+        }
+        (Ok(_), Err(_)) => panic!("`bstr` succeeded but `std` failed"),
+        (Err(_), Ok(_)) => panic!("`bstr` failed but `std` succeeded"),
+    }
+}
+
+fuzz_target!(|data: &[u8]| {
+    // `utf8::validate` calls into `ascii::first_non_ascii_byte`, which is sensitive to
+    // alignment, so check every suffix starting at offsets 0..=16 (capped at the input length).
+    for suffix_start in 0..=data.len().min(16) {
+        validate(&data[suffix_start..]);
+    }
+});