toml-rs · Manishearth · Apr 26, 2024 · May 2, 2024 · epage · Apr 26, 2024
diff --git a/crates/toml_edit/src/parser/datetime.rs b/crates/toml_edit/src/parser/datetime.rs
@@ -255,6 +255,7 @@ pub(crate) fn unsigned_digits<'i, const MIN: usize, const MAX: usize>(
     input: &mut Input<'i>,
 ) -> PResult<&'i str> {
     take_while(MIN..=MAX, DIGIT)
+        // Safety: `DIGIT` only matches ASCII bytes
         .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
         .parse_next(input)
 }

diff --git a/crates/toml_edit/src/parser/key.rs b/crates/toml_edit/src/parser/key.rs
@@ -90,6 +90,7 @@ pub(crate) fn simple_key(input: &mut Input<'_>) -> PResult<(RawString, InternalS
 fn unquoted_key<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
     trace(
         "unquoted-key",
+        // Safety: UNQUOTED_CHAR is only ASCII ranges
         take_while(1.., UNQUOTED_CHAR)
             .map(|b| unsafe { from_utf8_unchecked(b, "`is_unquoted_char` filters out on-ASCII") }),
     )
@@ -101,6 +102,7 @@ pub(crate) fn is_unquoted_char(c: u8) -> bool {
     UNQUOTED_CHAR.contains_token(c)
 }
 
+// Safety-usable invariant: UNQUOTED_CHAR is only ASCII ranges
 const UNQUOTED_CHAR: (
     RangeInclusive<u8>,
     RangeInclusive<u8>,

diff --git a/crates/toml_edit/src/parser/numbers.rs b/crates/toml_edit/src/parser/numbers.rs
@@ -79,13 +79,15 @@ pub(crate) fn dec_int<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
             )),
         )
             .recognize()
+            // Safety: DIGIT1_9, digit(), and `_` only cover ASCII ranges
             .map(|b: &[u8]| unsafe {
                 from_utf8_unchecked(b, "`digit` and `_` filter out non-ASCII")
             })
             .context(StrContext::Label("integer")),
     )
     .parse_next(input)
 }
+/// Safety-usable invariant: DIGIT1_9 is only ASCII ranges
 const DIGIT1_9: RangeInclusive<u8> = b'1'..=b'9';
 
 // hex-prefix = %x30.78               ; 0x
@@ -114,11 +116,13 @@ pub(crate) fn hex_int<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
             ))
             .recognize(),
         )
+        // Safety: HEX_PREFIX, hexdig(), and `_` only cover ASCII ranges
         .map(|b| unsafe { from_utf8_unchecked(b, "`hexdig` and `_` filter out non-ASCII") })
         .context(StrContext::Label("hexadecimal integer")),
     )
     .parse_next(input)
 }
+/// Safety-usable invariant: HEX_PREFIX is ASCII only
 const HEX_PREFIX: &[u8] = b"0x";
 
 // oct-prefix = %x30.6F               ; 0o
@@ -147,12 +151,15 @@ pub(crate) fn oct_int<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
             ))
             .recognize(),
         )
+        // Safety: DIGIT0_7, OCT_PREFIX, and `_` only cover ASCII ranges
         .map(|b| unsafe { from_utf8_unchecked(b, "`DIGIT0_7` and `_` filter out non-ASCII") })
         .context(StrContext::Label("octal integer")),
     )
     .parse_next(input)
 }
+/// Safety-usable invariant: OCT_PREFIX is ASCII only
 const OCT_PREFIX: &[u8] = b"0o";
+/// Safety-usable invariant: DIGIT0_7 is ASCII only
 const DIGIT0_7: RangeInclusive<u8> = b'0'..=b'7';
 
 // bin-prefix = %x30.62               ; 0b
@@ -181,12 +188,15 @@ pub(crate) fn bin_int<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
             ))
             .recognize(),
         )
+        // Safety: DIGIT0_1, BIN_PREFIX, and `_` only cover ASCII ranges
         .map(|b| unsafe { from_utf8_unchecked(b, "`DIGIT0_1` and `_` filter out non-ASCII") })
         .context(StrContext::Label("binary integer")),
     )
     .parse_next(input)
 }
+/// Safety-usable invariant: BIN_PREFIX is ASCII only
 const BIN_PREFIX: &[u8] = b"0b";
+/// Safety-usable invariant: DIGIT0_1 is ASCII only
 const DIGIT0_1: RangeInclusive<u8> = b'0'..=b'1';
 
 // ;; Float
@@ -234,6 +244,7 @@ pub(crate) fn frac<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
     )
         .recognize()
         .map(|b: &[u8]| unsafe {
+            // Safety: `.` and `zero_prefixable_int` only handle ASCII
             from_utf8_unchecked(
                 b,
                 "`.` and `parse_zero_prefixable_int` filter out non-ASCII",
@@ -243,6 +254,7 @@ pub(crate) fn frac<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
 }
 
 // zero-prefixable-int = DIGIT *( DIGIT / underscore DIGIT )
+/// Safety-usable invariant: only produces ASCII
 pub(crate) fn zero_prefixable_int<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
     (
         digit,
@@ -261,8 +273,10 @@ pub(crate) fn zero_prefixable_int<'i>(input: &mut Input<'i>) -> PResult<&'i str>
         .map(|()| ()),
     )
         .recognize()
+        // Safety: `digit()` and `_` are all ASCII
         .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`digit` and `_` filter out non-ASCII") })
         .parse_next(input)
+    // Safety-usable invariant upheld by only using `digit` and `_` in the parser
 }
 
 // exp = "e" float-exp-part
@@ -275,6 +289,7 @@ pub(crate) fn exp<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
     )
         .recognize()
         .map(|b: &[u8]| unsafe {
+            // Safety: `e`, `E`, `+`, `-`, and `zero_prefixable_int` are all ASCII
             from_utf8_unchecked(
                 b,
                 "`one_of` and `parse_zero_prefixable_int` filter out non-ASCII",
@@ -305,15 +320,20 @@ pub(crate) fn nan(input: &mut Input<'_>) -> PResult<f64> {
 const NAN: &[u8] = b"nan";
 
 // DIGIT = %x30-39 ; 0-9
+/// Safety-usable invariant: only accepts a single ASCII byte (one of `DIGIT`)
 pub(crate) fn digit(input: &mut Input<'_>) -> PResult<u8> {
+    // Safety: DIGIT is `b'0'..=b'9'`, so every accepted byte is ASCII
     one_of(DIGIT).parse_next(input)
 }
 const DIGIT: RangeInclusive<u8> = b'0'..=b'9';
 
 // HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
+/// Safety-usable invariant: only accepts a single ASCII byte (one of `HEXDIG`)
 pub(crate) fn hexdig(input: &mut Input<'_>) -> PResult<u8> {
+    // Safety: HEXDIG is built solely from ASCII ranges (`0-9`, `A-F`, `a-f`)
     one_of(HEXDIG).parse_next(input)
 }
+/// Safety-usable invariant: only ASCII ranges
 pub(crate) const HEXDIG: (RangeInclusive<u8>, RangeInclusive<u8>, RangeInclusive<u8>) =
     (DIGIT, b'A'..=b'F', b'a'..=b'f');
 

diff --git a/crates/toml_edit/src/parser/strings.rs b/crates/toml_edit/src/parser/strings.rs
@@ -138,6 +138,7 @@ fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
 pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
     take_while(0..=N, HEXDIG)
         .verify(|b: &[u8]| b.len() == N)
+        // Safety: HEXDIG is ASCII-only
         .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
         .verify_map(|s| u32::from_str_radix(s, 16).ok())
         .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
@@ -217,13 +218,17 @@ fn mlb_quotes<'i>(
     move |input: &mut Input<'i>| {
         let start = input.checkpoint();
         let res = terminated(b"\"\"", peek(term.by_ref()))
+            // Safety: terminated returns the output of the first parser here,
+            // which only parses ASCII
             .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
             .parse_next(input);
 
         match res {
             Err(winnow::error::ErrMode::Backtrack(_)) => {
                 input.reset(&start);
                 terminated(b"\"", peek(term.by_ref()))
+                    // Safety: terminated returns the output of the first parser here,
+                    // which only parses ASCII
                     .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
                     .parse_next(input)
             }
@@ -346,13 +351,17 @@ fn mll_quotes<'i>(
     move |input: &mut Input<'i>| {
         let start = input.checkpoint();
         let res = terminated(b"''", peek(term.by_ref()))
+            // Safety: terminated returns the output of the first parser here,
+            // which only parses ASCII
             .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
             .parse_next(input);
 
         match res {
             Err(winnow::error::ErrMode::Backtrack(_)) => {
                 input.reset(&start);
                 terminated(b"'", peek(term.by_ref()))
+                    // Safety: terminated returns the output of the first parser here,
+                    // which only parses ASCII
                     .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
                     .parse_next(input)
             }

diff --git a/crates/toml_edit/src/parser/trivia.rs b/crates/toml_edit/src/parser/trivia.rs
@@ -11,6 +11,7 @@ use winnow::token::take_while;
 
 use crate::parser::prelude::*;
 
+/// Safety invariant: must be called with valid UTF-8 in `bytes`
 pub(crate) unsafe fn from_utf8_unchecked<'b>(
     bytes: &'b [u8],
     safety_justification: &'static str,
@@ -27,10 +28,12 @@ pub(crate) unsafe fn from_utf8_unchecked<'b>(
 
 // wschar = ( %x20 /              ; Space
 //            %x09 )              ; Horizontal tab
+/// Safety-usable invariant: WSCHAR is only ASCII values
 pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');
 
 // ws = *wschar
 pub(crate) fn ws<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
+    // Safety: WSCHAR only contains ASCII
     take_while(0.., WSCHAR)
         .map(|b| unsafe { from_utf8_unchecked(b, "`is_wschar` filters out on-ASCII") })
         .parse_next(input)
@@ -58,8 +61,10 @@ pub(crate) fn comment<'i>(input: &mut Input<'i>) -> PResult<&'i [u8]> {
 
 // newline = ( %x0A /              ; LF
 //             %x0D.0A )           ; CRLF
+/// Safety-usable invariant: Only returns ASCII bytes
 pub(crate) fn newline(input: &mut Input<'_>) -> PResult<u8> {
     alt((
+        // Safety: CR and LF are ASCII
         one_of(LF).value(b'\n'),
         (one_of(CR), one_of(LF)).value(b'\n'),
     ))
@@ -76,6 +81,7 @@ pub(crate) fn ws_newline<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
     )
     .map(|()| ())
     .recognize()
+    // Safety: `newline` and `WSCHAR` are all ASCII
     .map(|b| unsafe { from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII") })
     .parse_next(input)
 }
@@ -85,6 +91,7 @@ pub(crate) fn ws_newlines<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
     (newline, ws_newline)
         .recognize()
         .map(|b| unsafe {
+            // Safety: `newline` and `WSCHAR` are all ASCII
             from_utf8_unchecked(b, "`is_wschar` and `newline` filters out on-ASCII")
         })
         .parse_next(input)