hyperium · sbosnick · Apr 11, 2020 · Apr 11, 2020 · Apr 11, 2020 · Apr 12, 2020
diff --git a/src/uri/path.rs b/src/uri/path.rs
@@ -19,74 +19,56 @@ const NONE: u16 = ::std::u16::MAX;
 impl PathAndQuery {
     // Not public while `bytes` is unstable.
     pub(super) fn from_shared(mut src: Bytes) -> Result<Self, InvalidUri> {
-        let mut query = NONE;
-        let mut fragment = None;
-
-        // block for iterator borrow
-        {
-            let mut iter = src.as_ref().iter().enumerate();
-
-            // path ...
-            for (i, &b) in &mut iter {
-                // See https://url.spec.whatwg.org/#path-state
-                match b {
-                    b'?' => {
-                        debug_assert_eq!(query, NONE);
-                        query = i as u16;
-                        break;
-                    }
-                    b'#' => {
-                        fragment = Some(i);
-                        break;
-                    }
+        let (query, len) = src.as_ref().iter()
+            // stop at the fragment specifier
+            .take_while(|&&c| c != b'#')
+            .try_fold((None, 0u16), |(query, i), &c| {
+                match c {
+                    // match the query specifier if this is the first occurance
+                    b'?' if query.is_none() => Ok((Some(i), i+1)),
 
                     // This is the range of bytes that don't need to be
                     // percent-encoded in the path. If it should have been
                     // percent-encoded, then error.
+                    //
+                    // See https://url.spec.whatwg.org/#path-state
                     0x21 |
                     0x24..=0x3B |
                     0x3D |
                     0x40..=0x5F |
                     0x61..=0x7A |
                     0x7C |
-                    0x7E => {},
-
-                    _ => return Err(ErrorKind::InvalidUriChar.into()),
+                    0x7E => Ok((query, i+1)),
+
+                    // While queries *should* be percent-encoded, most
+                    // bytes are actually allowed...
+                    // See https://url.spec.whatwg.org/#query-state
+                    //
+                    // Allowed: 0x21 / 0x24 - 0x3B / 0x3D / 0x3F - 0x7E
+                    //
+                    // The list below is the bytes that are allowed in the
+                    // query but not in the path. The bytes allowed in both
+                    // are matched in the previous arm.
+                    0x3F |
+                    0x60 |
+                    0x7B |
+                    0x7D if query.is_some() => Ok((query, i+1)),
+
+                    // all bytes 0x80 and above match here (among others)
+                    // so all Ok() returns in the other match arms identify
+                    // a single byte UTF-8 code point
+                    _ => Err(InvalidUri(ErrorKind::InvalidUriChar)),
                 }
-            }
+            })?;
 
-            // query ...
-            if query != NONE {
-                for (i, &b) in iter {
-                    match b {
-                        // While queries *should* be percent-encoded, most
-                        // bytes are actually allowed...
-                        // See https://url.spec.whatwg.org/#query-state
-                        //
-                        // Allowed: 0x21 / 0x24 - 0x3B / 0x3D / 0x3F - 0x7E
-                        0x21 |
-                        0x24..=0x3B |
-                        0x3D |
-                        0x3F..=0x7E => {},
-
-                        b'#' => {
-                            fragment = Some(i);
-                            break;
-                        }
-
-                        _ => return Err(ErrorKind::InvalidUriChar.into()),
-                    }
-                }
-            }
-        }
-
-        if let Some(i) = fragment {
-            src.truncate(i);
-        }
+        src.truncate(len as usize);
 
         Ok(PathAndQuery {
+            // Safety: The try_fold() checks that each byte in the now truncated
+            // src is a single byte UTF-8 code point so src as a whole is valid
+            // UTF-8.
             data: unsafe { ByteStr::from_utf8_unchecked(src) },
-            query: query,
+            query: query.unwrap_or(NONE),
         })
     }
 
@@ -511,7 +493,40 @@ mod tests {
         assert_eq!("qr=%3", pq("/a/b?qr=%3").query().unwrap());
     }
 
+    #[test]
+    fn from_u8_slice_equates_with_string() {
+        assert_eq!("/", pq_bytes(&[b'/']));
+        assert_eq!("/ab", pq_bytes(&[b'/', b'a', b'b']));
+        assert_eq!("a", pq_bytes(&[b'a']));
+        assert_eq!("a?", pq_bytes(&[b'a', b'?']));
+        assert_eq!("a?b", pq_bytes(&[b'a', b'?', b'b']));
+        assert_eq!("a?{b}", pq_bytes(&[b'a', b'?', b'{', b'b', b'}']));
+        assert_eq!("a", pq_bytes(&[b'a', b'#', b'b']));
+        assert_eq!("a?b", pq_bytes(&[b'a', b'?', b'b', b'#', b'c']));
+
+        // This example has invalid UTF-8 in the fragment but the path
+        // and query are valid UTF-8
+        assert_eq!("a?b", pq_bytes(&[b'a', b'?', b'b', b'#', 0xc0]));
+    }
+
+    #[test]
+    fn from_invalid_u8_slice_is_error() {
+        assert!(is_invalid_pq_bytes(&[0xc0])); // invalid UTF-8
+        assert!(is_invalid_pq_bytes(&[b' '])); // need percent encoding
+        assert!(is_invalid_pq_bytes(&[b'{'])); // need percent encoding
+        assert!(is_invalid_pq_bytes(&[b'a', b'?', 0xc0])); // invalid UTF-8
+        assert!(is_invalid_pq_bytes(&[b'a', b'?', b' '])); // needs percent encoding
+    }
+
     fn pq(s: &str) -> PathAndQuery {
         s.parse().expect(&format!("parsing {}", s))
     }
+
+    fn pq_bytes(s: &[u8]) -> PathAndQuery {
+        PathAndQuery::try_from(s).expect(&format!("converting {:?}", s))
+    }
+
+    fn is_invalid_pq_bytes(s: &[u8]) -> bool {
+        PathAndQuery::try_from(s).is_err()
+    }
 }