From fc4e628ae7d9443e474ec0add1691b665c84a569 Mon Sep 17 00:00:00 2001
From: Cameron Bytheway <bytheway.cameron@gmail.com>
Date: Thu, 31 Oct 2024 11:04:25 -0600
Subject: [PATCH] refactor(duvet-core): various fixes needed for integration
 into main crate (#129)

* refactor: improve IETF parser

* refactor(duvet-core): various fixes needed for integration into main crate
---
 .gitattributes                                |   1 +
 .gitignore                                    |   1 +
 Cargo.toml                                    |   5 -
 duvet-core/Cargo.toml                         |   2 +
 duvet-core/src/diagnostic.rs                  |  29 +-
 duvet-core/src/env.rs                         |  32 +
 duvet-core/src/file.rs                        |  22 +-
 duvet-core/src/glob.rs                        |  47 +-
 duvet-core/src/hash.rs                        |   3 +-
 duvet-core/src/http.rs                        |  25 +-
 duvet-core/src/lib.rs                         |  15 +-
 duvet-core/src/path.rs                        |  70 +-
 duvet-core/src/query.rs                       |  10 +-
 duvet/Cargo.toml                              |   6 +-
 duvet/src/extract.rs                          |   4 +-
 duvet/src/extract/tests.rs                    |   4 +-
 duvet/src/lib.rs                              |   6 +
 duvet/src/main.rs                             |  20 +-
 duvet/src/object.rs                           | 126 ----
 duvet/src/report/html.rs                      |   5 +-
 duvet/src/report/mod.rs                       |   5 +-
 duvet/src/specification/ietf.rs               | 243 ++----
 duvet/src/specification/ietf/break_filter.rs  |  39 +
 duvet/src/specification/ietf/parser.rs        | 151 ++++
 duvet/src/specification/ietf/snapshots.tar.gz |   3 +
 duvet/src/specification/ietf/tests.rs         | 465 ++++++++++++
 duvet/src/specification/ietf/tokenizer.rs     | 705 ++++++++++++++++++
 duvet/src/specification/mod.rs                |  12 +-
 integration/snapshots/h3.snap                 |   4 +-
 integration/snapshots/s2n-quic.snap           |   4 +-
 integration/snapshots/s2n-tls.snap            |   4 +-
 xtask/src/tests.rs                            |  38 +
 32 files changed, 1725 insertions(+), 381 deletions(-)
 create mode 100644 duvet-core/src/env.rs
 delete mode 100644 duvet/src/object.rs
 create mode 100644 duvet/src/specification/ietf/break_filter.rs
 create mode 100644 duvet/src/specification/ietf/parser.rs
 create mode 100644 duvet/src/specification/ietf/snapshots.tar.gz
 create mode 100644 duvet/src/specification/ietf/tests.rs
 create mode 100644 duvet/src/specification/ietf/tokenizer.rs

diff --git a/.gitattributes b/.gitattributes
index f3c8d172..35175715 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 integration/snapshots/*.snap filter=lfs diff=lfs merge=lfs -text
+**/snapshots.tar.gz filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index 253578a9..c74ae1bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ Cargo.lock
 target
 .history
 *.snap.new
+/duvet/src/specification/ietf/snapshots
diff --git a/Cargo.toml b/Cargo.toml
index a299a7e8..6330da62 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,11 +7,6 @@ members = [
 ]
 resolver = "2"
 
-[profile.release]
-lto = true
-codegen-units = 1
-incremental = false
-
 [profile.bench]
 lto = true
 codegen-units = 1
diff --git a/duvet-core/Cargo.toml b/duvet-core/Cargo.toml
index ff2957ba..92d62f7b 100644
--- a/duvet-core/Cargo.toml
+++ b/duvet-core/Cargo.toml
@@ -13,6 +13,7 @@ http = ["dep:http", "reqwest"]
 testing = ["tracing-subscriber"]
 
 [dependencies]
+anyhow = "1"
 blake3 = "1"
 bytes = "1"
 duvet-macros = { version = "0.1", path = "../duvet-macros" }
@@ -24,6 +25,7 @@ miette = { version = "7", features = ["fancy"] }
 once_cell = "1"
 reqwest = { version = "0.12", optional = true }
 serde = { version = "1", features = ["derive", "rc"] }
+serde_json = "1"
 tokio = { version = "1", features = ["fs", "sync"] }
 tokio-util = "0.7"
 toml_edit = { version = "0.22", features = ["parse", "serde"] }
diff --git a/duvet-core/src/diagnostic.rs b/duvet-core/src/diagnostic.rs
index e1802021..770fcdd0 100644
--- a/duvet-core/src/diagnostic.rs
+++ b/duvet-core/src/diagnostic.rs
@@ -96,6 +96,24 @@ impl Diagnostic for Error {
     }
 }
 
+impl From<anyhow::Error> for Error {
+    fn from(err: anyhow::Error) -> Self {
+        Self::from(Report::msg(err))
+    }
+}
+
+impl From<std::io::Error> for Error {
+    fn from(err: std::io::Error) -> Self {
+        Self::from(Report::msg(err))
+    }
+}
+
+impl From<serde_json::Error> for Error {
+    fn from(err: serde_json::Error) -> Self {
+        Self::from(Report::msg(err))
+    }
+}
+
 impl From<Report> for Error {
     fn from(err: Report) -> Self {
         Self(Arc::new(err))
@@ -210,7 +228,7 @@ impl From<Vec<Error>> for Set {
 impl fmt::Display for Set {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         for error in self.errors.iter() {
-            writeln!(f, "{}", error)?;
+            writeln!(f, "{:?}", error)?;
         }
         Ok(())
     }
@@ -227,12 +245,3 @@ impl StdError for Set {
         Some(&self.main)
     }
 }
-
-/*
-impl Diagnostic for Set {
-    fn related<'a>(&'a self) -> Option<Box<dyn Iterator<Item = &'a dyn Diagnostic> + 'a>> {
-        let iter = self.errors.iter().map(|e| e as &dyn Diagnostic);
-        Some(Box::new(iter))
-    }
-}
-*/
diff --git a/duvet-core/src/env.rs b/duvet-core/src/env.rs
new file mode 100644
index 00000000..403fc74e
--- /dev/null
+++ b/duvet-core/src/env.rs
@@ -0,0 +1,51 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Per-thread view of the process arguments and working directory.
+//!
+//! Each thread starts from the values captured from the process and can replace its own
+//! copy with [`set_args`] and [`set_current_dir`]; other threads are not affected.
+
+use crate::{diagnostic::IntoDiagnostic, path::Path, Result};
+use core::cell::RefCell;
+use once_cell::sync::Lazy;
+use std::sync::Arc;
+
+/// Process arguments, captured on first use.
+///
+/// `std::env::args` panics on arguments that are not valid Unicode, so the arguments are
+/// read with `args_os` and converted lossily instead.
+static GLOBAL_ARGS: Lazy<Arc<[String]>> = Lazy::new(|| {
+    std::env::args_os()
+        .map(|arg| arg.to_string_lossy().into_owned())
+        .collect()
+});
+
+/// Process working directory (or the error from querying it), captured on first use.
+static GLOBAL_DIR: Lazy<Result<Path>> =
+    Lazy::new(|| std::env::current_dir().map(|v| v.into()).into_diagnostic());
+
+thread_local! {
+    static ARGS: RefCell<Arc<[String]>> = RefCell::new(GLOBAL_ARGS.clone());
+    static DIR: RefCell<Result<Path>> = RefCell::new(GLOBAL_DIR.clone());
+}
+
+/// Returns the arguments currently set for the calling thread.
+pub fn args() -> Arc<[String]> {
+    ARGS.with(|current| current.borrow().clone())
+}
+
+/// Replaces the arguments returned by [`args`] on the calling thread.
+pub fn set_args(args: Arc<[String]>) {
+    ARGS.with(|current| *current.borrow_mut() = args);
+}
+
+/// Returns the working directory currently set for the calling thread.
+pub fn current_dir() -> Result<Path> {
+    DIR.with(|current| current.borrow().clone())
+}
+
+/// Replaces the directory returned by [`current_dir`] on the calling thread.
+pub fn set_current_dir(dir: Path) {
+    DIR.with(|current| *current.borrow_mut() = Ok(dir));
+}
diff --git a/duvet-core/src/file.rs b/duvet-core/src/file.rs
index 8f019a02..64675bf0 100644
--- a/duvet-core/src/file.rs
+++ b/duvet-core/src/file.rs
@@ -91,6 +91,26 @@ impl SourceFile {
             .await
     }
 
+    pub async fn as_json<T>(&self) -> crate::Result<Arc<T>>
+    where
+        T: 'static + Send + Sync + serde::de::DeserializeOwned,
+    {
+        let path = self.path.clone(); // moved into the closure; attached to a parse error
+        let contents = self.contents.clone(); // moved into the closure; the bytes to parse
+        // Parses the contents as JSON. TODO can we get better errors by mapping string ranges?
+        crate::Cache::current()
+            .get_or_init(*self.hash(), move || {
+                crate::Query::from(
+                    serde_json::from_slice(contents.data())
+                        .map(Arc::new)
+                        .into_diagnostic()
+                        .wrap_err(path)
+                        .map_err(|err| err.into()),
+                )
+            })
+            .await
+    }
+
     pub fn substr(&self, v: &str) -> Option<Slice<SourceFile>> {
         unsafe {
             let beginning = self.as_bytes().as_ptr();
@@ -164,7 +184,7 @@ impl SourceCode for SourceFile {
 }
 
 #[derive(Clone, PartialEq, PartialOrd, Hash, Eq, Ord)]
-pub struct Slice<File> {
+pub struct Slice<File = SourceFile> {
     file: File,
     start: usize,
     end: usize,
diff --git a/duvet-core/src/glob.rs b/duvet-core/src/glob.rs
index 0a86a426..d268e7ef 100644
--- a/duvet-core/src/glob.rs
+++ b/duvet-core/src/glob.rs
@@ -6,25 +6,39 @@ use globset as g;
 use serde::de;
 use std::{str::FromStr, sync::Arc};
 
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub struct Glob {
-    set: Arc<g::GlobSet>,
+    set: Arc<(g::GlobSet, Vec<String>)>,
+}
+
+impl fmt::Debug for Glob {
+    /// Prints a lone pattern as a bare string and anything else as a list.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // `.1` holds the pattern strings kept alongside the compiled set
+        match self.set.1.as_slice() {
+            [only] => only.fmt(f),
+            patterns => patterns.fmt(f),
+        }
+    }
+}
 
 impl Glob {
     pub fn is_match<P: AsRef<std::path::Path>>(&self, path: &P) -> bool {
-        self.set.is_match(path)
+        self.set.0.is_match(path)
     }
 
     pub fn try_from_iter<T: IntoIterator<Item = I>, I: AsRef<str>>(
         iter: T,
     ) -> Result<Glob, g::Error> {
         let mut builder = g::GlobSetBuilder::new();
+        let mut display = vec![];
         for item in iter {
-            builder.add(g::Glob::new(item.as_ref())?);
+            let value = format_value(item.as_ref());
+            builder.add(g::Glob::new(&value)?);
+            display.push(value);
         }
         let set = builder.build()?;
-        let set = Arc::new(set);
+        let set = Arc::new((set, display));
         Ok(Self { set })
     }
 }
@@ -37,6 +51,14 @@ impl FromStr for Glob {
     }
 }
 
+impl TryFrom<&str> for Glob {
+    type Error = g::Error;
+    /// Parses `value` through the `FromStr` implementation.
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        Self::from_str(value)
+    }
+}
+
 impl<'de> de::Deserialize<'de> for Glob {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
@@ -67,12 +89,23 @@ impl<'de> de::Visitor<'de> for StringOrList {
         S: de::SeqAccess<'de>,
     {
         let mut builder = g::GlobSetBuilder::new();
+        let mut display = vec![];
         while let Some(value) = seq.next_element()? {
-            let item = g::Glob::new(value).map_err(serde::de::Error::custom)?;
+            let value = format_value(value);
+            let item = g::Glob::new(&value).map_err(serde::de::Error::custom)?;
             builder.add(item);
+            display.push(value);
         }
         let set = builder.build().map_err(serde::de::Error::custom)?;
-        let set = Arc::new(set);
+        let set = Arc::new((set, display));
         Ok(Glob { set })
     }
 }
+
+/// Returns `v` unchanged when it starts with `**/` or `/`; otherwise prefixes it with `**/`.
+fn format_value(v: &str) -> String {
+    let anchored = v.starts_with("**/") || v.starts_with('/');
+    let prefix = if anchored { "" } else { "**/" };
+    // an empty prefix leaves the pattern exactly as it was given
+    format!("{prefix}{v}")
+}
diff --git a/duvet-core/src/hash.rs b/duvet-core/src/hash.rs
index 2e163597..5e95ccee 100644
--- a/duvet-core/src/hash.rs
+++ b/duvet-core/src/hash.rs
@@ -9,8 +9,9 @@ pub struct Hash([u8; HASH_LEN]);
 
 impl fmt::Debug for Hash {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "0x")?;
         for byte in &self.0 {
-            write!(f, "{byte:#02x}")?;
+            write!(f, "{byte:02x}")?;
         }
         Ok(())
     }
diff --git a/duvet-core/src/http.rs b/duvet-core/src/http.rs
index 8e603c40..c697d807 100644
--- a/duvet-core/src/http.rs
+++ b/duvet-core/src/http.rs
@@ -13,14 +13,31 @@ use std::sync::Arc;
 pub use http::response::Parts;
 pub use reqwest::Client;
 
+/// Default headers installed on the shared client: `accept: text/plain`.
+fn default_headers() -> reqwest::header::HeaderMap {
+    use reqwest::header::{HeaderMap, HeaderValue, ACCEPT};
+    let mut headers = HeaderMap::new();
+    headers.insert(ACCEPT, HeaderValue::from_static("text/plain"));
+    headers
+}
+
 pub fn client() -> Query<Client> {
     #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)]
     struct Q;
 
-    // TODO configure the client more
-    //     - User-Agent headers
-    //     - Accept headers?
-    Cache::current().get_or_init(Q, || Query::from(Client::builder().build().unwrap()))
+    Cache::current().get_or_init(Q, || {
+        Query::from(
+            Client::builder()
+                .user_agent(concat!(
+                    env!("CARGO_PKG_NAME"),
+                    "/",
+                    env!("CARGO_PKG_VERSION")
+                ))
+                .default_headers(default_headers())
+                .build()
+                .unwrap(),
+        )
+    })
 }
 
 pub fn get_full<U>(url: U) -> Query<Result<(Arc<Parts>, Contents)>>
diff --git a/duvet-core/src/lib.rs b/duvet-core/src/lib.rs
index 613aa1b8..75ca3c05 100644
--- a/duvet-core/src/lib.rs
+++ b/duvet-core/src/lib.rs
@@ -1,6 +1,22 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+/// Returns early from the enclosing function when `$cond` is false.
+///
+/// `ensure!(cond)` returns `()`; `ensure!(cond, value)` returns `value`.
+#[macro_export]
+macro_rules! ensure {
+    ($cond:expr) => {
+        // `$crate::` keeps the nested call resolvable when the macro is invoked by path
+        $crate::ensure!($cond, ());
+    };
+    ($cond:expr, $otherwise:expr) => {
+        if !($cond) {
+            return $otherwise;
+        }
+    };
+}
+
 #[cfg(any(test, feature = "testing"))]
 pub mod testing;
 
@@ -8,6 +20,7 @@ mod cache;
 pub mod contents;
 pub mod diagnostic;
 pub mod dir;
+pub mod env;
 pub mod file;
 pub mod glob;
 pub mod hash;
@@ -24,4 +37,4 @@ pub use cache::Cache;
 pub use duvet_macros::*;
 pub use query::Query;
 
-pub type Result<T, E = diagnostic::Error> = core::result::Result<T, E>;
+pub type Result<T = (), E = diagnostic::Error> = core::result::Result<T, E>;
diff --git a/duvet-core/src/path.rs b/duvet-core/src/path.rs
index 7ac810a0..1529a842 100644
--- a/duvet-core/src/path.rs
+++ b/duvet-core/src/path.rs
@@ -1,16 +1,61 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-use core::fmt;
+use core::{cmp::Ordering, fmt};
 use serde::Deserialize;
 use std::{ffi::OsStr, ops::Deref, path::PathBuf, sync::Arc};
 
-#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Deserialize)]
+#[derive(Clone, Deserialize)]
 #[serde(transparent)]
 pub struct Path {
     path: Arc<OsStr>,
 }
 
+impl Path {
+    pub fn pop(&mut self) -> bool {
+        if let Some(parent) = self.parent() {
+            *self = parent.into();
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn push<V: AsRef<std::path::Path>>(&mut self, component: V) {
+        *self = self.join(component);
+    }
+
+    pub fn join<V: AsRef<std::path::Path>>(&self, component: V) -> Self {
+        self.as_ref().join(component).into()
+    }
+}
+
+impl PartialEq for Path {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_ref().eq(other.as_ref())
+    }
+}
+
+impl Eq for Path {}
+
+impl PartialOrd for Path {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Path {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.as_ref().cmp(other.as_ref())
+    }
+}
+
+impl core::hash::Hash for Path {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.as_ref().hash(state)
+    }
+}
+
 impl fmt::Debug for Path {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         self.as_ref().fmt(f)
@@ -19,7 +64,12 @@ impl fmt::Debug for Path {
 
 impl fmt::Display for Path {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        self.as_ref().display().fmt(f)
+        let path = self.as_ref();
+        let path = crate::env::current_dir()
+            .ok()
+            .and_then(|dir| path.strip_prefix(dir).ok())
+            .unwrap_or(path);
+        path.display().fmt(f)
     }
 }
 
@@ -65,6 +115,20 @@ impl From<PathBuf> for Path {
     }
 }
 
+impl From<&std::path::Path> for Path {
+    fn from(path: &std::path::Path) -> Self {
+        Self {
+            path: path.as_os_str().into(),
+        }
+    }
+}
+
+impl From<Path> for PathBuf {
+    fn from(value: Path) -> Self {
+        PathBuf::from(&value.path)
+    }
+}
+
 impl From<&Path> for Path {
     fn from(path: &Path) -> Self {
         Self {
diff --git a/duvet-core/src/query.rs b/duvet-core/src/query.rs
index 9dd1ecfe..2b5f8ee4 100644
--- a/duvet-core/src/query.rs
+++ b/duvet-core/src/query.rs
@@ -445,14 +445,16 @@ mod tests {
         let query = Query::new(async move { rx.await.unwrap() });
 
         let a = query.clone();
-        let a = tokio::spawn(async move { *a.get().await });
+        let a = async move { *a.get().await };
 
         let b = query;
-        let b = tokio::spawn(async move { *b.get().await });
+        let b = async move { *b.get().await };
 
         tx.send(123).unwrap();
 
-        assert_eq!(a.await.unwrap(), 123);
-        assert_eq!(b.await.unwrap(), 123);
+        let (a, b) = tokio::join!(a, b);
+
+        assert_eq!(a, 123);
+        assert_eq!(b, 123);
     }
 }
diff --git a/duvet/Cargo.toml b/duvet/Cargo.toml
index cdc2d766..b0400cfc 100644
--- a/duvet/Cargo.toml
+++ b/duvet/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "duvet"
 version = "0.3.0"
-description = "A code quality tool to help bound correctness."
+description = "A requirements traceability tool"
 authors = ["Cameron Bytheway <bythewc@amazon.com>", "Ryan Emery <ryanemer@amazon.com>"]
 edition = "2021"
 license = "Apache-2.0"
@@ -16,8 +16,10 @@ anyhow = "1"
 clap = { version = "4", features = ["derive"] }
 duvet-core = { version = "0.1", path = "../duvet-core" }
 fnv = { version = "1", default-features = false }
+futures = { version = "0.3" }
 glob = "0.3"
 lazy_static = "1"
+once_cell = "1"
 pathdiff = "0.2"
 pulldown-cmark = { version = "0.12", default-features = false }
 rayon = "1"
@@ -26,6 +28,8 @@ reqwest = { version = "0.12", features = ["blocking", "native-tls"] }
 serde = { version = "1", features = ["derive"] }
 slug = { version = "0.1" }
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 toml = "0.5"
 triple_accel = "0.4"
 url = "2"
diff --git a/duvet/src/extract.rs b/duvet/src/extract.rs
index 1e05760e..4530633e 100644
--- a/duvet/src/extract.rs
+++ b/duvet/src/extract.rs
@@ -69,9 +69,11 @@ pub struct Extract {
 impl Extract {
     pub async fn exec(&self) -> Result<(), Error> {
         let contents = self.target.load(self.spec_path.as_deref())?;
+        let local_path = self.target.local(self.spec_path.as_deref());
+        let contents = duvet_core::file::SourceFile::new(&*local_path, contents).unwrap();
+
         let spec = self.format.parse(&contents)?;
         let sections = extract_sections(&spec);
-        let local_path = self.target.local(self.spec_path.as_deref());
 
         if self.out.extension().is_some() {
             // assume a path with an extension is a single file
diff --git a/duvet/src/extract/tests.rs b/duvet/src/extract/tests.rs
index c0e510aa..5204a7bf 100644
--- a/duvet/src/extract/tests.rs
+++ b/duvet/src/extract/tests.rs
@@ -16,8 +16,10 @@ macro_rules! snapshot_test {
                 stringify!($name),
                 $ext,
             ));
+            let path = concat!(stringify!($name), $ext);
+            let contents = duvet_core::file::SourceFile::new(path, contents).unwrap();
 
-            let spec = Format::Auto.parse(contents).unwrap();
+            let spec = Format::Auto.parse(&contents).unwrap();
             let sections = extract_sections(&spec);
 
             let results: Vec<_> = sections
diff --git a/duvet/src/lib.rs b/duvet/src/lib.rs
index 4c9ad2e5..caaeec02 100644
--- a/duvet/src/lib.rs
+++ b/duvet/src/lib.rs
@@ -20,6 +20,7 @@ mod text;
 mod tests;
 
 pub use anyhow::Error;
+pub use duvet_core::Result;
 
 #[allow(clippy::large_enum_variant)]
 #[derive(Debug, Parser)]
@@ -42,6 +43,11 @@ impl Arguments {
     }
 }
 
+pub async fn run() -> Result {
+    arguments().await.exec().await?;
+    Ok(())
+}
+
 pub(crate) fn fnv<H: core::hash::Hash + ?Sized>(value: &H) -> u64 {
     use core::hash::Hasher;
     let mut hasher = fnv::FnvHasher::default();
diff --git a/duvet/src/main.rs b/duvet/src/main.rs
index 0bf00b50..a2fbce98 100644
--- a/duvet/src/main.rs
+++ b/duvet/src/main.rs
@@ -2,10 +2,24 @@
 // SPDX-License-Identifier: Apache-2.0
 
 fn main() {
+    let format = tracing_subscriber::fmt::format().compact(); // Use a less verbose output format.
+
+    let env_filter = tracing_subscriber::EnvFilter::builder()
+        .with_default_directive(tracing::Level::ERROR.into())
+        .with_env_var("DUVET_LOG")
+        .from_env()
+        .unwrap();
+
+    tracing_subscriber::fmt()
+        .with_env_filter(env_filter)
+        .event_format(format)
+        .with_test_writer()
+        .init();
+
     let cache = duvet_core::Cache::default();
     let fs = duvet_core::vfs::fs::Fs::default();
 
-    let runtime = tokio::runtime::Builder::new_multi_thread()
+    let runtime = tokio::runtime::Builder::new_current_thread()
         .on_thread_start({
             let cache = cache.clone();
             let fs = fs.clone();
@@ -19,8 +33,8 @@ fn main() {
         .unwrap();
 
     runtime.block_on(async {
-        if let Err(err) = duvet::arguments().await.exec().await {
-            eprintln!("{}", err);
+        if let Err(err) = duvet::run().await {
+            eprintln!("{err:?}");
             std::process::exit(1);
         }
     });
diff --git a/duvet/src/object.rs b/duvet/src/object.rs
deleted file mode 100644
index 2032a848..00000000
--- a/duvet/src/object.rs
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-use crate::{annotation::AnnotationSet, parser::Parser, Error};
-use anyhow::anyhow;
-use goblin::{
-    archive::Archive,
-    elf::Elf,
-    mach::{Mach, MachO, MultiArch},
-    pe::PE,
-    Object,
-};
-
-pub fn extract(buffer: &[u8], annotations: &mut AnnotationSet) -> Result<(), Error> {
-    let object = Object::parse(buffer)?;
-    (&object, buffer).load(annotations)?;
-
-    Ok(())
-}
-
-trait AnnoObject {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error>;
-}
-
-impl<'a> AnnoObject for (&Object<'a>, &'a [u8]) {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> {
-        match &self.0 {
-            Object::Elf(obj) => (obj, self.1).load(annotations),
-            Object::PE(obj) => (obj, self.1).load(annotations),
-            Object::Mach(obj) => (obj, self.1).load(annotations),
-            Object::Archive(obj) => (obj, self.1).load(annotations),
-            _ => Err(anyhow!("Unknown file format".to_string())),
-        }
-    }
-}
-
-impl<'a> AnnoObject for (&Elf<'a>, &'a [u8]) {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> {
-        let elf = &self.0;
-        for sect in &elf.section_headers {
-            if sect.sh_type != goblin::elf::section_header::SHT_NOTE {
-                continue;
-            }
-
-            if !elf
-                .shdr_strtab
-                .get(sect.sh_name)
-                .map_or(false, |r| r.ok() == Some(".note.compliance"))
-            {
-                continue;
-            }
-
-            let addr = sect.sh_offset as usize;
-            let len = sect.sh_size as usize;
-            for annotation in Parser(&self.1[addr..(addr + len)]) {
-                annotations.insert(annotation?);
-            }
-        }
-
-        Ok(())
-    }
-}
-
-impl<'a> AnnoObject for (&PE<'a>, &'a [u8]) {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> {
-        for section in &self.0.sections {
-            if Some(".debug_compliance") == section.real_name.as_deref() {
-                let addr = section.pointer_to_raw_data as usize;
-                let len = section.virtual_size as usize;
-
-                for annotation in Parser(&self.1[addr..(addr + len)]) {
-                    annotations.insert(annotation?);
-                }
-            }
-        }
-        Ok(())
-    }
-}
-
-impl<'a> AnnoObject for (&Mach<'a>, &'a [u8]) {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> {
-        match &self.0 {
-            Mach::Fat(obj) => (obj, self.1).load(annotations),
-            Mach::Binary(obj) => obj.load(annotations),
-        }
-    }
-}
-
-impl<'a> AnnoObject for (&MultiArch<'a>, &'a [u8]) {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> {
-        for arch in self.0.iter_arches() {
-            let arch = arch?;
-            extract(arch.slice(self.1), annotations)?;
-        }
-        Ok(())
-    }
-}
-
-impl<'a> AnnoObject for MachO<'a> {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> {
-        for sections in self.segments.sections() {
-            for (section, data) in sections.flatten() {
-                if let (b"__DATA\0\0\0\0\0\0\0\0\0\0", b"__compliance\0\0\0\0") =
-                    (&section.segname, &section.sectname)
-                {
-                    for annotation in Parser(data) {
-                        annotations.insert(annotation?);
-                    }
-                }
-            }
-        }
-
-        Ok(())
-    }
-}
-
-impl<'a> AnnoObject for (&Archive<'a>, &'a [u8]) {
-    fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> {
-        for member in self.0.members() {
-            if let Ok(contents) = self.0.extract(member, self.1) {
-                let _ = extract(contents, annotations);
-            }
-        }
-        Ok(())
-    }
-}
diff --git a/duvet/src/report/html.rs b/duvet/src/report/html.rs
index 56da05ff..84364756 100644
--- a/duvet/src/report/html.rs
+++ b/duvet/src/report/html.rs
@@ -51,7 +51,10 @@ pub fn report_writer<Output: Write>(
     w!("<body>");
     w!("<div id=root></div>");
     w!(r#"<script>"#);
-    w!(include_str!("../../www/public/script.js"));
+    w!(include_str!(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/www/public/script.js"
+    )));
     w!(r#"</script>"#);
     w!("</body>");
     w!("</html>");
diff --git a/duvet/src/report/mod.rs b/duvet/src/report/mod.rs
index 44eee6f2..32cb6717 100644
--- a/duvet/src/report/mod.rs
+++ b/duvet/src/report/mod.rs
@@ -115,7 +115,10 @@ impl Report {
         let contents: HashMap<_, _> = targets
             .par_iter()
             .map(|target| {
-                let contents = target.path.load(self.project.spec_path.as_deref()).unwrap();
+                let spec_path = self.project.spec_path.as_deref();
+                let path = target.path.local(spec_path);
+                let contents = target.path.load(spec_path).unwrap();
+                let contents = duvet_core::file::SourceFile::new(path, contents).unwrap();
                 (target, contents)
             })
             .collect();
diff --git a/duvet/src/specification/ietf.rs b/duvet/src/specification/ietf.rs
index 7122d28b..5b507728 100644
--- a/duvet/src/specification/ietf.rs
+++ b/duvet/src/specification/ietf.rs
@@ -1,201 +1,54 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-use super::{Section, Specification, Str};
-use crate::{sourcemap::LinesIter, Error};
-use core::ops::Deref;
-use lazy_static::lazy_static;
-use regex::Regex;
-
-lazy_static! {
-    static ref SECTION_HEADER_RE: Regex = Regex::new(r"^(([A-Z]\.)?[0-9\.]+)\s+(.*)").unwrap();
-    static ref APPENDIX_HEADER_RE: Regex = Regex::new(r"^Appendix ([A-Z]\.)\s+(.*)").unwrap();
-
-    /// Table of contents have at least 5 periods
-    static ref TOC_RE: Regex = Regex::new(r"\.{5,}").unwrap();
-}
-
-pub fn parse(contents: &str) -> Result<Specification, Error> {
-    let mut parser = Parser::default();
-
-    for line in LinesIter::new(contents) {
-        parser.on_line(line)?;
-    }
-
-    let mut spec = parser.done()?;
-
-    spec.format = super::Format::Ietf;
-
-    Ok(spec)
-}
-
-#[derive(Debug, Default)]
-pub struct Parser<'a> {
-    spec: Specification<'a>,
-    state: ParserState<'a>,
-}
-
-#[derive(Debug)]
-pub enum ParserState<'a> {
-    Init,
-    Section { section: Section<'a>, indent: usize },
-}
-
-impl Default for ParserState<'_> {
-    fn default() -> Self {
-        Self::Init
-    }
-}
-
-fn section_header(line: Str) -> Option<Section> {
-    let full_title = line;
-    if let Some(info) = SECTION_HEADER_RE.captures(&line) {
-        let id = info.get(1)?;
-        let title = info.get(3)?;
-
-        if TOC_RE.is_match(title.as_str()) {
-            return None;
-        }
-
-        let id = line.slice(id.range()).trim_end_matches('.');
-        let id = match id.chars().next() {
-            Some('0'..='9') => format!("section-{}", id),
-            _ => format!("appendix-{}", id),
-        };
-        let title = line.slice(title.range()).to_string();
-
-        Some(Section {
-            id,
-            title,
-            full_title,
-            lines: vec![],
+use super::{Format, Line, Section, Specification, Str};
+use crate::Error;
+use duvet_core::file::SourceFile;
+
+pub mod break_filter;
+pub mod parser;
+pub mod tokenizer;
+
+#[cfg(test)]
+mod tests;
+
+pub fn parse(contents: &SourceFile) -> Result<Specification, Error> {
+    let tokens = tokenizer::tokens(contents);
+    let tokens = break_filter::break_filter(tokens);
+    let parser = parser::parse(tokens);
+
+    let pos = |substr: &str| substr.as_ptr() as usize - contents.as_ptr() as usize;
+    let substr = |substr: &str, line: usize| {
+        let pos = pos(substr);
+        let value = &contents[pos..pos + substr.len()];
+        Str { value, pos, line }
+    };
+
+    let sections = parser
+        .map(|section| {
+            let id = section.id.to_string();
+
+            let section = Section {
+                title: section.title.to_string(),
+                id: id.clone(),
+                full_title: substr(&section.title, section.line),
+                lines: section
+                    .lines
+                    .into_iter()
+                    .map(|(line, value)| {
+                        let value = substr(&value, line);
+                        Line::Str(value)
+                    })
+                    .collect(),
+            };
+
+            (id, section)
         })
-    } else if let Some(info) = APPENDIX_HEADER_RE.captures(&line) {
-        let id = info.get(1)?;
-        let title = info.get(2)?;
+        .collect();
 
-        if TOC_RE.is_match(title.as_str()) {
-            return None;
-        }
-
-        let id = line.slice(id.range()).trim_end_matches('.');
-        let id = format!("appendix-{}", id);
-        let title = line.slice(title.range()).to_string();
-
-        Some(Section {
-            id,
-            title,
-            full_title,
-            lines: vec![],
-        })
-    } else {
-        None
-    }
+    Ok(Specification {
+        title: None,
+        sections,
+        format: Format::Ietf,
+    })
 }
-
-impl<'a> Parser<'a> {
-    pub fn on_line(&mut self, line: Str<'a>) -> Result<(), Error> {
-        // remove footer marker
-        if line.deref() == "\u{c}" {
-            return Ok(());
-        }
-
-        match core::mem::replace(&mut self.state, ParserState::Init) {
-            ParserState::Init => {
-                if let Some(section) = section_header(line) {
-                    self.state = ParserState::Section {
-                        section,
-                        indent: usize::MAX,
-                    };
-                }
-            }
-            ParserState::Section {
-                mut section,
-                indent,
-            } => {
-                let line_indent = line.indentation();
-
-                // dedup whitespace
-                if line_indent == line.len()
-                    && section.lines.last().map(|l| !l.is_empty()).unwrap_or(false)
-                {
-                    section.lines.push(line.trim().into());
-
-                    // most likely the footer/header
-                    self.state = ParserState::Section { section, indent };
-
-                    return Ok(());
-                }
-
-                if line_indent == 0 {
-                    if let Some(new_section) = section_header(line) {
-                        self.on_section(section, indent);
-                        self.state = ParserState::Section {
-                            section: new_section,
-                            indent: usize::MAX,
-                        };
-                    } else {
-                        // most likely the footer/header
-                        self.state = ParserState::Section { section, indent };
-                    }
-
-                    return Ok(());
-                }
-
-                section.lines.push(line.into());
-
-                self.state = ParserState::Section {
-                    section,
-                    indent: indent.min(line_indent),
-                };
-            }
-        }
-
-        Ok(())
-    }
-
-    fn on_section(&mut self, mut section: Section<'a>, indent: usize) {
-        for content in &mut section.lines {
-            if let super::Line::Str(content) = content {
-                if !content.is_empty() {
-                    let range = indent..content.len();
-                    *content = content.slice(range);
-                }
-            }
-        }
-
-        // remove last whitespace
-        if section.lines.last().map(|l| l.is_empty()).unwrap_or(false) {
-            section.lines.pop();
-        }
-
-        let id = section.id.clone();
-        self.spec.sections.insert(id, section);
-    }
-
-    pub fn done(mut self) -> Result<Specification<'a>, Error> {
-        match core::mem::replace(&mut self.state, ParserState::Init) {
-            ParserState::Init => Ok(self.spec),
-            ParserState::Section { section, indent } => {
-                self.on_section(section, indent);
-                Ok(self.spec)
-            }
-        }
-    }
-}
-
-// macro_rules! ietf_test {
-//     ($name:ident, $file:expr) => {
-//         #[ignore]   // TODO: Escaping of apostrophes has changed in Rust beta, so the Compliance snapshot tests are failing [rust-lang/rust#83079](https://github.com/rust-lang/rust/pull/83079)
-//         #[test]
-//         fn $name() {
-//             let res = parse(include_str!(concat!(
-//                 env!("CARGO_MANIFEST_DIR"),
-//                 "/../specs/",
-//                 $file
-//             )))
-//             .unwrap();
-//             insta::assert_debug_snapshot!($file, res);
-//         }
-//     };
-// }
diff --git a/duvet/src/specification/ietf/break_filter.rs b/duvet/src/specification/ietf/break_filter.rs
new file mode 100644
index 00000000..ca90cbd9
--- /dev/null
+++ b/duvet/src/specification/ietf/break_filter.rs
@@ -0,0 +1,39 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use super::tokenizer::{Break, Token};
+
+/// Drops tokens that carry no section content: repeated breaks, page headers and
+/// footers, and the first content line after a page break
+pub fn break_filter<T: Iterator<Item = Token>>(tokens: T) -> impl Iterator<Item = Token> {
+    // kind of break (real or synthesized) left behind by the previous token, if any
+    let mut pending = None;
+    tokens.filter(move |token| {
+        // consume the marker; each arm that needs one for the next token sets it again
+        let prev = pending.take();
+
+        match token {
+            // section starts are always kept
+            Token::Section { .. } | Token::Appendix { .. } | Token::NamedSection { .. } => true,
+            Token::Break { ty, .. } => {
+                pending = Some(*ty);
+
+                // dedupe breaks: only the first one of a run survives
+                prev.is_none()
+            }
+            // if we previously had a page break then ignore the next line - it's a header that
+            // didn't tokenize correctly
+            Token::Content { .. } if matches!(prev, Some(Break::Page)) => {
+                pending = Some(Break::Line);
+                false
+            }
+            // any other content line is kept
+            Token::Content { .. } => true,
+            Token::Header { .. } => {
+                // set up a break since we skipped a line
+                pending = Some(Break::Line);
+                false
+            }
+        }
+    })
+}
diff --git a/duvet/src/specification/ietf/parser.rs b/duvet/src/specification/ietf/parser.rs
new file mode 100644
index 00000000..9ca7b446
--- /dev/null
+++ b/duvet/src/specification/ietf/parser.rs
@@ -0,0 +1,151 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use super::tokenizer::Token;
+use core::fmt;
+use duvet_core::file::Slice;
+
+pub fn parse<T: IntoIterator<Item = Token>>(tokens: T) -> Parser<T::IntoIter> {
+    // no section is open until the first section token arrives
+    let section = None;
+    let tokens = tokens.into_iter();
+    Parser { section, tokens }
+}
+
+pub struct Parser<T> {
+    section: Option<Section>, // section currently being collected, if any
+    tokens: T, // source of the tokens still to be consumed
+}
+
+pub struct Section {
+    pub id: Id,
+    pub title: Slice,
+    pub line: usize, // line number of the token that opened the section
+    pub lines: Vec<(usize, Slice)>, // (line number, contents) of each collected body line
+}
+
+pub enum Id {
+    Section(Slice), // displayed as `section-{id}`
+    Appendix(Slice), // displayed as `appendix-{id}`
+    NamedSection(Slice), // holds the section title; displayed as `named-{title}`
+}
+
+impl fmt::Display for Id {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Section(id) => write!(f, "section-{id}"),
+            Self::Appendix(id) => write!(f, "appendix-{id}"),
+            // TODO slugify the title
+            Self::NamedSection(title) => write!(f, "named-{title}"),
+        }
+    }
+}
+
+impl Section {
+    /// Appends a line to the body, dropping blank lines that precede the first
+    /// non-blank one.
+    fn push(&mut self, line: usize, value: Slice) {
+        let is_leading_blank = self.lines.is_empty() && value.trim().is_empty();
+        if !is_leading_blank {
+            self.lines.push((line, value));
+        }
+    }
+}
+
+impl<T: Iterator<Item = Token>> Parser<T> {
+    /// Feeds one token into the parser.
+    ///
+    /// A token that starts a section returns the section that was open before it (if
+    /// any); every other token returns `None`.
+    fn on_token(&mut self, token: Token) -> Option<Section> {
+        match token {
+            Token::Section { id, title, line } => {
+                let id = Id::Section(id);
+                self.open(id, title, line)
+            }
+            Token::Appendix { id, title, line } => {
+                let id = Id::Appendix(id);
+                self.open(id, title, line)
+            }
+            Token::NamedSection { title, line } => {
+                // named sections have no number, so the title doubles as the id
+                let id = Id::NamedSection(title.clone());
+                self.open(id, title, line)
+            }
+            Token::Break { line, value, ty: _ } => {
+                // lines seen before the first section are dropped
+                if let Some(section) = self.section.as_mut() {
+                    // just get the line offset: an empty slice at the start of the line
+                    // stands in for the break
+                    let trimmed = &value[0..0];
+                    let value = value.file().substr(trimmed).unwrap();
+                    section.push(line, value);
+                }
+
+                None
+            }
+            Token::Content { line, value } => {
+                // lines seen before the first section are dropped
+                if let Some(section) = self.section.as_mut() {
+                    section.push(line, value);
+                }
+
+                None
+            }
+            Token::Header { value: _, line: _ } => {
+                // ignore headers
+                None
+            }
+        }
+    }
+
+    /// Closes the section that is currently open and starts collecting a new one.
+    ///
+    /// Returns the closed section, if there was one.
+    fn open(&mut self, id: Id, title: Slice, line: usize) -> Option<Section> {
+        let prev = self.flush();
+
+        self.section = Some(Section {
+            id,
+            title,
+            line,
+            lines: vec![],
+        });
+
+        prev
+    }
+
+    /// Takes the open section, if any, and strips the blank lines at its end.
+    ///
+    /// Leaves the parser without an open section.
+    fn flush(&mut self) -> Option<Section> {
+        let mut section = self.section.take()?;
+
+        // trim any trailing lines: pop from the back until a line with visible text is
+        // found (or nothing is left)
+        while let Some((_lineno, line)) = section.lines.last() {
+            if !line.trim().is_empty() {
+                break;
+            }
+
+            section.lines.pop();
+        }
+
+        Some(section)
+    }
+}
+
+impl<T: Iterator<Item = Token>> Iterator for Parser<T> {
+    type Item = Section;
+
+    /// Consumes tokens until one of them closes a section and yields that section.
+    fn next(&mut self) -> Option<Self::Item> {
+        while let Some(token) = self.tokens.next() {
+            if let Some(section) = self.on_token(token) {
+                return Some(section);
+            }
+        }
+        // out of tokens: hand out whatever section is still open
+        self.flush()
+    }
+}
diff --git a/duvet/src/specification/ietf/snapshots.tar.gz b/duvet/src/specification/ietf/snapshots.tar.gz
new file mode 100644
index 00000000..dec9ce39
--- /dev/null
+++ b/duvet/src/specification/ietf/snapshots.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c39aefea4476c89a2f71a1885eb9b1d1b0549e8d2a3715a7c667ef7ec786e540
+size 573450240
diff --git a/duvet/src/specification/ietf/tests.rs b/duvet/src/specification/ietf/tests.rs
new file mode 100644
index 00000000..ca643986
--- /dev/null
+++ b/duvet/src/specification/ietf/tests.rs
@@ -0,0 +1,465 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use super::tokenizer::{self, tokens, Token};
+
+const HIGHEST_KNOWN_ID: usize = 9663;
+
+macro_rules! tests {
+    ($(($name:ident, $id:expr)),* $(,)?) => {
+        $(
+            #[tokio::test]
+            async fn $name() {
+                test_range($id..$id + 100).await // the 100 RFC numbers starting at $id
+            }
+        )*
+    }
+}
+
+tests!(
+    (rfc_30xx, 3000),
+    (rfc_31xx, 3100),
+    (rfc_32xx, 3200),
+    (rfc_33xx, 3300),
+    (rfc_34xx, 3400),
+    (rfc_35xx, 3500),
+    (rfc_36xx, 3600),
+    (rfc_37xx, 3700),
+    (rfc_38xx, 3800),
+    (rfc_39xx, 3900),
+);
+
+tests!(
+    (rfc_40xx, 4000),
+    (rfc_41xx, 4100),
+    (rfc_42xx, 4200),
+    (rfc_43xx, 4300),
+    (rfc_44xx, 4400),
+    (rfc_45xx, 4500),
+    (rfc_46xx, 4600),
+    (rfc_47xx, 4700),
+    (rfc_48xx, 4800),
+    (rfc_49xx, 4900),
+);
+
+tests!(
+    (rfc_50xx, 5000),
+    (rfc_51xx, 5100),
+    (rfc_52xx, 5200),
+    (rfc_53xx, 5300),
+    (rfc_54xx, 5400),
+    (rfc_55xx, 5500),
+    (rfc_56xx, 5600),
+    (rfc_57xx, 5700),
+    (rfc_58xx, 5800),
+    (rfc_59xx, 5900),
+);
+
+tests!(
+    (rfc_60xx, 6000),
+    (rfc_61xx, 6100),
+    (rfc_62xx, 6200),
+    (rfc_63xx, 6300),
+    (rfc_64xx, 6400),
+    (rfc_65xx, 6500),
+    (rfc_66xx, 6600),
+    (rfc_67xx, 6700),
+    (rfc_68xx, 6800),
+    (rfc_69xx, 6900),
+);
+
+tests!(
+    (rfc_70xx, 7000),
+    (rfc_71xx, 7100),
+    (rfc_72xx, 7200),
+    (rfc_73xx, 7300),
+    (rfc_74xx, 7400),
+    (rfc_75xx, 7500),
+    (rfc_76xx, 7600),
+    (rfc_77xx, 7700),
+    (rfc_78xx, 7800),
+    (rfc_79xx, 7900),
+);
+
+tests!(
+    (rfc_80xx, 8000),
+    (rfc_81xx, 8100),
+    (rfc_82xx, 8200),
+    (rfc_83xx, 8300),
+    (rfc_84xx, 8400),
+    (rfc_85xx, 8500),
+    (rfc_86xx, 8600),
+    (rfc_87xx, 8700),
+    (rfc_88xx, 8800),
+    (rfc_89xx, 8900),
+);
+
+tests!(
+    (rfc_90xx, 9000),
+    (rfc_91xx, 9100),
+    (rfc_92xx, 9200),
+    (rfc_93xx, 9300),
+    (rfc_94xx, 9400),
+    (rfc_95xx, 9500),
+    (rfc_96xx, 9600),
+    (rfc_97xx, 9700),
+    (rfc_98xx, 9800),
+    (rfc_99xx, 9900),
+);
+
+async fn test_range(range: core::ops::Range<usize>) {
+    let mut saw_any = false;
+    for rfc in range {
+        saw_any |= test_rfc(rfc).await; // `|=` does not short-circuit: every RFC is checked
+    }
+
+    assert!(saw_any, "missing RFC download - run `cargo xtask test`");
+}
+
+async fn test_rfc(rfc: usize) -> bool {
+    let etc = std::path::Path::new(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../target/www.rfc-editor.org"
+    ));
+
+    // these RFCs don't have any sections
+    let empty = [
+        3005, 3099, 3129, 3199, 3232, 3268, 3299, 3364, 3442, 3494, 3499, 3599, 3818,
+    ];
+
+    // these RFCs have empty section titles
+    let empty_titles = [
+        (3002, "4.1.1"),
+        (3002, "4.1.2"),
+        (3002, "4.1.3"),
+        (3002, "4.2.1"),
+        (3002, "4.2.2"),
+        (3002, "4.3.1"),
+        (3002, "4.3.2"),
+        (3002, "4.3.3"),
+        (3002, "4.3.4"),
+        (3002, "4.4.1"),
+        (3002, "4.4.2"),
+        (3002, "4.4.3"),
+        (3002, "4.5.1"),
+        (3002, "4.5.2"),
+        (3002, "4.5.3"),
+        (3002, "4.5.4"),
+        (3002, "4.5.5"),
+        (3002, "4.5.6"),
+        (3002, "4.6.1"),
+        (3002, "4.6.2"),
+        (3002, "4.7.1"),
+        (3002, "4.7.2"),
+        (3172, "A"),
+        (3258, "A"),
+        (3304, "2.1.1"),
+        (3304, "2.1.2"),
+        (3304, "2.1.3"),
+        (3304, "2.1.4"),
+        (3304, "2.1.5"),
+        (3304, "2.1.6"),
+        (3304, "2.1.7"),
+        (3304, "2.1.8"),
+        (3304, "2.1.9"),
+        (3304, "2.1.10"),
+        (3304, "2.1.11"),
+        (3304, "2.1.12"),
+        (3304, "2.2.1"),
+        (3304, "2.2.2"),
+        (3304, "2.2.3"),
+        (3304, "2.2.4"),
+        (3304, "2.2.5"),
+        (3304, "2.2.6"),
+        (3304, "2.2.7"),
+        (3304, "2.2.8"),
+        (3304, "2.2.9"),
+        (3304, "2.2.10"),
+        (3304, "2.2.11"),
+        (3304, "2.3.1"),
+        (3304, "2.3.2"),
+        (3304, "2.3.3"),
+        (3304, "2.3.4"),
+        (3332, "A"),
+        (3411, "A"),
+        (3552, "A"),
+        (4009, "B.1"),
+        (4009, "B.2"),
+        (4009, "B.3"),
+        (4009, "B.4"),
+        (4233, "A"),
+        (4269, "B.1"),
+        (4269, "B.2"),
+        (4269, "B.3"),
+        (4269, "B.4"),
+        (4523, "A"),
+        (4666, "A"),
+        (4951, "A"),
+        (4951, "B"),
+        (4951, "C"),
+    ];
+
+    // RFCs that use numbers for appendix IDs
+    let number_appendix_ids = [
+        (3175, "1"),
+        (3946, "1"),
+        (3549, "1"),
+        (4258, "1"),
+        (4606, "1"),
+    ];
+
+    // RFCs that use roman numerals
+    let roman_appendix_ids = [(5357, "I")];
+
+    // RFCs that have indented sections
+    let indented_sections = [(3003, "4")];
+
+    // these RFCs skip/reorder sections
+    let skips = [
+        (1050, "11.1"),
+        (1125, "11"),
+        (3090, "10"),
+        (3132, "4.1.2.4"),
+        (3134, "1.2.31"),
+        (3162, "2.3"),
+        (3186, "2.3.5"),
+        (3204, "3"),
+        (3208, "9.7.3"),
+        (3212, "10"),
+        (3234, "1.4"),
+        (3257, "8"),
+        (3258, "7"),
+        (3261, "F1"),
+        (3261, "25"),
+        (3284, "5.6"),
+        (3296, "5.6"),
+        (3326, "8"),
+        (3326, "7"),
+        (3326, "9"),
+        (3331, "11.0"),
+        (3348, "5"),
+        (3383, "10"),
+        (3428, "16"),
+        (3475, "9"),
+        (3509, "10"),
+        (3568, "8"),
+        (3608, "F1"),
+        (3608, "6.4.2"),
+        (3608, "7"),
+        (3671, "3.13"),
+        (3701, "5"),
+        (3810, "5.1.7"),
+        (3825, "6"),
+        (3868, "7.3.4"),
+        (3877, "3.3.5"),
+        (3929, "10"),
+        (4037, "16"),
+        (4160, "4.6"),
+        (4469, "9"),
+        (4540, "3.5.16"),
+        (4540, "5.3.17"),
+        (4604, "8"),
+        (4715, "10"),
+        (4842, "18"),
+        (4853, "6"),
+        (5013, "10"),
+        (5322, "7"),
+        (5570, "5.1.5"),
+        (5805, "4.4"),
+        (5849, "6"),
+        (5850, "5"),
+        (5858, "8"),
+        (5892, "8"),
+        (6219, "11"),
+        (6484, "1.5.4"),
+        (6484, "5.4.8"),
+        (6484, "5.6"),
+        (6485, "9"),
+        (6722, "5"),
+        (6730, "12"),
+    ];
+
+    // these RFCs have duplicate sections
+    let duplicate = [
+        (3063, "6.2.1"),
+        (3063, "A.5.2"),
+        (3093, "3.2"),
+        (3119, "11"),
+        (3131, "10"),
+        (3250, "3"),
+        (3284, "5.4"),
+        (3302, "6"),
+        (3414, "12.1"),
+        (3418, "6.1"),
+        (3476, "8"),
+        (3562, "3"),
+        (3640, "A"),
+        (3745, "6"),
+        (3785, "6.1"),
+        (3946, "1"), // uses both Appendix and Annex
+        (4511, "C.2.1"),
+        (4520, "A.8"),
+        (4606, "1"), // uses both Appendix and Annex
+        (4949, "7"),
+        (5570, "2.4.2"),
+        (5755, "10.2"),
+    ];
+
+    // _really_ messed up RFCs
+    let janky_sections = [
+        (3015, "A"),
+        (3113, "8"),
+        (3113, "9"),
+        (3122, "A"),
+        (3133, "1"),
+        (3134, "1"),
+        (3411, "A"),
+        (3525, "A.1"),
+        (3525, "I"),
+        (3730, "1"), // Appendices repeat section counters
+        (3730, "B"), // Appendices repeat section counters
+        (5038, "B"), // Appendices repeat B and C
+    ];
+
+    println!("rfc{rfc}");
+
+    // ignore any that we haven't snapshotted
+    if HIGHEST_KNOWN_ID < rfc {
+        return true;
+    }
+
+    let Ok(file) = duvet_core::vfs::read_string(etc.join(format!("rfc{rfc}.txt"))).await else {
+        println!("  NOT FOUND");
+        return false;
+    };
+
+    let tokens = tokens(&file).collect::<Vec<_>>();
+
+    insta::assert_debug_snapshot!(format!("rfc{rfc}_tokens"), tokens);
+
+    // don't do any checks right now
+    if ERRORS.iter().any(|e| e.contains(&rfc)) {
+        return true;
+    }
+
+    let mut sections = vec![];
+
+    let mut prev_section = None;
+
+    let mut check_section = |id: &str, title: &str, is_section: bool| {
+        assert!(!id.is_empty());
+
+        let prev = core::mem::replace(&mut prev_section, Some(id.to_string()));
+
+        if janky_sections.contains(&(rfc, id)) {
+            return;
+        }
+
+        assert_eq!(empty_titles.contains(&(rfc, id)), title.is_empty());
+
+        let Some(prev) = prev else {
+            if is_section {
+                assert!(["1", "1.0"].contains(&id));
+            }
+            return;
+        };
+
+        if *prev == *id {
+            assert!(duplicate.contains(&(rfc, id)), "duplicate section: {id:?}");
+            return;
+        }
+
+        let is_ok = tokenizer::section_id_monotonic(&prev, id);
+
+        let key = &(rfc, id);
+        let expected = !(skips.contains(key)
+            || indented_sections.contains(key)
+            || number_appendix_ids.contains(key)
+            || roman_appendix_ids.contains(key));
+
+        assert_eq!(
+            is_ok, expected,
+            "unexpected section number: prev={prev:?} current={id:?}"
+        );
+    };
+
+    let mut line = 1;
+    for token in tokens {
+        // make sure we don't drop any lines
+        assert_eq!(line, token.line());
+        line = token.line() + 1;
+
+        match &token {
+            Token::Section { id, title, .. } => {
+                println!("  SECTION(id={id:?} title={title:?})");
+
+                check_section(id, title, true);
+
+                sections.push(token);
+            }
+            Token::Appendix { id, title, .. } => {
+                println!(" APPENDIX(id={id:?} title={title:?})");
+
+                check_section(id, title, false);
+
+                sections.push(token);
+            }
+            Token::NamedSection { title, .. } => {
+                println!("  SECTION(title={title:?})");
+                // TODO
+            }
+            Token::Break { .. } => {
+                // TODO
+            }
+            Token::Content { .. } => {
+                // TODO
+            }
+            Token::Header { .. } => {
+                // TODO
+            }
+        }
+    }
+
+    assert_eq!(
+        sections.is_empty(),
+        empty.contains(&rfc),
+        "RFC sections is empty"
+    );
+
+    true
+}
+
+// these currently have parsing errors
+static ERRORS: &[&[usize]] = &[
+    &[
+        19, 70, 77, 98, 107, 155, 172, 194, 199, 230, 240, 254, 271, 293, 329, 330, 331, 332, 333,
+        354,
+        // TODO gap
+    ],
+    &[
+        768, 778, 782, 783, 787, 789, 799, 800, 802, 803, 810, 869, 876, 887, 891, 892, 896, 899,
+        904, 911, 914, 994, 995, 999, 1001, 1002, 1005, 1014, 1035, 1038, 1045, 1076, 1099, 1123,
+        1138, 1142, 1148, 1163, 1180, 1190, 1195, 1199, 1244, 1245, 1246,
+        // TODO gap
+    ],
+    &[
+        3064, // The first section is `1.0.Introduction`
+        3502, // This starts on 6.3.11
+        3877, // The sections embed sequence diagrams
+    ],
+    &[
+        5054, // this has a section with a title with lots of spaces
+        5165, // this RFC has poorly formatted sections
+    ],
+    &[
+        6503, // this embeds messages into the section
+        6504, // this embeds messages into the section
+        6917, // this embeds messages into the section
+    ],
+    &[
+        7058, // This RFC embeds sequence diagrams in the sections
+    ],
+    &[
+        9592, // This RFC embeds another RFC in the appendix, which fails the monotonic check
+    ],
+];
diff --git a/duvet/src/specification/ietf/tokenizer.rs b/duvet/src/specification/ietf/tokenizer.rs
new file mode 100644
index 00000000..d3e308ca
--- /dev/null
+++ b/duvet/src/specification/ietf/tokenizer.rs
@@ -0,0 +1,705 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use core::fmt;
+use duvet_core::{
+    ensure,
+    file::{Slice, SourceFile},
+};
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+macro_rules! regex {
+    ($str:literal) => {{
+        static R: Lazy<Regex> = Lazy::new(|| Regex::new($str).unwrap());
+        &*R // one `static` per expansion; the pattern is compiled on first use
+    }};
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum Break {
+    Line, // an empty or whitespace-only line
+    Page, // a line holding only a form feed (`\u{C}`)
+}
+
+#[derive(Clone)]
+pub enum Token {
+    Section {
+        id: Slice<SourceFile>, // starts with a numeric character when built by `Token::section`
+        title: Slice<SourceFile>,
+        line: usize, // 1-based line number, as in every variant
+    },
+    Appendix {
+        id: Slice<SourceFile>,
+        title: Slice<SourceFile>,
+        line: usize,
+    },
+    NamedSection {
+        title: Slice<SourceFile>, // one of the fixed titles matched in `named_sections`
+        line: usize,
+    },
+    Break {
+        value: Slice<SourceFile>, // the blank or form feed line itself
+        ty: Break,
+        line: usize,
+    },
+    Content {
+        value: Slice<SourceFile>,
+        line: usize,
+    },
+    Header {
+        value: Slice<SourceFile>, // a line matched by one of the patterns in `headers`
+        line: usize,
+    },
+}
+
+impl Token {
+    #[allow(dead_code)]
+    pub fn line(&self) -> usize {
+        match self {
+            Token::Section { line, .. }
+            | Token::Appendix { line, .. }
+            | Token::NamedSection { line, .. }
+            | Token::Break { line, .. }
+            | Token::Content { line, .. }
+            | Token::Header { line, .. } => *line, // every variant carries its line number
+        }
+    }
+
+    fn section(
+        id: Slice<SourceFile>,
+        title: Slice<SourceFile>,
+        line: usize,
+        force_appendix: bool,
+    ) -> Self {
+        if force_appendix || !id.starts_with(char::is_numeric) {
+            Token::Appendix { id, title, line } // forced, or the id has no numeric first char
+        } else {
+            Token::Section { id, title, line }
+        }
+    }
+}
+
+impl fmt::Debug for Token {
+    /// Writes `KIND#line(fields)`.
+    ///
+    /// The kind tags are padded with leading spaces so the `#` lands in the same column
+    /// for every token.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::Section { id, title, line } => {
+                write!(f, " SECTION#{}(id={}, title={})", line, id, title)
+            }
+            Self::Appendix { id, title, line } => {
+                write!(f, "APPENDIX#{}(id={}, title={})", line, id, title)
+            }
+            // named sections reuse the ` SECTION` tag but print no id
+            Self::NamedSection { title, line } => {
+                write!(f, " SECTION#{}(title={})", line, title)
+            }
+            // breaks print only their kind and line number, never their text
+            Self::Break { line, ty, .. } => match ty {
+                Break::Page => write!(f, "   BREAK#{}", line),
+                Break::Line => write!(f, " NEWLINE#{}", line),
+            },
+            Self::Content { value, line } => write!(f, " CONTENT#{}({})", line, value),
+            Self::Header { value, line } => write!(f, "  HEADER#{}({})", line, value),
+        }
+    }
+}
+
+pub fn tokens(contents: &SourceFile) -> impl Iterator<Item = Token> + '_ {
+    // every line starts out as `Content`; the stages run in this order, and a line
+    // reclassified by an earlier stage is no longer `Content` for the later ones
+    let with_breaks = line_breaks(page_breaks(lines(contents)));
+    let with_headings = headers(sections(with_breaks));
+    // named sections are recognized last
+    named_sections(with_headings)
+}
+
+macro_rules! expect_contents {
+    ($token:expr) => {
+        match $token {
+            Token::Content { value, line } => (value, line),
+            token => return token, // `return`s any other token from the enclosing fn/closure
+        }
+    };
+}
+
+/// Wraps every line of the file in a `Content` token tagged with its line number
+fn lines(contents: &SourceFile) -> impl Iterator<Item = Token> + '_ {
+    // line numbers start at 1
+    contents.lines().zip(1..).map(move |(text, line)| {
+        // presumably `substr` maps the borrowed line back to a slice of `contents`
+        let value = contents.substr(text).unwrap();
+        Token::Content { value, line }
+    })
+}
+
+/// Reclassifies `Content` lines consisting solely of a form feed as page breaks
+fn page_breaks<I: Iterator<Item = Token>>(i: I) -> impl Iterator<Item = Token> {
+    i.map(|token| {
+        // non-`Content` tokens pass through untouched
+        let (value, line) = expect_contents!(token);
+
+        // the line has to be exactly the form feed character, nothing else
+        let is_form_feed = &*value == "\u{C}";
+        if !is_form_feed {
+            return Token::Content { value, line };
+        }
+
+        let ty = Break::Page;
+        Token::Break { value, line, ty }
+    })
+}
+
+/// Reclassifies empty and whitespace-only `Content` lines as line breaks
+fn line_breaks<I: Iterator<Item = Token>>(i: I) -> impl Iterator<Item = Token> {
+    i.map(|token| {
+        // non-`Content` tokens pass through untouched
+        let (value, line) = expect_contents!(token);
+
+        // trimming an empty line yields an empty string, so one check covers both cases
+        let is_blank = value.trim().is_empty();
+        if !is_blank {
+            return Token::Content { value, line };
+        }
+
+        let ty = Break::Line;
+        Token::Break { value, line, ty }
+    })
+}
+
+/// Looks for headers/footers
+///
+/// A `Content` line becomes a `Header` when it starts flush left with `RFC <n>  ` or
+/// `[Page <n>]`, or when it ends, without trailing whitespace, in ` [Page <n>]`.
+fn headers<I: Iterator<Item = Token>>(i: I) -> impl Iterator<Item = Token> {
+    i.map(|token| {
+        // non-`Content` tokens pass through untouched
+        let (value, line) = expect_contents!(token);
+
+        // only tried on lines without leading whitespace
+        let beginning_patterns = [
+            regex!(r"^RFC [1-9][0-9]*  "),
+            regex!(r"^\[Page [1-9][0-9]*\]"),
+        ];
+
+        // only tried on lines without trailing whitespace
+        let ending_patterns = [regex!(r" \[Page [1-9][0-9]*\]$")];
+
+        // trimming keeps the length only when there was nothing to trim
+        let starts_flush = value.trim_start().len() == value.len();
+        let ends_flush = value.trim_end().len() == value.len();
+
+        // the start patterns are tried first; a match of either kind marks a header
+        let is_header = (starts_flush && beginning_patterns.iter().any(|p| p.is_match(&value)))
+            || (ends_flush && ending_patterns.iter().any(|p| p.is_match(&value)));
+
+        if is_header {
+            Token::Header { value, line }
+        } else {
+            Token::Content { value, line }
+        }
+    })
+}
+
+fn named_sections<I: Iterator<Item = Token>>(i: I) -> impl Iterator<Item = Token> {
+    struct NamedSections<I: Iterator<Item = Token>> {
+        state: State,
+        queue: Queue,
+        iter: I,
+    }
+
+    impl<I: Iterator<Item = Token>> Iterator for NamedSections<I> {
+        type Item = Token;
+
+        fn next(&mut self) -> Option<Token> {
+            loop {
+                if let Some(token) = self.queue.next() { // hand out queued tokens first
+                    return Some(token);
+                }
+
+                if let Some(token) = self.iter.next() {
+                    self.state.on_token(token, &mut self.queue);
+                } else {
+                    self.state.flush(&mut self.queue); // end of input: release held tokens
+                    return self.queue.next();
+                }
+            }
+        }
+    }
+
+    enum Queue {
+        Zero,
+        One(Token),
+        Two(Token, Token),
+        Three(Token, Token, Token), // the most a single `on_token` call pushes
+    }
+
+    impl Queue {
+        fn push(&mut self, token: Token) {
+            match core::mem::replace(self, Self::Zero) {
+                Self::Three(_, _, _) => {
+                    panic!("at capacity");
+                }
+                Self::Two(a, b) => {
+                    *self = Self::Three(a, b, token);
+                }
+                Self::One(a) => {
+                    *self = Self::Two(a, token);
+                }
+                Self::Zero => {
+                    *self = Self::One(token);
+                }
+            }
+        }
+
+        fn next(&mut self) -> Option<Token> {
+            match core::mem::replace(self, Self::Zero) {
+                Self::Three(a, b, c) => {
+                    *self = Self::Two(b, c);
+                    Some(a)
+                }
+                Self::Two(a, b) => {
+                    *self = Self::One(b);
+                    Some(a)
+                }
+                Self::One(a) => Some(a),
+                Self::Zero => None,
+            }
+        }
+    }
+
+    enum State {
+        Init,
+        // we have a line break, which is held back until the next token is seen
+        First {
+            break_token: Token,
+        },
+        // we have a line break and a named section - just waiting on another line break
+        Second {
+            break_token: Token,
+            title: Slice,
+            line: usize,
+        },
+    }
+
+    impl State {
+        fn on_token(&mut self, token: Token, queue: &mut Queue) {
+            debug_assert!(matches!(queue, Queue::Zero)); // the caller drains the queue first
+
+            match (core::mem::replace(self, Self::Init), token) {
+                (
+                    Self::Init,
+                    token @ Token::Break {
+                        ty: Break::Line, ..
+                    },
+                ) => {
+                    *self = Self::First { break_token: token };
+                }
+                (Self::Init, token) => {
+                    queue.push(token);
+                }
+                (Self::First { break_token }, Token::Content { value, line }) => {
+                    let patterns = [
+                        "Acknowledgments",
+                        "Acknowledgement",
+                        "Acknowledgements",
+                        "Index",
+                        "Author's Address",
+                        "Authors' Addresses",
+                        "Normative References",
+                        "Informative References",
+                        "References",
+                        "REFERENCES",
+                        "AUTHORS' ADDRESSES",
+                        "Full Copyright Statement",
+                        "Security Considerations",
+                        "Intellectual Property",
+                        "Intellectual Property Statement",
+                        "Working Group Information",
+                        "Contributors",
+                        "Editors' Addresses",
+                        "IANA Considerations",
+                        "Abstract",
+                        "Status of this Memo",
+                        "Status of This Memo",
+                        "Copyright Notice",
+                        "Table of Contents",
+                        "Appendix",
+                    ];
+
+                    if patterns.contains(&&*value) { // the whole line must match exactly
+                        *self = Self::Second {
+                            break_token,
+                            title: value,
+                            line,
+                        };
+                    } else {
+                        queue.push(break_token);
+                        queue.push(Token::Content { value, line });
+                    }
+                }
+                (
+                    Self::First { break_token },
+                    token @ Token::Break {
+                        ty: Break::Line, ..
+                    },
+                ) => {
+                    queue.push(break_token);
+                    *self = Self::First { break_token: token };
+                }
+                (Self::First { break_token }, token) => {
+                    queue.push(break_token);
+                    queue.push(token);
+                }
+                (
+                    Self::Second {
+                        break_token,
+                        title,
+                        line,
+                    },
+                    token @ Token::Break {
+                        ty: Break::Line, ..
+                    },
+                ) => {
+                    let title = Token::NamedSection { title, line };
+                    queue.push(break_token);
+                    queue.push(title);
+                    queue.push(token);
+                }
+                (
+                    Self::Second {
+                        break_token,
+                        title,
+                        line,
+                    },
+                    token,
+                ) => {
+                    let title = Token::Content { value: title, line }; // not a heading after all
+                    queue.push(break_token);
+                    queue.push(title);
+                    queue.push(token);
+                }
+            }
+        }
+
+        fn flush(&mut self, queue: &mut Queue) {
+            match core::mem::replace(self, Self::Init) {
+                Self::Init => {}
+                Self::First { break_token } => {
+                    queue.push(break_token);
+                }
+                Self::Second {
+                    break_token,
+                    title,
+                    line,
+                } => {
+                    queue.push(break_token);
+                    queue.push(Token::Content { value: title, line });
+                }
+            }
+        }
+    }
+
+    NamedSections {
+        state: State::Init,
+        queue: Queue::Zero,
+        iter: i,
+    }
+}
+
+fn sections<I: Iterator<Item = Token>>(i: I) -> impl Iterator<Item = Token> {
+    Sections::new(i) // the pass keeps its state (previous id, last-was-break) in `Sections`
+}
+
+#[derive(Debug)]
+struct Sections<T: Iterator<Item = Token>> {
+    tokens: T,
+    was_break: bool, // whether the token most recently returned by `on_token` was a `Break`
+    prev_section: Option<Slice<SourceFile>>, // id of the last `Section`/`Appendix` token seen
+}
+
+impl<T: Iterator<Item = Token>> Sections<T> {
+    pub fn new(tokens: T) -> Self {
+        Self {
+            tokens,
+            was_break: false, // nothing has been read yet
+            prev_section: None, // no section or appendix seen yet
+        }
+    }
+
+    /// Classifies `token`, then updates the tracked state: whether it was a break and,
+    /// for sections and appendices, its id.
+    fn on_token(&mut self, token: Token) -> Token {
+        let classified = self.on_token_impl(token);
+
+        // refreshed on every token, so it only ever describes the latest one
+        self.was_break = matches!(classified, Token::Break { .. });
+
+        // numbered sections and appendices share a single "previous id" slot
+        if let Token::Section { id, .. } | Token::Appendix { id, .. } = &classified {
+            self.prev_section = Some(id.clone());
+        }
+
+        classified
+    }
+
+    fn on_token_impl(&mut self, token: Token) -> Token {
+        let (value, line) = expect_contents!(token);
+
+        let mut force_appendix = false;
+        let mut section_candidate = &*value;
+
+        for prefix in ["Appendix ", "APPENDIX ", "Annex "] {
+            if let Some(value) = value.strip_prefix(prefix) {
+                section_candidate = value;
+                force_appendix = true;
+                break;
+            }
+        }
+
+        if force_appendix {
+            let candidates = [
+                regex!(r"^([A-Z])$"),
+                regex!(r"^([A-Z])\.$"),
+                regex!(r"^([A-Z])\.\s+(.*)"),
+                regex!(r"^([A-Z]):\s+(.*)"),
+                regex!(r"^([A-Z]) :\s+(.*)"),
+                regex!(r"^([A-Z]) -\s+(.*)"),
+                regex!(r"^([A-Z]) --\s+(.*)"),
+                regex!(r"^([A-Z])\s+(.*)"),
+            ];
+
+            for candidate in candidates {
+                if let Some(section) = candidate.captures(section_candidate) {
+                    let id = section.get(1).unwrap();
+                    let id = &section_candidate[id.range()];
+
+                    let title = if let Some(title) = section.get(2) {
+                        section_candidate[title.range()].trim()
+                    } else {
+                        &id[id.len()..]
+                    };
+
+                    if !self.section_check_candidate(id, title) {
+                        continue;
+                    }
+
+                    let id = value.file().substr(id).unwrap();
+                    let title = value.file().substr(title).unwrap();
+
+                    return Token::section(id, title, line, true);
+                }
+            }
+        }
+
+        let candidates = [regex!(r"^(([A-Z]\.?)?[0-9\.]+):?\s+(.*)")];
+
+        for candidate in candidates {
+            if let Some(section) = candidate.captures(section_candidate) {
+                let id = section.get(1).unwrap();
+                let id = &section_candidate[id.range()].trim_end_matches('.');
+
+                let title = section.get(3).unwrap();
+                let title = &section_candidate[title.range()].trim();
+
+                if self.section_check_candidate(id, title) {
+                    let id = value.file().substr(id).unwrap();
+                    let title = value.file().substr(title).unwrap();
+
+                    return Token::section(id, title, line, force_appendix);
+                }
+            }
+        }
+
+        if regex!(r"^(([A-Z]\.)?[0-9\.]+)$").is_match(section_candidate) {
+            let id = section_candidate.trim_end_matches('.');
+
+            if self.section_check_candidate(id, "") {
+                let id = value.file().substr(id).unwrap();
+
+                let title = value.file().substr(&id[id.len()..]).unwrap();
+
+                return Token::section(id, title, line, force_appendix);
+            }
+        }
+
+        Token::Content { value, line }
+    }
+
+    fn section_check_candidate(&self, id: &str, title: &str) -> bool {
+        ensure!(Self::section_check_toc(title), false);
+        ensure!(Self::section_check_weird_title(title), false);
+
+        // if we have a possibly weird title, then use a monotonicity check
+        let check_monotonic = !Self::section_check_possible_weird_title(title);
+
+        self.section_check_id(id, check_monotonic)
+    }
+
+    fn section_check_id(&self, id: &str, check_monotonic: bool) -> bool {
+        for res in parse_id(id) {
+            ensure!(res.is_ok(), false);
+        }
+
+        // if we previously had a break then it's likely a valid section
+        if self.was_break && !check_monotonic {
+            return true;
+        }
+
+        let Some(prev) = self.prev_section.as_ref() else {
+            // if we don't have a section then make sure the first one is `1`
+            return ["1", "1.0"].contains(&id);
+        };
+
+        section_id_monotonic(prev, id)
+    }
+
+    fn section_check_toc(title: &str) -> bool {
+        // try to detect if this is a Table of Contents entry - they usually have period
+        // separators
+        ensure!(!title.contains("....."), false);
+        ensure!(!title.contains(". . ."), false);
+        ensure!(!title.contains(" . . "), false);
+
+        true
+    }
+
+    fn section_check_weird_title(title: &str) -> bool {
+        // try to filter out weird titles
+        ensure!(!title.starts_with(';'), false);
+        ensure!(!title.ends_with(['{', '[', '(', ';']), false);
+
+        // check if the title contains too much spacing
+        ensure!(!title.trim().contains("     "), false);
+
+        true
+    }
+
+    fn section_check_possible_weird_title(title: &str) -> bool {
+        // try to filter out weird titles
+        ensure!(!title.trim_end_matches('|').contains("|"), false);
+
+        true
+    }
+}
+
+pub(super) fn section_id_monotonic(prev: &str, current: &str) -> bool {
+    ensure!(prev != current, false);
+
+    let prev_parts = parse_id(prev);
+    let current_parts = parse_id(current);
+
+    for (idx, (prev_part, current_part)) in prev_parts.zip(current_parts).enumerate() {
+        let Some(prev_part): Option<Part> = prev_part.ok() else {
+            return false;
+        };
+        let Some(current_part): Option<Part> = current_part.ok() else {
+            return false;
+        };
+
+        // only the first part is allowed to be a letter
+        if idx > 0 {
+            ensure!(matches!(current_part, Part::Num(_)), false);
+        }
+
+        // no need to keep comparing the parts
+        if prev_part.is_next(&current_part) {
+            break;
+        }
+
+        // the current part can't be less than the previous
+        ensure!(prev_part == current_part, false);
+    }
+
+    true
+}
+
+impl<T: Iterator<Item = Token>> Iterator for Sections<T> {
+    type Item = Token;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let token = self.tokens.next()?;
+        let token = self.on_token(token);
+        Some(token)
+    }
+}
+
+fn parse_id(id: &str) -> impl Iterator<Item = Result<Part, ()>> + '_ {
+    let mut digit_offset = 0;
+
+    for (idx, c) in id.char_indices() {
+        if c.is_ascii_digit() {
+            digit_offset = idx;
+            break;
+        }
+    }
+
+    let (prefix, digits) = id.split_at(digit_offset);
+
+    let prefix = if prefix.is_empty() {
+        None
+    } else {
+        Some(prefix.trim_end_matches('.').parse())
+    };
+
+    prefix
+        .into_iter()
+        .chain(digits.split('.').map(|v| v.parse()))
+        .enumerate()
+        .map(|(idx, part)| {
+            let part = part?;
+
+            if idx == 0 {
+                if let Part::Num(num) = part {
+                    ensure!(num > 0, Err(()));
+                }
+            }
+
+            // only the first part is allowed to be a letter
+            if idx > 0 {
+                ensure!(matches!(part, Part::Num(_)), Err(()));
+            }
+
+            Ok(part)
+        })
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+enum Part {
+    Num(u8),
+    Appendix(char),
+}
+
+impl Part {
+    fn is_next(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Part::Num(a), Part::Num(b)) => (*a as usize) + 1 == *b as usize,
+            (Part::Num(_), Part::Appendix(a)) => *a == 'A',
+            (Part::Appendix(_), Part::Num(_)) => false,
+            (Part::Appendix(a), Part::Appendix(b)) => (*a as u32 + 1) == *b as u32,
+        }
+    }
+}
+
+impl core::str::FromStr for Part {
+    type Err = ();
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        if let Ok(v) = s.parse() {
+            // RFCs don't exceed this value
+            ensure!(v <= 199, Err(()));
+
+            return Ok(Self::Num(v));
+        }
+
+        ensure!(s.len() == 1, Err(()));
+
+        let c = s.chars().next().unwrap();
+        ensure!(c.is_ascii_uppercase(), Err(()));
+
+        Ok(Self::Appendix(c))
+    }
+}
diff --git a/duvet/src/specification/mod.rs b/duvet/src/specification/mod.rs
index 8fcb75e2..bfbc98c5 100644
--- a/duvet/src/specification/mod.rs
+++ b/duvet/src/specification/mod.rs
@@ -9,6 +9,7 @@ use core::{
     ops::{Deref, Range},
     str::FromStr,
 };
+use duvet_core::file::SourceFile;
 use std::collections::HashMap;
 
 pub mod ietf;
@@ -89,7 +90,7 @@ impl fmt::Display for Format {
 }
 
 impl Format {
-    pub fn parse(self, contents: &str) -> Result<Specification, Error> {
+    pub fn parse(self, contents: &SourceFile) -> Result<Specification, Error> {
         let spec = match self {
             Self::Auto => {
                 // Markdown MAY start with a header (#),
@@ -149,15 +150,6 @@ pub enum Line<'a> {
     Break,
 }
 
-impl Line<'_> {
-    pub fn is_empty(&self) -> bool {
-        match self {
-            Self::Str(s) => s.is_empty(),
-            Self::Break => true,
-        }
-    }
-}
-
 impl<'a> From<Str<'a>> for Line<'a> {
     fn from(s: Str<'a>) -> Self {
         Self::Str(s)
diff --git a/integration/snapshots/h3.snap b/integration/snapshots/h3.snap
index 8cfce600..14e54c02 100644
--- a/integration/snapshots/h3.snap
+++ b/integration/snapshots/h3.snap
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6b01b4e38a484f631375a9a0b419aaff5a65d4648c3ea0b2580ad748f218acc0
-size 598588
+oid sha256:a46aa96fc1de7b5505eb9f1717aad530481974dc01200de7be97cda6d26cd40b
+size 672704
diff --git a/integration/snapshots/s2n-quic.snap b/integration/snapshots/s2n-quic.snap
index 470c27d6..5f179b90 100644
--- a/integration/snapshots/s2n-quic.snap
+++ b/integration/snapshots/s2n-quic.snap
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7096ec9263bc04bc1505495357d6204c96d10d3785763489b9f3fa2cdbe3d4d9
-size 5584556
+oid sha256:8b0850fdbedf4f5d6008ea784bb5e92eb8500fc772121c22af1794cdd8e24b97
+size 6255510
diff --git a/integration/snapshots/s2n-tls.snap b/integration/snapshots/s2n-tls.snap
index e0ad840f..c3fae6f1 100644
--- a/integration/snapshots/s2n-tls.snap
+++ b/integration/snapshots/s2n-tls.snap
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7039f0469e2fcd1ee6ff3ac0c71a6722b8456b1d7d405270ce554ac71f4f1d88
-size 2890616
+oid sha256:6fb408b428c3968f5b89ea910b64e2333f3b0d77e0248ffe0cfd1ce04fd4a758
+size 3217571
diff --git a/xtask/src/tests.rs b/xtask/src/tests.rs
index 41789190..bba81445 100644
--- a/xtask/src/tests.rs
+++ b/xtask/src/tests.rs
@@ -31,7 +31,14 @@ impl Tests {
 
         let default_tests = self.default_tests.is_enabled(true);
 
+        self.download_rfcs(sh)?;
+
         if self.unit.is_enabled(default_tests) {
+            if !sh.path_exists("duvet/src/specification/ietf/snapshots") {
+                let _dir = sh.push_dir("duvet/src/specification/ietf");
+                cmd!(sh, "tar -xf snapshots.tar.gz").run()?;
+            }
+
             cmd!(sh, "cargo test").run()?;
         }
 
@@ -42,6 +49,37 @@ impl Tests {
         Ok(())
     }
 
+    fn download_rfcs(&self, sh: &Shell) -> Result {
+        let url = "https://www.rfc-editor.org/rfc/tar/RFC-all.tar.gz";
+
+        let dir = "target/www.rfc-editor.org";
+        sh.create_dir(dir)?;
+        let _dir = sh.push_dir(dir);
+
+        let tar_gz = Path::new("RFC-all.tar.gz");
+        if !sh.path_exists(tar_gz) {
+            eprintln!("downloading {url}");
+            cmd!(sh, "curl --fail --output {tar_gz} {url}").run()?;
+            cmd!(sh, "tar -xf {tar_gz}").run()?;
+        }
+
+        for file in sh.read_dir(".")? {
+            if file.ends_with(tar_gz) {
+                continue;
+            }
+
+            if let Some(ext) = file.extension().and_then(|v| v.to_str()) {
+                if ext != "txt" {
+                    sh.remove_path(file)?;
+                }
+            } else {
+                sh.remove_path(file)?;
+            }
+        }
+
+        Ok(())
+    }
+
     fn integration(&self, sh: &Shell, bin: &Path) -> Result {
         let tests = sh.read_dir("integration")?;