From fc4e628ae7d9443e474ec0add1691b665c84a569 Mon Sep 17 00:00:00 2001 From: Cameron Bytheway Date: Thu, 31 Oct 2024 11:04:25 -0600 Subject: [PATCH] refactor(duvet-core): various fixes needed for integration into main crate (#129) * refactor: improve IETF parser * refactor(duvet-core): various fixes needed for integration into main crate --- .gitattributes | 1 + .gitignore | 1 + Cargo.toml | 5 - duvet-core/Cargo.toml | 2 + duvet-core/src/diagnostic.rs | 29 +- duvet-core/src/env.rs | 32 + duvet-core/src/file.rs | 22 +- duvet-core/src/glob.rs | 47 +- duvet-core/src/hash.rs | 3 +- duvet-core/src/http.rs | 25 +- duvet-core/src/lib.rs | 15 +- duvet-core/src/path.rs | 70 +- duvet-core/src/query.rs | 10 +- duvet/Cargo.toml | 6 +- duvet/src/extract.rs | 4 +- duvet/src/extract/tests.rs | 4 +- duvet/src/lib.rs | 6 + duvet/src/main.rs | 20 +- duvet/src/object.rs | 126 ---- duvet/src/report/html.rs | 5 +- duvet/src/report/mod.rs | 5 +- duvet/src/specification/ietf.rs | 243 ++---- duvet/src/specification/ietf/break_filter.rs | 39 + duvet/src/specification/ietf/parser.rs | 151 ++++ duvet/src/specification/ietf/snapshots.tar.gz | 3 + duvet/src/specification/ietf/tests.rs | 465 ++++++++++++ duvet/src/specification/ietf/tokenizer.rs | 705 ++++++++++++++++++ duvet/src/specification/mod.rs | 12 +- integration/snapshots/h3.snap | 4 +- integration/snapshots/s2n-quic.snap | 4 +- integration/snapshots/s2n-tls.snap | 4 +- xtask/src/tests.rs | 38 + 32 files changed, 1725 insertions(+), 381 deletions(-) create mode 100644 duvet-core/src/env.rs delete mode 100644 duvet/src/object.rs create mode 100644 duvet/src/specification/ietf/break_filter.rs create mode 100644 duvet/src/specification/ietf/parser.rs create mode 100644 duvet/src/specification/ietf/snapshots.tar.gz create mode 100644 duvet/src/specification/ietf/tests.rs create mode 100644 duvet/src/specification/ietf/tokenizer.rs diff --git a/.gitattributes b/.gitattributes index f3c8d172..35175715 100644 --- a/.gitattributes +++ 
b/.gitattributes @@ -1 +1,2 @@ integration/snapshots/*.snap filter=lfs diff=lfs merge=lfs -text +**/snapshots.tar.gz filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 253578a9..c74ae1bd 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ Cargo.lock target .history *.snap.new +/duvet/src/specification/ietf/snapshots diff --git a/Cargo.toml b/Cargo.toml index a299a7e8..6330da62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,11 +7,6 @@ members = [ ] resolver = "2" -[profile.release] -lto = true -codegen-units = 1 -incremental = false - [profile.bench] lto = true codegen-units = 1 diff --git a/duvet-core/Cargo.toml b/duvet-core/Cargo.toml index ff2957ba..92d62f7b 100644 --- a/duvet-core/Cargo.toml +++ b/duvet-core/Cargo.toml @@ -13,6 +13,7 @@ http = ["dep:http", "reqwest"] testing = ["tracing-subscriber"] [dependencies] +anyhow = "1" blake3 = "1" bytes = "1" duvet-macros = { version = "0.1", path = "../duvet-macros" } @@ -24,6 +25,7 @@ miette = { version = "7", features = ["fancy"] } once_cell = "1" reqwest = { version = "0.12", optional = true } serde = { version = "1", features = ["derive", "rc"] } +serde_json = "1" tokio = { version = "1", features = ["fs", "sync"] } tokio-util = "0.7" toml_edit = { version = "0.22", features = ["parse", "serde"] } diff --git a/duvet-core/src/diagnostic.rs b/duvet-core/src/diagnostic.rs index e1802021..770fcdd0 100644 --- a/duvet-core/src/diagnostic.rs +++ b/duvet-core/src/diagnostic.rs @@ -96,6 +96,24 @@ impl Diagnostic for Error { } } +impl From for Error { + fn from(value: anyhow::Error) -> Self { + Report::msg(value).into() + } +} + +impl From for Error { + fn from(value: std::io::Error) -> Self { + Report::msg(value).into() + } +} + +impl From for Error { + fn from(value: serde_json::Error) -> Self { + Report::msg(value).into() + } +} + impl From for Error { fn from(err: Report) -> Self { Self(Arc::new(err)) @@ -210,7 +228,7 @@ impl From> for Set { impl fmt::Display for Set { fn fmt(&self, f: 
&mut fmt::Formatter) -> fmt::Result { for error in self.errors.iter() { - writeln!(f, "{}", error)?; + writeln!(f, "{:?}", error)?; } Ok(()) } @@ -227,12 +245,3 @@ impl StdError for Set { Some(&self.main) } } - -/* -impl Diagnostic for Set { - fn related<'a>(&'a self) -> Option + 'a>> { - let iter = self.errors.iter().map(|e| e as &dyn Diagnostic); - Some(Box::new(iter)) - } -} -*/ diff --git a/duvet-core/src/env.rs b/duvet-core/src/env.rs new file mode 100644 index 00000000..403fc74e --- /dev/null +++ b/duvet-core/src/env.rs @@ -0,0 +1,32 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use crate::{diagnostic::IntoDiagnostic, path::Path, Result}; +use core::cell::RefCell; +use once_cell::sync::Lazy; +use std::sync::Arc; + +static GLOBAL_ARGS: Lazy> = Lazy::new(|| std::env::args().collect()); +static GLOBAL_DIR: Lazy> = + Lazy::new(|| std::env::current_dir().map(|v| v.into()).into_diagnostic()); + +thread_local! { + static ARGS: RefCell> = RefCell::new(GLOBAL_ARGS.clone()); + static DIR: RefCell> = RefCell::new(GLOBAL_DIR.clone()); +} + +pub fn args() -> Arc<[String]> { + ARGS.with(|current| current.borrow().clone()) +} + +pub fn set_args(args: Arc<[String]>) { + ARGS.with(|current| *current.borrow_mut() = args); +} + +pub fn current_dir() -> Result { + DIR.with(|current| current.borrow().clone()) +} + +pub fn set_current_dir(dir: Path) { + DIR.with(|current| *current.borrow_mut() = Ok(dir)); +} diff --git a/duvet-core/src/file.rs b/duvet-core/src/file.rs index 8f019a02..64675bf0 100644 --- a/duvet-core/src/file.rs +++ b/duvet-core/src/file.rs @@ -91,6 +91,26 @@ impl SourceFile { .await } + pub async fn as_json(&self) -> crate::Result> + where + T: 'static + Send + Sync + serde::de::DeserializeOwned, + { + let path = self.path.clone(); + let contents = self.contents.clone(); + // TODO can we get better errors by mapping string ranges? 
+ crate::Cache::current() + .get_or_init(*self.hash(), move || { + crate::Query::from( + serde_json::from_slice(contents.data()) + .map(Arc::new) + .into_diagnostic() + .wrap_err(path) + .map_err(|err| err.into()), + ) + }) + .await + } + pub fn substr(&self, v: &str) -> Option> { unsafe { let beginning = self.as_bytes().as_ptr(); @@ -164,7 +184,7 @@ impl SourceCode for SourceFile { } #[derive(Clone, PartialEq, PartialOrd, Hash, Eq, Ord)] -pub struct Slice { +pub struct Slice { file: File, start: usize, end: usize, diff --git a/duvet-core/src/glob.rs b/duvet-core/src/glob.rs index 0a86a426..d268e7ef 100644 --- a/duvet-core/src/glob.rs +++ b/duvet-core/src/glob.rs @@ -6,25 +6,39 @@ use globset as g; use serde::de; use std::{str::FromStr, sync::Arc}; -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct Glob { - set: Arc, + set: Arc<(g::GlobSet, Vec)>, +} + +impl fmt::Debug for Glob { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let list = &self.set.1; + if list.len() == 1 { + list[0].fmt(f) + } else { + list.fmt(f) + } + } } impl Glob { pub fn is_match>(&self, path: &P) -> bool { - self.set.is_match(path) + self.set.0.is_match(path) } pub fn try_from_iter, I: AsRef>( iter: T, ) -> Result { let mut builder = g::GlobSetBuilder::new(); + let mut display = vec![]; for item in iter { - builder.add(g::Glob::new(item.as_ref())?); + let value = format_value(item.as_ref()); + builder.add(g::Glob::new(&value)?); + display.push(value); } let set = builder.build()?; - let set = Arc::new(set); + let set = Arc::new((set, display)); Ok(Self { set }) } } @@ -37,6 +51,14 @@ impl FromStr for Glob { } } +impl TryFrom<&str> for Glob { + type Error = g::Error; + + fn try_from(value: &str) -> Result { + value.parse() + } +} + impl<'de> de::Deserialize<'de> for Glob { fn deserialize(deserializer: D) -> Result where @@ -67,12 +89,23 @@ impl<'de> de::Visitor<'de> for StringOrList { S: de::SeqAccess<'de>, { let mut builder = g::GlobSetBuilder::new(); + let mut display = 
vec![]; while let Some(value) = seq.next_element()? { - let item = g::Glob::new(value).map_err(serde::de::Error::custom)?; + let value = format_value(value); + let item = g::Glob::new(&value).map_err(serde::de::Error::custom)?; builder.add(item); + display.push(value); } let set = builder.build().map_err(serde::de::Error::custom)?; - let set = Arc::new(set); + let set = Arc::new((set, display)); Ok(Glob { set }) } } + +fn format_value(v: &str) -> String { + if v.starts_with("**/") || v.starts_with('/') { + v.to_string() + } else { + format!("**/{v}") + } +} diff --git a/duvet-core/src/hash.rs b/duvet-core/src/hash.rs index 2e163597..5e95ccee 100644 --- a/duvet-core/src/hash.rs +++ b/duvet-core/src/hash.rs @@ -9,8 +9,9 @@ pub struct Hash([u8; HASH_LEN]); impl fmt::Debug for Hash { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "0x")?; for byte in &self.0 { - write!(f, "{byte:#02x}")?; + write!(f, "{byte:02x}")?; } Ok(()) } diff --git a/duvet-core/src/http.rs b/duvet-core/src/http.rs index 8e603c40..c697d807 100644 --- a/duvet-core/src/http.rs +++ b/duvet-core/src/http.rs @@ -13,14 +13,31 @@ use std::sync::Arc; pub use http::response::Parts; pub use reqwest::Client; +fn default_headers() -> reqwest::header::HeaderMap { + let mut map = reqwest::header::HeaderMap::new(); + + map.insert("accept", "text/plain".parse().unwrap()); + + map +} + pub fn client() -> Query { #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] struct Q; - // TODO configure the client more - // - User-Agent headers - // - Accept headers? 
- Cache::current().get_or_init(Q, || Query::from(Client::builder().build().unwrap())) + Cache::current().get_or_init(Q, || { + Query::from( + Client::builder() + .user_agent(concat!( + env!("CARGO_PKG_NAME"), + "/", + env!("CARGO_PKG_VERSION") + )) + .default_headers(default_headers()) + .build() + .unwrap(), + ) + }) } pub fn get_full(url: U) -> Query, Contents)>> diff --git a/duvet-core/src/lib.rs b/duvet-core/src/lib.rs index 613aa1b8..75ca3c05 100644 --- a/duvet-core/src/lib.rs +++ b/duvet-core/src/lib.rs @@ -1,6 +1,18 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +#[macro_export] +macro_rules! ensure { + ($cond:expr) => { + ensure!($cond, ()); + }; + ($cond:expr, $otherwise:expr) => { + if !($cond) { + return $otherwise; + } + }; +} + #[cfg(any(test, feature = "testing"))] pub mod testing; @@ -8,6 +20,7 @@ mod cache; pub mod contents; pub mod diagnostic; pub mod dir; +pub mod env; pub mod file; pub mod glob; pub mod hash; @@ -24,4 +37,4 @@ pub use cache::Cache; pub use duvet_macros::*; pub use query::Query; -pub type Result = core::result::Result; +pub type Result = core::result::Result; diff --git a/duvet-core/src/path.rs b/duvet-core/src/path.rs index 7ac810a0..1529a842 100644 --- a/duvet-core/src/path.rs +++ b/duvet-core/src/path.rs @@ -1,16 +1,61 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -use core::fmt; +use core::{cmp::Ordering, fmt}; use serde::Deserialize; use std::{ffi::OsStr, ops::Deref, path::PathBuf, sync::Arc}; -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Deserialize)] +#[derive(Clone, Deserialize)] #[serde(transparent)] pub struct Path { path: Arc, } +impl Path { + pub fn pop(&mut self) -> bool { + if let Some(parent) = self.parent() { + *self = parent.into(); + true + } else { + false + } + } + + pub fn push>(&mut self, component: V) { + *self = self.join(component); + } + + pub fn join>(&self, component: V) -> Self { + self.as_ref().join(component).into() + } +} + +impl PartialEq for Path { + fn eq(&self, other: &Self) -> bool { + self.as_ref().eq(other.as_ref()) + } +} + +impl Eq for Path {} + +impl PartialOrd for Path { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Path { + fn cmp(&self, other: &Self) -> Ordering { + self.as_ref().cmp(other.as_ref()) + } +} + +impl core::hash::Hash for Path { + fn hash(&self, state: &mut H) { + self.as_ref().hash(state) + } +} + impl fmt::Debug for Path { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.as_ref().fmt(f) @@ -19,7 +64,12 @@ impl fmt::Debug for Path { impl fmt::Display for Path { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.as_ref().display().fmt(f) + let path = self.as_ref(); + let path = crate::env::current_dir() + .ok() + .and_then(|dir| path.strip_prefix(dir).ok()) + .unwrap_or(path); + path.display().fmt(f) } } @@ -65,6 +115,20 @@ impl From for Path { } } +impl From<&std::path::Path> for Path { + fn from(path: &std::path::Path) -> Self { + Self { + path: path.as_os_str().into(), + } + } +} + +impl From for PathBuf { + fn from(value: Path) -> Self { + PathBuf::from(&value.path) + } +} + impl From<&Path> for Path { fn from(path: &Path) -> Self { Self { diff --git a/duvet-core/src/query.rs b/duvet-core/src/query.rs index 9dd1ecfe..2b5f8ee4 100644 --- 
a/duvet-core/src/query.rs +++ b/duvet-core/src/query.rs @@ -445,14 +445,16 @@ mod tests { let query = Query::new(async move { rx.await.unwrap() }); let a = query.clone(); - let a = tokio::spawn(async move { *a.get().await }); + let a = async move { *a.get().await }; let b = query; - let b = tokio::spawn(async move { *b.get().await }); + let b = async move { *b.get().await }; tx.send(123).unwrap(); - assert_eq!(a.await.unwrap(), 123); - assert_eq!(b.await.unwrap(), 123); + let (a, b) = tokio::join!(a, b); + + assert_eq!(a, 123); + assert_eq!(b, 123); } } diff --git a/duvet/Cargo.toml b/duvet/Cargo.toml index cdc2d766..b0400cfc 100644 --- a/duvet/Cargo.toml +++ b/duvet/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "duvet" version = "0.3.0" -description = "A code quality tool to help bound correctness." +description = "A requirements traceability tool" authors = ["Cameron Bytheway ", "Ryan Emery "] edition = "2021" license = "Apache-2.0" @@ -16,8 +16,10 @@ anyhow = "1" clap = { version = "4", features = ["derive"] } duvet-core = { version = "0.1", path = "../duvet-core" } fnv = { version = "1", default-features = false } +futures = { version = "0.3" } glob = "0.3" lazy_static = "1" +once_cell = "1" pathdiff = "0.2" pulldown-cmark = { version = "0.12", default-features = false } rayon = "1" @@ -26,6 +28,8 @@ reqwest = { version = "0.12", features = ["blocking", "native-tls"] } serde = { version = "1", features = ["derive"] } slug = { version = "0.1" } tokio = { version = "1", features = ["macros", "rt-multi-thread"] } +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } toml = "0.5" triple_accel = "0.4" url = "2" diff --git a/duvet/src/extract.rs b/duvet/src/extract.rs index 1e05760e..4530633e 100644 --- a/duvet/src/extract.rs +++ b/duvet/src/extract.rs @@ -69,9 +69,11 @@ pub struct Extract { impl Extract { pub async fn exec(&self) -> Result<(), Error> { let contents = self.target.load(self.spec_path.as_deref())?; + let local_path = 
self.target.local(self.spec_path.as_deref()); + let contents = duvet_core::file::SourceFile::new(&*local_path, contents).unwrap(); + let spec = self.format.parse(&contents)?; let sections = extract_sections(&spec); - let local_path = self.target.local(self.spec_path.as_deref()); if self.out.extension().is_some() { // assume a path with an extension is a single file diff --git a/duvet/src/extract/tests.rs b/duvet/src/extract/tests.rs index c0e510aa..5204a7bf 100644 --- a/duvet/src/extract/tests.rs +++ b/duvet/src/extract/tests.rs @@ -16,8 +16,10 @@ macro_rules! snapshot_test { stringify!($name), $ext, )); + let path = concat!(stringify!($name), $ext); + let contents = duvet_core::file::SourceFile::new(path, contents).unwrap(); - let spec = Format::Auto.parse(contents).unwrap(); + let spec = Format::Auto.parse(&contents).unwrap(); let sections = extract_sections(&spec); let results: Vec<_> = sections diff --git a/duvet/src/lib.rs b/duvet/src/lib.rs index 4c9ad2e5..caaeec02 100644 --- a/duvet/src/lib.rs +++ b/duvet/src/lib.rs @@ -20,6 +20,7 @@ mod text; mod tests; pub use anyhow::Error; +pub use duvet_core::Result; #[allow(clippy::large_enum_variant)] #[derive(Debug, Parser)] @@ -42,6 +43,11 @@ impl Arguments { } } +pub async fn run() -> Result { + arguments().await.exec().await?; + Ok(()) +} + pub(crate) fn fnv(value: &H) -> u64 { use core::hash::Hasher; let mut hasher = fnv::FnvHasher::default(); diff --git a/duvet/src/main.rs b/duvet/src/main.rs index 0bf00b50..a2fbce98 100644 --- a/duvet/src/main.rs +++ b/duvet/src/main.rs @@ -2,10 +2,24 @@ // SPDX-License-Identifier: Apache-2.0 fn main() { + let format = tracing_subscriber::fmt::format().compact(); // Use a less verbose output format. 
+ + let env_filter = tracing_subscriber::EnvFilter::builder() + .with_default_directive(tracing::Level::ERROR.into()) + .with_env_var("DUVET_LOG") + .from_env() + .unwrap(); + + tracing_subscriber::fmt() + .with_env_filter(env_filter) + .event_format(format) + .with_test_writer() + .init(); + let cache = duvet_core::Cache::default(); let fs = duvet_core::vfs::fs::Fs::default(); - let runtime = tokio::runtime::Builder::new_multi_thread() + let runtime = tokio::runtime::Builder::new_current_thread() .on_thread_start({ let cache = cache.clone(); let fs = fs.clone(); @@ -19,8 +33,8 @@ fn main() { .unwrap(); runtime.block_on(async { - if let Err(err) = duvet::arguments().await.exec().await { - eprintln!("{}", err); + if let Err(err) = duvet::run().await { + eprintln!("{err:?}"); std::process::exit(1); } }); diff --git a/duvet/src/object.rs b/duvet/src/object.rs deleted file mode 100644 index 2032a848..00000000 --- a/duvet/src/object.rs +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -use crate::{annotation::AnnotationSet, parser::Parser, Error}; -use anyhow::anyhow; -use goblin::{ - archive::Archive, - elf::Elf, - mach::{Mach, MachO, MultiArch}, - pe::PE, - Object, -}; - -pub fn extract(buffer: &[u8], annotations: &mut AnnotationSet) -> Result<(), Error> { - let object = Object::parse(buffer)?; - (&object, buffer).load(annotations)?; - - Ok(()) -} - -trait AnnoObject { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error>; -} - -impl<'a> AnnoObject for (&Object<'a>, &'a [u8]) { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> { - match &self.0 { - Object::Elf(obj) => (obj, self.1).load(annotations), - Object::PE(obj) => (obj, self.1).load(annotations), - Object::Mach(obj) => (obj, self.1).load(annotations), - Object::Archive(obj) => (obj, self.1).load(annotations), - _ => Err(anyhow!("Unknown file format".to_string())), - } - } -} - -impl<'a> AnnoObject for (&Elf<'a>, &'a [u8]) { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> { - let elf = &self.0; - for sect in &elf.section_headers { - if sect.sh_type != goblin::elf::section_header::SHT_NOTE { - continue; - } - - if !elf - .shdr_strtab - .get(sect.sh_name) - .map_or(false, |r| r.ok() == Some(".note.compliance")) - { - continue; - } - - let addr = sect.sh_offset as usize; - let len = sect.sh_size as usize; - for annotation in Parser(&self.1[addr..(addr + len)]) { - annotations.insert(annotation?); - } - } - - Ok(()) - } -} - -impl<'a> AnnoObject for (&PE<'a>, &'a [u8]) { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> { - for section in &self.0.sections { - if Some(".debug_compliance") == section.real_name.as_deref() { - let addr = section.pointer_to_raw_data as usize; - let len = section.virtual_size as usize; - - for annotation in Parser(&self.1[addr..(addr + len)]) { - annotations.insert(annotation?); - } - } - } - Ok(()) - } -} - -impl<'a> AnnoObject for 
(&Mach<'a>, &'a [u8]) { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> { - match &self.0 { - Mach::Fat(obj) => (obj, self.1).load(annotations), - Mach::Binary(obj) => obj.load(annotations), - } - } -} - -impl<'a> AnnoObject for (&MultiArch<'a>, &'a [u8]) { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> { - for arch in self.0.iter_arches() { - let arch = arch?; - extract(arch.slice(self.1), annotations)?; - } - Ok(()) - } -} - -impl<'a> AnnoObject for MachO<'a> { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> { - for sections in self.segments.sections() { - for (section, data) in sections.flatten() { - if let (b"__DATA\0\0\0\0\0\0\0\0\0\0", b"__compliance\0\0\0\0") = - (§ion.segname, §ion.sectname) - { - for annotation in Parser(data) { - annotations.insert(annotation?); - } - } - } - } - - Ok(()) - } -} - -impl<'a> AnnoObject for (&Archive<'a>, &'a [u8]) { - fn load(&self, annotations: &mut AnnotationSet) -> Result<(), Error> { - for member in self.0.members() { - if let Ok(contents) = self.0.extract(member, self.1) { - let _ = extract(contents, annotations); - } - } - Ok(()) - } -} diff --git a/duvet/src/report/html.rs b/duvet/src/report/html.rs index 56da05ff..84364756 100644 --- a/duvet/src/report/html.rs +++ b/duvet/src/report/html.rs @@ -51,7 +51,10 @@ pub fn report_writer( w!(""); w!("
"); w!(r#""#); w!(""); w!(""); diff --git a/duvet/src/report/mod.rs b/duvet/src/report/mod.rs index 44eee6f2..32cb6717 100644 --- a/duvet/src/report/mod.rs +++ b/duvet/src/report/mod.rs @@ -115,7 +115,10 @@ impl Report { let contents: HashMap<_, _> = targets .par_iter() .map(|target| { - let contents = target.path.load(self.project.spec_path.as_deref()).unwrap(); + let spec_path = self.project.spec_path.as_deref(); + let path = target.path.local(spec_path); + let contents = target.path.load(spec_path).unwrap(); + let contents = duvet_core::file::SourceFile::new(path, contents).unwrap(); (target, contents) }) .collect(); diff --git a/duvet/src/specification/ietf.rs b/duvet/src/specification/ietf.rs index 7122d28b..5b507728 100644 --- a/duvet/src/specification/ietf.rs +++ b/duvet/src/specification/ietf.rs @@ -1,201 +1,54 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use super::{Section, Specification, Str}; -use crate::{sourcemap::LinesIter, Error}; -use core::ops::Deref; -use lazy_static::lazy_static; -use regex::Regex; - -lazy_static! 
{ - static ref SECTION_HEADER_RE: Regex = Regex::new(r"^(([A-Z]\.)?[0-9\.]+)\s+(.*)").unwrap(); - static ref APPENDIX_HEADER_RE: Regex = Regex::new(r"^Appendix ([A-Z]\.)\s+(.*)").unwrap(); - - /// Table of contents have at least 5 periods - static ref TOC_RE: Regex = Regex::new(r"\.{5,}").unwrap(); -} - -pub fn parse(contents: &str) -> Result { - let mut parser = Parser::default(); - - for line in LinesIter::new(contents) { - parser.on_line(line)?; - } - - let mut spec = parser.done()?; - - spec.format = super::Format::Ietf; - - Ok(spec) -} - -#[derive(Debug, Default)] -pub struct Parser<'a> { - spec: Specification<'a>, - state: ParserState<'a>, -} - -#[derive(Debug)] -pub enum ParserState<'a> { - Init, - Section { section: Section<'a>, indent: usize }, -} - -impl Default for ParserState<'_> { - fn default() -> Self { - Self::Init - } -} - -fn section_header(line: Str) -> Option
{ - let full_title = line; - if let Some(info) = SECTION_HEADER_RE.captures(&line) { - let id = info.get(1)?; - let title = info.get(3)?; - - if TOC_RE.is_match(title.as_str()) { - return None; - } - - let id = line.slice(id.range()).trim_end_matches('.'); - let id = match id.chars().next() { - Some('0'..='9') => format!("section-{}", id), - _ => format!("appendix-{}", id), - }; - let title = line.slice(title.range()).to_string(); - - Some(Section { - id, - title, - full_title, - lines: vec![], +use super::{Format, Line, Section, Specification, Str}; +use crate::Error; +use duvet_core::file::SourceFile; + +pub mod break_filter; +pub mod parser; +pub mod tokenizer; + +#[cfg(test)] +mod tests; + +pub fn parse(contents: &SourceFile) -> Result { + let tokens = tokenizer::tokens(contents); + let tokens = break_filter::break_filter(tokens); + let parser = parser::parse(tokens); + + let pos = |substr: &str| substr.as_ptr() as usize - contents.as_ptr() as usize; + let substr = |substr: &str, line: usize| { + let pos = pos(substr); + let value = &contents[pos..pos + substr.len()]; + Str { value, pos, line } + }; + + let sections = parser + .map(|section| { + let id = section.id.to_string(); + + let section = Section { + title: section.title.to_string(), + id: id.clone(), + full_title: substr(§ion.title, section.line), + lines: section + .lines + .into_iter() + .map(|(line, value)| { + let value = substr(&value, line); + Line::Str(value) + }) + .collect(), + }; + + (id, section) }) - } else if let Some(info) = APPENDIX_HEADER_RE.captures(&line) { - let id = info.get(1)?; - let title = info.get(2)?; + .collect(); - if TOC_RE.is_match(title.as_str()) { - return None; - } - - let id = line.slice(id.range()).trim_end_matches('.'); - let id = format!("appendix-{}", id); - let title = line.slice(title.range()).to_string(); - - Some(Section { - id, - title, - full_title, - lines: vec![], - }) - } else { - None - } + Ok(Specification { + title: None, + sections, + format: 
Format::Ietf, + }) } - -impl<'a> Parser<'a> { - pub fn on_line(&mut self, line: Str<'a>) -> Result<(), Error> { - // remove footer marker - if line.deref() == "\u{c}" { - return Ok(()); - } - - match core::mem::replace(&mut self.state, ParserState::Init) { - ParserState::Init => { - if let Some(section) = section_header(line) { - self.state = ParserState::Section { - section, - indent: usize::MAX, - }; - } - } - ParserState::Section { - mut section, - indent, - } => { - let line_indent = line.indentation(); - - // dedup whitespace - if line_indent == line.len() - && section.lines.last().map(|l| !l.is_empty()).unwrap_or(false) - { - section.lines.push(line.trim().into()); - - // most likely the footer/header - self.state = ParserState::Section { section, indent }; - - return Ok(()); - } - - if line_indent == 0 { - if let Some(new_section) = section_header(line) { - self.on_section(section, indent); - self.state = ParserState::Section { - section: new_section, - indent: usize::MAX, - }; - } else { - // most likely the footer/header - self.state = ParserState::Section { section, indent }; - } - - return Ok(()); - } - - section.lines.push(line.into()); - - self.state = ParserState::Section { - section, - indent: indent.min(line_indent), - }; - } - } - - Ok(()) - } - - fn on_section(&mut self, mut section: Section<'a>, indent: usize) { - for content in &mut section.lines { - if let super::Line::Str(content) = content { - if !content.is_empty() { - let range = indent..content.len(); - *content = content.slice(range); - } - } - } - - // remove last whitespace - if section.lines.last().map(|l| l.is_empty()).unwrap_or(false) { - section.lines.pop(); - } - - let id = section.id.clone(); - self.spec.sections.insert(id, section); - } - - pub fn done(mut self) -> Result, Error> { - match core::mem::replace(&mut self.state, ParserState::Init) { - ParserState::Init => Ok(self.spec), - ParserState::Section { section, indent } => { - self.on_section(section, indent); - 
Ok(self.spec) - } - } - } -} - -// macro_rules! ietf_test { -// ($name:ident, $file:expr) => { -// #[ignore] // TODO: Escaping of apostrophes has changed in Rust beta, so the Compliance snapshot tests are failing [rust-lang/rust#83079](https://github.com/rust-lang/rust/pull/83079) -// #[test] -// fn $name() { -// let res = parse(include_str!(concat!( -// env!("CARGO_MANIFEST_DIR"), -// "/../specs/", -// $file -// ))) -// .unwrap(); -// insta::assert_debug_snapshot!($file, res); -// } -// }; -// } diff --git a/duvet/src/specification/ietf/break_filter.rs b/duvet/src/specification/ietf/break_filter.rs new file mode 100644 index 00000000..ca90cbd9 --- /dev/null +++ b/duvet/src/specification/ietf/break_filter.rs @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::tokenizer::{Break, Token}; + +/// Filters out duplicate breaks, headers, and headers misclassified as contents +pub fn break_filter>(tokens: T) -> impl Iterator { + let mut break_ty = None; + tokens.filter(move |token| { + let prev_break = core::mem::take(&mut break_ty); + + match token { + Token::Section { .. } | Token::Appendix { .. } | Token::NamedSection { .. } => {} + Token::Break { ty, .. } => { + break_ty = Some(*ty); + + // dedupe breaks + if prev_break.is_some() { + return false; + } + } + Token::Content { .. 
} => { + // if we previously had a page break then ignore the next line - it's a header that + // didn't tokenize correctly + if matches!(prev_break, Some(Break::Page)) { + break_ty = Some(Break::Line); + return false; + } + } + Token::Header { value: _, line: _ } => { + // set up a break since we skipped a line + break_ty = Some(Break::Line); + return false; + } + } + + true + }) +} diff --git a/duvet/src/specification/ietf/parser.rs b/duvet/src/specification/ietf/parser.rs new file mode 100644 index 00000000..9ca7b446 --- /dev/null +++ b/duvet/src/specification/ietf/parser.rs @@ -0,0 +1,151 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::tokenizer::Token; +use core::fmt; +use duvet_core::file::Slice; + +pub fn parse>(tokens: T) -> Parser { + Parser { + section: None, + tokens: tokens.into_iter(), + } +} + +pub struct Parser { + section: Option
, + tokens: T, +} + +pub struct Section { + pub id: Id, + pub title: Slice, + pub line: usize, + pub lines: Vec<(usize, Slice)>, +} + +pub enum Id { + Section(Slice), + Appendix(Slice), + NamedSection(Slice), +} + +impl fmt::Display for Id { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Id::Section(id) => write!(f, "section-{id}"), + Id::Appendix(id) => write!(f, "appendix-{id}"), + // TODO slugify + Id::NamedSection(title) => write!(f, "named-{title}"), + } + } +} + +impl Section { + fn push(&mut self, line: usize, value: Slice) { + // don't push an empty first line + if self.lines.is_empty() && value.trim().is_empty() { + return; + } + + self.lines.push((line, value)); + } +} + +impl> Parser { + fn on_token(&mut self, token: Token) -> Option
{ + match token { + Token::Section { id, title, line } => { + let prev = self.flush(); + + self.section = Some(Section { + id: Id::Section(id), + title, + line, + lines: vec![], + }); + + prev + } + Token::Appendix { id, title, line } => { + let prev = self.flush(); + + self.section = Some(Section { + id: Id::Appendix(id), + title, + line, + lines: vec![], + }); + + prev + } + Token::NamedSection { title, line } => { + let prev = self.flush(); + + self.section = Some(Section { + id: Id::NamedSection(title.clone()), + title, + line, + lines: vec![], + }); + + prev + } + Token::Break { line, value, ty: _ } => { + if let Some(section) = self.section.as_mut() { + // just get the line offset + let trimmed = &value[0..0]; + let value = value.file().substr(trimmed).unwrap(); + section.push(line, value); + } + + None + } + Token::Content { line, value } => { + if let Some(section) = self.section.as_mut() { + section.push(line, value); + } + + None + } + Token::Header { value: _, line: _ } => { + // ignore headers + None + } + } + } + + fn flush(&mut self) -> Option
{ + let mut section = core::mem::take(&mut self.section)?; + + // trim any trailing lines + loop { + let Some((_lineno, line)) = section.lines.last() else { + break; + }; + + if !line.trim().is_empty() { + break; + } + + section.lines.pop(); + } + + Some(section) + } +} + +impl> Iterator for Parser { + type Item = Section; + + fn next(&mut self) -> Option { + loop { + let Some(token) = self.tokens.next() else { + return self.flush(); + }; + if let Some(section) = self.on_token(token) { + return Some(section); + } + } + } +} diff --git a/duvet/src/specification/ietf/snapshots.tar.gz b/duvet/src/specification/ietf/snapshots.tar.gz new file mode 100644 index 00000000..dec9ce39 --- /dev/null +++ b/duvet/src/specification/ietf/snapshots.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c39aefea4476c89a2f71a1885eb9b1d1b0549e8d2a3715a7c667ef7ec786e540 +size 573450240 diff --git a/duvet/src/specification/ietf/tests.rs b/duvet/src/specification/ietf/tests.rs new file mode 100644 index 00000000..ca643986 --- /dev/null +++ b/duvet/src/specification/ietf/tests.rs @@ -0,0 +1,465 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::tokenizer::{self, tokens, Token}; + +const HIGHEST_KNOWN_ID: usize = 9663; + +macro_rules! tests { + ($(($name:ident, $id:expr)),* $(,)?) 
=> { + $( + #[tokio::test] + async fn $name() { + test_range($id..$id + 100).await + } + )* + } +} + +tests!( + (rfc_30xx, 3000), + (rfc_31xx, 3100), + (rfc_32xx, 3200), + (rfc_33xx, 3300), + (rfc_34xx, 3400), + (rfc_35xx, 3500), + (rfc_36xx, 3600), + (rfc_37xx, 3700), + (rfc_38xx, 3800), + (rfc_39xx, 3900), +); + +tests!( + (rfc_40xx, 4000), + (rfc_41xx, 4100), + (rfc_42xx, 4200), + (rfc_43xx, 4300), + (rfc_44xx, 4400), + (rfc_45xx, 4500), + (rfc_46xx, 4600), + (rfc_47xx, 4700), + (rfc_48xx, 4800), + (rfc_49xx, 4900), +); + +tests!( + (rfc_50xx, 5000), + (rfc_51xx, 5100), + (rfc_52xx, 5200), + (rfc_53xx, 5300), + (rfc_54xx, 5400), + (rfc_55xx, 5500), + (rfc_56xx, 5600), + (rfc_57xx, 5700), + (rfc_58xx, 5800), + (rfc_59xx, 5900), +); + +tests!( + (rfc_60xx, 6000), + (rfc_61xx, 6100), + (rfc_62xx, 6200), + (rfc_63xx, 6300), + (rfc_64xx, 6400), + (rfc_65xx, 6500), + (rfc_66xx, 6600), + (rfc_67xx, 6700), + (rfc_68xx, 6800), + (rfc_69xx, 6900), +); + +tests!( + (rfc_70xx, 7000), + (rfc_71xx, 7100), + (rfc_72xx, 7200), + (rfc_73xx, 7300), + (rfc_74xx, 7400), + (rfc_75xx, 7500), + (rfc_76xx, 7600), + (rfc_77xx, 7700), + (rfc_78xx, 7800), + (rfc_79xx, 7900), +); + +tests!( + (rfc_80xx, 8000), + (rfc_81xx, 8100), + (rfc_82xx, 8200), + (rfc_83xx, 8300), + (rfc_84xx, 8400), + (rfc_85xx, 8500), + (rfc_86xx, 8600), + (rfc_87xx, 8700), + (rfc_88xx, 8800), + (rfc_89xx, 8900), +); + +tests!( + (rfc_90xx, 9000), + (rfc_91xx, 9100), + (rfc_92xx, 9200), + (rfc_93xx, 9300), + (rfc_94xx, 9400), + (rfc_95xx, 9500), + (rfc_96xx, 9600), + (rfc_97xx, 9700), + (rfc_98xx, 9800), + (rfc_99xx, 9900), +); + +async fn test_range(range: core::ops::Range) { + let mut saw_any = false; + for rfc in range { + saw_any |= test_rfc(rfc).await; + } + + assert!(saw_any, "missing RFC download - run `cargo xtask test`"); +} + +async fn test_rfc(rfc: usize) -> bool { + let etc = std::path::Path::new(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../target/www.rfc-editor.org" + )); + + // these RFCs don't have 
any sections + let empty = [ + 3005, 3099, 3129, 3199, 3232, 3268, 3299, 3364, 3442, 3494, 3499, 3599, 3818, + ]; + + // these RFCs have empty section titles + let empty_titles = [ + (3002, "4.1.1"), + (3002, "4.1.2"), + (3002, "4.1.3"), + (3002, "4.2.1"), + (3002, "4.2.2"), + (3002, "4.3.1"), + (3002, "4.3.2"), + (3002, "4.3.3"), + (3002, "4.3.4"), + (3002, "4.4.1"), + (3002, "4.4.2"), + (3002, "4.4.3"), + (3002, "4.5.1"), + (3002, "4.5.2"), + (3002, "4.5.3"), + (3002, "4.5.4"), + (3002, "4.5.5"), + (3002, "4.5.6"), + (3002, "4.6.1"), + (3002, "4.6.2"), + (3002, "4.7.1"), + (3002, "4.7.2"), + (3172, "A"), + (3258, "A"), + (3304, "2.1.1"), + (3304, "2.1.2"), + (3304, "2.1.3"), + (3304, "2.1.4"), + (3304, "2.1.5"), + (3304, "2.1.6"), + (3304, "2.1.7"), + (3304, "2.1.8"), + (3304, "2.1.9"), + (3304, "2.1.10"), + (3304, "2.1.11"), + (3304, "2.1.12"), + (3304, "2.2.1"), + (3304, "2.2.2"), + (3304, "2.2.3"), + (3304, "2.2.4"), + (3304, "2.2.5"), + (3304, "2.2.6"), + (3304, "2.2.7"), + (3304, "2.2.8"), + (3304, "2.2.9"), + (3304, "2.2.10"), + (3304, "2.2.11"), + (3304, "2.3.1"), + (3304, "2.3.2"), + (3304, "2.3.3"), + (3304, "2.3.4"), + (3332, "A"), + (3411, "A"), + (3552, "A"), + (4009, "B.1"), + (4009, "B.2"), + (4009, "B.3"), + (4009, "B.4"), + (4233, "A"), + (4269, "B.1"), + (4269, "B.2"), + (4269, "B.3"), + (4269, "B.4"), + (4523, "A"), + (4666, "A"), + (4951, "A"), + (4951, "B"), + (4951, "C"), + ]; + + // RFCs that use numbers for appendix IDs + let number_appendix_ids = [ + (3175, "1"), + (3946, "1"), + (3549, "1"), + (4258, "1"), + (4606, "1"), + ]; + + // RFCs that use roman numerals + let roman_appendix_ids = [(5357, "I")]; + + // RFCs that have indented sections + let indented_sections = [(3003, "4")]; + + // these RFCs skip/reorder sections + let skips = [ + (1050, "11.1"), + (1125, "11"), + (3090, "10"), + (3132, "4.1.2.4"), + (3134, "1.2.31"), + (3162, "2.3"), + (3186, "2.3.5"), + (3204, "3"), + (3208, "9.7.3"), + (3212, "10"), + (3234, "1.4"), + (3257,
"8"), + (3258, "7"), + (3261, "F1"), + (3261, "25"), + (3284, "5.6"), + (3296, "5.6"), + (3326, "8"), + (3326, "7"), + (3326, "9"), + (3331, "11.0"), + (3348, "5"), + (3383, "10"), + (3428, "16"), + (3475, "9"), + (3509, "10"), + (3568, "8"), + (3608, "F1"), + (3608, "6.4.2"), + (3608, "7"), + (3671, "3.13"), + (3701, "5"), + (3810, "5.1.7"), + (3825, "6"), + (3868, "7.3.4"), + (3877, "3.3.5"), + (3929, "10"), + (4037, "16"), + (4160, "4.6"), + (4469, "9"), + (4540, "3.5.16"), + (4540, "5.3.17"), + (4604, "8"), + (4715, "10"), + (4842, "18"), + (4853, "6"), + (5013, "10"), + (5322, "7"), + (5570, "5.1.5"), + (5805, "4.4"), + (5849, "6"), + (5850, "5"), + (5858, "8"), + (5892, "8"), + (6219, "11"), + (6484, "1.5.4"), + (6484, "5.4.8"), + (6484, "5.6"), + (6485, "9"), + (6722, "5"), + (6730, "12"), + ]; + + // these RFCs have duplicate sections + let duplicate = [ + (3063, "6.2.1"), + (3063, "A.5.2"), + (3093, "3.2"), + (3119, "11"), + (3131, "10"), + (3250, "3"), + (3284, "5.4"), + (3302, "6"), + (3414, "12.1"), + (3418, "6.1"), + (3476, "8"), + (3562, "3"), + (3640, "A"), + (3745, "6"), + (3785, "6.1"), + (3946, "1"), // uses both Appendix and Annex + (4511, "C.2.1"), + (4520, "A.8"), + (4606, "1"), // uses both Appendix and Annex + (4949, "7"), + (5570, "2.4.2"), + (5755, "10.2"), + ]; + + // _really_ messed up RFCs + let janky_sections = [ + (3015, "A"), + (3113, "8"), + (3113, "9"), + (3122, "A"), + (3133, "1"), + (3134, "1"), + (3411, "A"), + (3525, "A.1"), + (3525, "I"), + (3730, "1"), // Appendices repeat section counters + (3730, "B"), // Appendices repeat section counters + (5038, "B"), // Appendices repeat B and C + ]; + + println!("rfc{rfc}"); + + // ignore any that we haven't snapshotted + if HIGHEST_KNOWN_ID < rfc { + return true; + } + + let Ok(file) = duvet_core::vfs::read_string(etc.join(format!("rfc{rfc}.txt"))).await else { + println!(" NOT FOUND"); + return false; + }; + + let tokens = tokens(&file).collect::>(); + + 
insta::assert_debug_snapshot!(format!("rfc{rfc}_tokens"), tokens); + + // don't do any checks right now + if ERRORS.iter().any(|e| e.contains(&rfc)) { + return true; + } + + let mut sections = vec![]; + + let mut prev_section = None; + + let mut check_section = |id: &str, title: &str, is_section: bool| { + assert!(!id.is_empty()); + + let prev = core::mem::replace(&mut prev_section, Some(id.to_string())); + + if janky_sections.contains(&(rfc, id)) { + return; + } + + assert_eq!(empty_titles.contains(&(rfc, id)), title.is_empty()); + + let Some(prev) = prev else { + if is_section { + assert!(["1", "1.0"].contains(&id)); + } + return; + }; + + if *prev == *id { + assert!(duplicate.contains(&(rfc, id)), "duplicate section: {id:?}"); + return; + } + + let is_ok = tokenizer::section_id_monotonic(&prev, id); + + let key = &(rfc, id); + let expected = !(skips.contains(key) + || indented_sections.contains(key) + || number_appendix_ids.contains(key) + || roman_appendix_ids.contains(key)); + + assert_eq!( + is_ok, expected, + "unexpected section number: prev={prev:?} current={id:?}" + ); + }; + + let mut line = 1; + for token in tokens { + // make sure we don't drop any lines + assert_eq!(line, token.line()); + line = token.line() + 1; + + match &token { + Token::Section { id, title, .. } => { + println!(" SECTION(id={id:?} title={title:?})"); + + check_section(id, title, true); + + sections.push(token); + } + Token::Appendix { id, title, .. } => { + println!(" APPENDIX(id={id:?} title={title:?})"); + + check_section(id, title, false); + + sections.push(token); + } + Token::NamedSection { title, .. } => { + println!(" SECTION(title={title:?})"); + // TODO + } + Token::Break { .. } => { + // TODO + } + Token::Content { .. } => { + // TODO + } + Token::Header { .. 
} => { + // TODO + } + } + } + + assert_eq!( + sections.is_empty(), + empty.contains(&rfc), + "RFC sections is empty" + ); + + true +} + +// these currently have parsing errors +static ERRORS: &[&[usize]] = &[ + &[ + 19, 70, 77, 98, 107, 155, 172, 194, 199, 230, 240, 254, 271, 293, 329, 330, 331, 332, 333, + 354, + // TODO gap + ], + &[ + 768, 778, 782, 783, 787, 789, 799, 800, 802, 803, 810, 869, 876, 887, 891, 892, 896, 899, + 904, 911, 914, 994, 995, 999, 1001, 1002, 1005, 1014, 1035, 1038, 1045, 1076, 1099, 1123, + 1138, 1142, 1148, 1163, 1180, 1190, 1195, 1199, 1244, 1245, 1246, + // TODO gap + ], + &[ + 3064, // The first section is `1.0.Introduction` + 3502, // This starts on 6.3.11 + 3877, // The sections embed sequence diagrams + ], + &[ + 5054, // this has a section with a title with lots of spaces + 5165, // this RFC has poorly formatted sections + ], + &[ + 6503, // this embeds messages into the section + 6504, // this embeds messages into the section + 6917, // this embeds messages into the section + ], + &[ + 7058, // This RFC embeds sequence diagrams in the sections + ], + &[ + 9592, // This RFC embeds another RFC in the appendix, which fails the monotonic check + ], +]; diff --git a/duvet/src/specification/ietf/tokenizer.rs b/duvet/src/specification/ietf/tokenizer.rs new file mode 100644 index 00000000..d3e308ca --- /dev/null +++ b/duvet/src/specification/ietf/tokenizer.rs @@ -0,0 +1,705 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use core::fmt; +use duvet_core::{ + ensure, + file::{Slice, SourceFile}, +}; +use once_cell::sync::Lazy; +use regex::Regex; + +macro_rules!
regex { + ($str:literal) => {{ + static R: Lazy = Lazy::new(|| Regex::new($str).unwrap()); + &*R + }}; +} + +#[derive(Clone, Copy, Debug)] +pub enum Break { + Line, + Page, +} + +#[derive(Clone)] +pub enum Token { + Section { + id: Slice, + title: Slice, + line: usize, + }, + Appendix { + id: Slice, + title: Slice, + line: usize, + }, + NamedSection { + title: Slice, + line: usize, + }, + Break { + value: Slice, + ty: Break, + line: usize, + }, + Content { + value: Slice, + line: usize, + }, + Header { + value: Slice, + line: usize, + }, +} + +impl Token { + #[allow(dead_code)] + pub fn line(&self) -> usize { + match self { + Token::Section { line, .. } => *line, + Token::Appendix { line, .. } => *line, + Token::NamedSection { line, .. } => *line, + Token::Break { line, .. } => *line, + Token::Content { line, .. } => *line, + Token::Header { line, .. } => *line, + } + } + + fn section( + id: Slice, + title: Slice, + line: usize, + force_appendix: bool, + ) -> Self { + if !force_appendix && id.starts_with(char::is_numeric) { + Token::Section { id, title, line } + } else { + Token::Appendix { id, title, line } + } + } +} + +impl fmt::Debug for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Section { id, title, line } => { + write!(f, " SECTION#{}(id={}, title={})", line, id, title) + } + Self::Appendix { id, title, line } => { + write!(f, "APPENDIX#{}(id={}, title={})", line, id, title) + } + Self::NamedSection { title, line } => { + write!(f, " SECTION#{}(title={})", line, title) + } + Self::Break { + line, + ty: Break::Page, + value: _, + } => write!(f, " BREAK#{}", line), + Self::Break { + line, + ty: Break::Line, + value: _, + } => write!(f, " NEWLINE#{}", line), + Self::Content { value, line } => write!(f, " CONTENT#{}({})", line, value), + Self::Header { value, line } => write!(f, " HEADER#{}({})", line, value), + } + } +} + +pub fn tokens(contents: &SourceFile) -> impl Iterator + '_ { + let tokens = lines(contents); + 
let tokens = page_breaks(tokens); + let tokens = line_breaks(tokens); + let tokens = sections(tokens); + let tokens = headers(tokens); + named_sections(tokens) +} + +macro_rules! expect_contents { + ($token:expr) => { + match $token { + Token::Content { value, line } => (value, line), + token => return token, + } + }; +} + +/// Transforms file contents into lines +fn lines(contents: &SourceFile) -> impl Iterator + '_ { + contents.lines().enumerate().map(move |(line, value)| { + // line numbers start at 1 + let line = line + 1; + let value = contents.substr(value).unwrap(); + Token::Content { value, line } + }) +} + +/// Looks for page breaks in `Content` tokens +fn page_breaks>(i: I) -> impl Iterator { + i.map(|token| { + let (value, line) = expect_contents!(token); + + if &*value == "\u{C}" { + return Token::Break { + value, + line, + ty: Break::Page, + }; + } + + Token::Content { value, line } + }) +} + +/// Looks for line breaks in `Content` tokens +fn line_breaks>(i: I) -> impl Iterator { + i.map(|token| { + let (value, line) = expect_contents!(token); + + if value.is_empty() || value.trim().is_empty() { + return Token::Break { + value, + line, + ty: Break::Line, + }; + } + + Token::Content { value, line } + }) +} + +/// Looks for headers/footers +fn headers>(i: I) -> impl Iterator { + i.map(|token| { + let (value, line) = expect_contents!(token); + + let beginning_patterns = [ + regex!(r"^RFC [1-9][0-9]* "), + regex!(r"^\[Page [1-9][0-9]*\]"), + ]; + + let trim_start = value.trim_start(); + if trim_start.len() == value.len() { + for pattern in beginning_patterns { + if pattern.is_match(&value) { + return Token::Header { value, line }; + } + } + } + + let ending_patterns = [regex!(r" \[Page [1-9][0-9]*\]$")]; + + let trim_end = value.trim_end(); + if trim_end.len() == value.len() { + for pattern in ending_patterns { + if pattern.is_match(&value) { + return Token::Header { value, line }; + } + } + } + + Token::Content { value, line } + }) +} + +fn 
named_sections>(i: I) -> impl Iterator { + struct NamedSections> { + state: State, + queue: Queue, + iter: I, + } + + impl> Iterator for NamedSections { + type Item = Token; + + fn next(&mut self) -> Option { + loop { + if let Some(token) = self.queue.next() { + return Some(token); + } + + if let Some(token) = self.iter.next() { + self.state.on_token(token, &mut self.queue); + } else { + self.state.flush(&mut self.queue); + return self.queue.next(); + } + } + } + } + + enum Queue { + Zero, + One(Token), + Two(Token, Token), + Three(Token, Token, Token), + } + + impl Queue { + fn push(&mut self, token: Token) { + match core::mem::replace(self, Self::Zero) { + Self::Three(_, _, _) => { + panic!("at capacity"); + } + Self::Two(a, b) => { + *self = Self::Three(a, b, token); + } + Self::One(a) => { + *self = Self::Two(a, token); + } + Self::Zero => { + *self = Self::One(token); + } + } + } + + fn next(&mut self) -> Option { + match core::mem::replace(self, Self::Zero) { + Self::Three(a, b, c) => { + *self = Self::Two(b, c); + Some(a) + } + Self::Two(a, b) => { + *self = Self::One(b); + Some(a) + } + Self::One(a) => Some(a), + Self::Zero => None, + } + } + } + + enum State { + Init, + // we have a line break + First { + break_token: Token, + }, + // we have a line break and a named section - just waiting on another line break + Second { + break_token: Token, + title: Slice, + line: usize, + }, + } + + impl State { + fn on_token(&mut self, token: Token, queue: &mut Queue) { + debug_assert!(matches!(queue, Queue::Zero)); + + match (core::mem::replace(self, Self::Init), token) { + ( + Self::Init, + token @ Token::Break { + ty: Break::Line, .. 
+ }, + ) => { + *self = Self::First { break_token: token }; + } + (Self::Init, token) => { + queue.push(token); + } + (Self::First { break_token }, Token::Content { value, line }) => { + let patterns = [ + "Acknowledgments", + "Acknowledgement", + "Acknowledgements", + "Index", + "Author's Address", + "Authors' Addresses", + "Normative References", + "Informative References", + "References", + "REFERENCES", + "AUTHORS' ADDRESSES", + "Full Copyright Statement", + "Security Considerations", + "Intellectual Property", + "Intellectual Property Statement", + "Working Group Information", + "Contributors", + "Editors' Addresses", + "IANA Considerations", + "Abstract", + "Status of this Memo", + "Status of This Memo", + "Copyright Notice", + "Table of Contents", + "Appendix", + ]; + + if patterns.contains(&&*value) { + *self = Self::Second { + break_token, + title: value, + line, + }; + } else { + queue.push(break_token); + queue.push(Token::Content { value, line }); + } + } + ( + Self::First { break_token }, + token @ Token::Break { + ty: Break::Line, .. + }, + ) => { + queue.push(break_token); + *self = Self::First { break_token: token }; + } + (Self::First { break_token }, token) => { + queue.push(break_token); + queue.push(token); + } + ( + Self::Second { + break_token, + title, + line, + }, + token @ Token::Break { + ty: Break::Line, .. 
+ }, + ) => { + let title = Token::NamedSection { title, line }; + queue.push(break_token); + queue.push(title); + queue.push(token); + } + ( + Self::Second { + break_token, + title, + line, + }, + token, + ) => { + let title = Token::Content { value: title, line }; + queue.push(break_token); + queue.push(title); + queue.push(token); + } + } + } + + fn flush(&mut self, queue: &mut Queue) { + match core::mem::replace(self, Self::Init) { + Self::Init => {} + Self::First { break_token } => { + queue.push(break_token); + } + Self::Second { + break_token, + title, + line, + } => { + queue.push(break_token); + queue.push(Token::Content { value: title, line }); + } + } + } + } + + NamedSections { + state: State::Init, + queue: Queue::Zero, + iter: i, + } +} + +fn sections>(i: I) -> impl Iterator { + Sections::new(i) +} + +#[derive(Debug)] +struct Sections> { + tokens: T, + was_break: bool, + prev_section: Option>, +} + +impl> Sections { + pub fn new(tokens: T) -> Self { + Self { + tokens, + was_break: false, + prev_section: None, + } + } + + fn on_token(&mut self, token: Token) -> Token { + let token = self.on_token_impl(token); + + self.was_break = matches!(token, Token::Break { .. }); + + if let Token::Section { id, .. } = &token { + self.prev_section = Some(id.clone()); + } + + if let Token::Appendix { id, .. 
} = &token { + self.prev_section = Some(id.clone()); + } + + token + } + + fn on_token_impl(&mut self, token: Token) -> Token { + let (value, line) = expect_contents!(token); + + let mut force_appendix = false; + let mut section_candidate = &*value; + + for prefix in ["Appendix ", "APPENDIX ", "Annex "] { + if let Some(value) = value.strip_prefix(prefix) { + section_candidate = value; + force_appendix = true; + break; + } + } + + if force_appendix { + let candidates = [ + regex!(r"^([A-Z])$"), + regex!(r"^([A-Z])\.$"), + regex!(r"^([A-Z])\.\s+(.*)"), + regex!(r"^([A-Z]):\s+(.*)"), + regex!(r"^([A-Z]) :\s+(.*)"), + regex!(r"^([A-Z]) -\s+(.*)"), + regex!(r"^([A-Z]) --\s+(.*)"), + regex!(r"^([A-Z])\s+(.*)"), + ]; + + for candidate in candidates { + if let Some(section) = candidate.captures(section_candidate) { + let id = section.get(1).unwrap(); + let id = §ion_candidate[id.range()]; + + let title = if let Some(title) = section.get(2) { + section_candidate[title.range()].trim() + } else { + &id[id.len()..] 
+ }; + + if !self.section_check_candidate(id, title) { + continue; + } + + let id = value.file().substr(id).unwrap(); + let title = value.file().substr(title).unwrap(); + + return Token::section(id, title, line, true); + } + } + } + + let candidates = [regex!(r"^(([A-Z]\.?)?[0-9\.]+):?\s+(.*)")]; + + for candidate in candidates { + if let Some(section) = candidate.captures(section_candidate) { + let id = section.get(1).unwrap(); + let id = §ion_candidate[id.range()].trim_end_matches('.'); + + let title = section.get(3).unwrap(); + let title = §ion_candidate[title.range()].trim(); + + if self.section_check_candidate(id, title) { + let id = value.file().substr(id).unwrap(); + let title = value.file().substr(title).unwrap(); + + return Token::section(id, title, line, force_appendix); + } + } + } + + if regex!(r"^(([A-Z]\.)?[0-9\.]+)$").is_match(section_candidate) { + let id = section_candidate.trim_end_matches('.'); + + if self.section_check_candidate(id, "") { + let id = value.file().substr(id).unwrap(); + + let title = value.file().substr(&id[id.len()..]).unwrap(); + + return Token::section(id, title, line, force_appendix); + } + } + + Token::Content { value, line } + } + + fn section_check_candidate(&self, id: &str, title: &str) -> bool { + ensure!(Self::section_check_toc(title), false); + ensure!(Self::section_check_weird_title(title), false); + + // if we have a possibly weird title, then use a monotonicity check + let check_monotonic = !Self::section_check_possible_weird_title(title); + + self.section_check_id(id, check_monotonic) + } + + fn section_check_id(&self, id: &str, check_monotonic: bool) -> bool { + for res in parse_id(id) { + ensure!(res.is_ok(), false); + } + + // if we previously had a break then it's likely a valid section + if self.was_break && !check_monotonic { + return true; + } + + let Some(prev) = self.prev_section.as_ref() else { + // if we don't have a section then make sure the first one is `1` + return ["1", "1.0"].contains(&id); + }; + + 
section_id_monotonic(prev, id) + } + + fn section_check_toc(title: &str) -> bool { + // try to detect if this is a Table of Contents entry - they usually have period + // separators + ensure!(!title.contains("....."), false); + ensure!(!title.contains(". . ."), false); + ensure!(!title.contains(" . . "), false); + + true + } + + fn section_check_weird_title(title: &str) -> bool { + // try to filter out weird titles + ensure!(!title.starts_with(';'), false); + ensure!(!title.ends_with(['{', '[', '(', ';']), false); + + // check if the title contains too much spacing + ensure!(!title.trim().contains(" "), false); + + true + } + + fn section_check_possible_weird_title(title: &str) -> bool { + // try to filter out weird titles + ensure!(!title.trim_end_matches('|').contains("|"), false); + + true + } +} + +pub(super) fn section_id_monotonic(prev: &str, current: &str) -> bool { + ensure!(prev != current, false); + + let prev_parts = parse_id(prev); + let current_parts = parse_id(current); + + for (idx, (prev_part, current_part)) in prev_parts.zip(current_parts).enumerate() { + let Some(prev_part): Option = prev_part.ok() else { + return false; + }; + let Some(current_part): Option = current_part.ok() else { + return false; + }; + + // only the first part is allowed to be a letter + if idx > 0 { + ensure!(matches!(current_part, Part::Num(_)), false); + } + + // no need to keep comparing the parts + if prev_part.is_next(¤t_part) { + break; + } + + // the current part can't be less than the previous + ensure!(prev_part == current_part, false); + } + + true +} + +impl> Iterator for Sections { + type Item = Token; + + fn next(&mut self) -> Option { + let token = self.tokens.next()?; + let token = self.on_token(token); + Some(token) + } +} + +fn parse_id(id: &str) -> impl Iterator> + '_ { + let mut digit_offset = 0; + + for (idx, c) in id.char_indices() { + if c.is_ascii_digit() { + digit_offset = idx; + break; + } + } + + let (prefix, digits) = id.split_at(digit_offset); + +
let prefix = if prefix.is_empty() { + None + } else { + Some(prefix.trim_end_matches('.').parse()) + }; + + prefix + .into_iter() + .chain(digits.split('.').map(|v| v.parse())) + .enumerate() + .map(|(idx, part)| { + let part = part?; + + if idx == 0 { + if let Part::Num(num) = part { + ensure!(num > 0, Err(())); + } + } + + // only the first part is allowed to be a letter + if idx > 0 { + ensure!(matches!(part, Part::Num(_)), Err(())); + } + + Ok(part) + }) +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +enum Part { + Num(u8), + Appendix(char), +} + +impl Part { + fn is_next(&self, other: &Self) -> bool { + match (self, other) { + (Part::Num(a), Part::Num(b)) => (*a as usize) + 1 == *b as usize, + (Part::Num(_), Part::Appendix(a)) => *a == 'A', + (Part::Appendix(_), Part::Num(_)) => false, + (Part::Appendix(a), Part::Appendix(b)) => (*a as u32 + 1) == *b as u32, + } + } +} + +impl core::str::FromStr for Part { + type Err = (); + + fn from_str(s: &str) -> Result { + if let Ok(v) = s.parse() { + // RFCs don't exceed this value + ensure!(v <= 199, Err(())); + + return Ok(Self::Num(v)); + } + + ensure!(s.len() == 1, Err(())); + + let c = s.chars().next().unwrap(); + ensure!(c.is_ascii_uppercase(), Err(())); + + Ok(Self::Appendix(c)) + } +} diff --git a/duvet/src/specification/mod.rs b/duvet/src/specification/mod.rs index 8fcb75e2..bfbc98c5 100644 --- a/duvet/src/specification/mod.rs +++ b/duvet/src/specification/mod.rs @@ -9,6 +9,7 @@ use core::{ ops::{Deref, Range}, str::FromStr, }; +use duvet_core::file::SourceFile; use std::collections::HashMap; pub mod ietf; @@ -89,7 +90,7 @@ impl fmt::Display for Format { } impl Format { - pub fn parse(self, contents: &str) -> Result { + pub fn parse(self, contents: &SourceFile) -> Result { let spec = match self { Self::Auto => { // Markdown MAY start with a header (#), @@ -149,15 +150,6 @@ pub enum Line<'a> { Break, } -impl Line<'_> { - pub fn is_empty(&self) -> bool { - match self { - Self::Str(s) => 
s.is_empty(), - Self::Break => true, - } - } -} - impl<'a> From> for Line<'a> { fn from(s: Str<'a>) -> Self { Self::Str(s) diff --git a/integration/snapshots/h3.snap b/integration/snapshots/h3.snap index 8cfce600..14e54c02 100644 --- a/integration/snapshots/h3.snap +++ b/integration/snapshots/h3.snap @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b01b4e38a484f631375a9a0b419aaff5a65d4648c3ea0b2580ad748f218acc0 -size 598588 +oid sha256:a46aa96fc1de7b5505eb9f1717aad530481974dc01200de7be97cda6d26cd40b +size 672704 diff --git a/integration/snapshots/s2n-quic.snap b/integration/snapshots/s2n-quic.snap index 470c27d6..5f179b90 100644 --- a/integration/snapshots/s2n-quic.snap +++ b/integration/snapshots/s2n-quic.snap @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7096ec9263bc04bc1505495357d6204c96d10d3785763489b9f3fa2cdbe3d4d9 -size 5584556 +oid sha256:8b0850fdbedf4f5d6008ea784bb5e92eb8500fc772121c22af1794cdd8e24b97 +size 6255510 diff --git a/integration/snapshots/s2n-tls.snap b/integration/snapshots/s2n-tls.snap index e0ad840f..c3fae6f1 100644 --- a/integration/snapshots/s2n-tls.snap +++ b/integration/snapshots/s2n-tls.snap @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7039f0469e2fcd1ee6ff3ac0c71a6722b8456b1d7d405270ce554ac71f4f1d88 -size 2890616 +oid sha256:6fb408b428c3968f5b89ea910b64e2333f3b0d77e0248ffe0cfd1ce04fd4a758 +size 3217571 diff --git a/xtask/src/tests.rs b/xtask/src/tests.rs index 41789190..bba81445 100644 --- a/xtask/src/tests.rs +++ b/xtask/src/tests.rs @@ -31,7 +31,14 @@ impl Tests { let default_tests = self.default_tests.is_enabled(true); + self.download_rfcs(sh)?; + if self.unit.is_enabled(default_tests) { + if !sh.path_exists("duvet/src/specification/ietf/snapshots") { + let _dir = sh.push_dir("duvet/src/specification/ietf"); + cmd!(sh, "tar -xf snapshots.tar.gz").run()?; + } + cmd!(sh, "cargo test").run()?; } @@ -42,6 +49,37 @@ impl Tests { Ok(()) } + fn download_rfcs(&self, sh: 
&Shell) -> Result { + let url = "https://www.rfc-editor.org/rfc/tar/RFC-all.tar.gz"; + + let dir = "target/www.rfc-editor.org"; + sh.create_dir(dir)?; + let _dir = sh.push_dir(dir); + + let tar_gz = Path::new("RFC-all.tar.gz"); + if !sh.path_exists(tar_gz) { + eprintln!("downloading {url}"); + cmd!(sh, "curl --fail --output {tar_gz} {url}").run()?; + cmd!(sh, "tar -xf {tar_gz}").run()?; + } + + for file in sh.read_dir(".")? { + if file.ends_with(tar_gz) { + continue; + } + + if let Some(ext) = file.extension().and_then(|v| v.to_str()) { + if ext != "txt" { + sh.remove_path(file)?; + } + } else { + sh.remove_path(file)?; + } + } + + Ok(()) + } + fn integration(&self, sh: &Shell, bin: &Path) -> Result { let tests = sh.read_dir("integration")?;