Skip to content

Commit b5ba3c9

Browse files
committed
rustdoc: add HtmlRemover and Plain
1 parent 4c0f500 commit b5ba3c9

File tree

2 files changed

+156
-1
lines changed

2 files changed

+156
-1
lines changed

src/librustdoc/html/format.rs

+122-1
Original file line numberDiff line numberDiff line change
@@ -1311,7 +1311,94 @@ impl clean::BareFunctionDecl {
13111311
}
13121312
}
13131313

1314-
// Implements Write but only counts the bytes "written".
1314+
/// This is a simplified HTML processor, intended for counting the number of characters
1315+
/// of text that a stream of HTML is equivalent to. This is used to calculate the width
1316+
/// (in characters) of a function declaration, to decide whether to line-wrap it like
1317+
/// rustfmt would do. It's only valid for use with HTML emitted from within this module,
1318+
/// so it is intentionally not pub(crate).
1319+
///
1320+
/// This makes some assumptions that are specifically tied to the HTML emitted in format.rs:
1321+
/// - Whitespace is significant.
1322+
/// - All tags display their contents as text.
1323+
/// - Each call to write() contains a sequence of bytes that is valid UTF-8 on its own.
1324+
/// - All '<' in HTML attributes are escaped.
1325+
/// - HTML attributes are quoted with double quotes.
1326+
/// - The only HTML entities used are `&lt;`, `&gt;`, `&amp;`, `&quot;`, and `&#39;`.
1327+
#[derive(Debug, Clone)]
struct HtmlRemover<W: fmt::Write> {
    /// Sink that receives the text left over once tags are dropped and entities decoded.
    inner: W,
    /// Current parser position; kept between `write_str` calls, so a tag or an
    /// entity may be split across several writes.
    state: HtmlTextCounterState,
}

impl<W: fmt::Write> HtmlRemover<W> {
    /// Wraps `w`; the remover starts out in plain-text state.
    fn new(w: W) -> Self {
        HtmlRemover { inner: w, state: HtmlTextCounterState::Text }
    }
}

// A state machine that tracks our progress through the HTML.
#[derive(Debug, Clone, Copy)]
enum HtmlTextCounterState {
    /// Outside of any tag or entity: characters are forwarded to the sink.
    Text,
    /// After a `&`: the number of name bytes collected so far, and a small
    /// buffer holding them (the longest accepted name, `quot`, is 4 bytes).
    Entity(u8, [u8; 4]),
    /// Between `<` and `>`: characters are discarded.
    Tag,
}

impl<W: fmt::Write> fmt::Write for HtmlRemover<W> {
    /// Forwards the text content of `s` to the inner writer, dropping tags and
    /// decoding the entities `&lt;`, `&gt;`, `&amp;`, `&quot;` and `&#39;`.
    ///
    /// # Errors
    ///
    /// Returns `fmt::Error` when
    /// - a `<` appears inside a tag,
    /// - an entity name is unknown, longer than 4 bytes, or contains a
    ///   non-ASCII character,
    /// - the inner writer fails.
    ///
    /// On error the state is left as it was before the offending character.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        use HtmlTextCounterState::*;
        for c in s.chars() {
            // The state is `Copy`, so match on it by value and store the successor;
            // every error path returns before the assignment happens.
            self.state = match (self.state, c) {
                (Text, '<') => Tag,
                (Text, '&') => Entity(0, [0; 4]),
                (Text, _) => {
                    self.inner.write_char(c)?;
                    Text
                }
                // Note: `>` can occur in attribute values, but we always escape
                // them internally, so we don't need an extra state for
                // "in attribute value."
                // https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attributes
                (Tag, '>') => Text,
                (Tag, '<') => return Err(fmt::Error),
                // Within a tag, do nothing.
                (Tag, _) => Tag,
                // Finish an entity. The name is compared as raw bytes, so there
                // is no UTF-8 validation step that could panic.
                (Entity(len, name), ';') => {
                    let decoded = match &name[..usize::from(len)] {
                        b"lt" => '<',
                        b"gt" => '>',
                        b"amp" => '&',
                        b"quot" => '"',
                        b"#39" => '\'',
                        _ => return Err(fmt::Error),
                    };
                    self.inner.write_char(decoded)?;
                    Text
                }
                // Read one character of an entity name.
                (Entity(len, mut name), _) => {
                    let idx = usize::from(len);
                    // Every accepted name is pure ASCII. Rejecting anything else
                    // here keeps the `as u8` cast below lossless; a truncating
                    // cast would let e.g. U+016C alias `l`.
                    if idx >= name.len() || !c.is_ascii() {
                        return Err(fmt::Error);
                    }
                    name[idx] = c as u8;
                    Entity(len + 1, name)
                }
            };
        }
        Ok(())
    }
}
1390+
1391+
/// This generates the plain text form of a marked-up HTML input, using HtmlRemover.
1392+
struct Plain<D: fmt::Display>(D);
1393+
1394+
impl<D: fmt::Display> fmt::Display for Plain<D> {
1395+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1396+
let mut remover = HtmlRemover::new(f);
1397+
write!(&mut remover, "{}", self.0)
1398+
}
1399+
}
1400+
1401+
/// Implements Write but only counts the bytes "written".
13151402
struct WriteCounter(usize);
13161403

13171404
impl std::fmt::Write for WriteCounter {
@@ -1714,3 +1801,37 @@ pub(crate) fn display_fn(
17141801

17151802
WithFormatter(Cell::new(Some(f)))
17161803
}
1804+
1805+
#[test]
fn test_html_remover() {
    use std::fmt::Write;

    /// Feeds `input` through a fresh `HtmlRemover` and checks the collected text.
    fn assert_removed_eq(input: &str, output: &str) {
        let mut remover = HtmlRemover::new(String::new());
        write!(&mut remover, "{}", input).unwrap();
        assert_eq!(&remover.inner, output);
    }

    assert_removed_eq("a<a href='https://example.com'>b", "ab");
    assert_removed_eq("alpha &lt;bet&gt;", "alpha <bet>");
    assert_removed_eq("<a href=\"&quot;\">", "");
    assert_removed_eq("<tag>&gt;</tag>text&lt;<tag>", ">text<");
    assert_removed_eq("&quot;&#39;", "\"'");

    // Unknown entity name.
    let mut remover = HtmlRemover::new(String::new());
    assert!(write!(&mut remover, "&ent;").is_err());

    // Entity name longer than the four-byte name buffer.
    let mut remover = HtmlRemover::new(String::new());
    assert!(write!(&mut remover, "&entity").is_err());

    // A second `&` is merely stored as a name byte, so a bare "&&" is accepted
    // as an unfinished entity; the error only surfaces at the terminating `;`.
    let mut remover = HtmlRemover::new(String::new());
    assert!(write!(&mut remover, "&&;").is_err());

    // `<` while already inside a tag.
    let mut remover = HtmlRemover::new(String::new());
    assert!(write!(&mut remover, "<open <tag").is_err());
}
1832+
1833+
#[test]
fn test_plain() {
    // `Plain` is a tuple struct without a `new` constructor, so build it directly.
    let d = Plain("<strong>alpha</strong> &lt;bet&gt;");
    assert_eq!(&d.to_string(), "alpha <bet>");
}

src/librustdoc/html/tests.rs

+34
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,37 @@ fn href_relative_parts_root() {
4848
let fqp = &[sym::std];
4949
assert_relative_path(&[sym::std], relative_to_fqp, fqp);
5050
}
51+
52+
#[test]
fn test_html_remover() {
    use super::format::HtmlRemover;
    use std::fmt::Write;

    // Inputs that must be accepted, paired with the text expected after stripping.
    let accepted = [
        ("a<a href='https://example.com'>b", "ab"),
        ("alpha &lt;bet&gt;", "alpha <bet>"),
        ("<a href=\"&quot;\">", ""),
        ("<tag>&gt;</tag>text&lt;<tag>", ">text<"),
        ("&quot;&#39;", "\"'"),
    ];
    for (input, output) in accepted {
        let mut remover = HtmlRemover::new(String::new());
        write!(&mut remover, "{}", input).unwrap();
        assert_eq!(&remover.into_inner(), output);
    }

    // Inputs that must be rejected: an unknown entity, an over-long entity
    // name, and a `<` inside a tag.
    for rejected in ["&ent;", "&longentity", "<open <tag"] {
        let mut remover = HtmlRemover::new(String::new());
        assert!(write!(&mut remover, "{}", rejected).is_err());
    }
}
78+
79+
#[test]
fn test_plain() {
    use super::format::Plain;

    // Tags disappear and entities are decoded in the rendered string.
    let rendered = Plain("<strong>alpha</strong> &lt;bet&gt;").to_string();
    assert_eq!(&rendered, "alpha <bet>");
}

0 commit comments

Comments
 (0)