Commit 6ef6986

xml5ever: Bubble parser blocking scripts to the caller instead of the TreeSink
This is the same approach used by html5ever, which should make it easier to unify the two crates' APIs as part of the encoding-support effort. This is a breaking change: the TreeSink::complete_script method is removed.
1 parent 31a2c31 commit 6ef6986
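For downstream callers, the practical effect is that feeding the xml5ever tokenizer now returns the shared markup5ever::TokenizerResult instead of notifying the sink through complete_script. A minimal caller-side sketch, assuming the new xml5ever feed signature mirrors html5ever's (the drive function and its loop are illustrative, not part of this commit):

use markup5ever::buffer_queue::BufferQueue;
use markup5ever::TokenizerResult;
use xml5ever::tokenizer::{TokenSink, XmlTokenizer};

// Keep feeding buffered input. Each time the tokenizer reaches a
// parser-blocking script it returns Script(handle) and pauses, so the
// caller can act on the script before resuming; Done means the buffered
// input has been consumed.
fn drive<Sink: TokenSink>(tok: &XmlTokenizer<Sink>, input: &BufferQueue) {
    while let TokenizerResult::Script(_script) = tok.feed(input) {
        // Execute or otherwise handle `_script` here, then loop to resume.
    }
}

The diff below enables exactly this shape by moving TokenizerResult into markup5ever and having TokenSink::process_token return a ProcessResult.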

15 files changed: +166 -161 lines changed

html5ever/src/driver.rs

Lines changed: 2 additions & 2 deletions
@@ -10,10 +10,10 @@
 //! High-level interface to the parser.
 
 use crate::buffer_queue::BufferQueue;
-use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
+use crate::tokenizer::{Tokenizer, TokenizerOpts};
 use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
 use crate::{Attribute, QualName};
-
+use markup5ever::TokenizerResult;
 use std::borrow::Cow;
 
 use crate::tendril;

html5ever/src/tokenizer/mod.rs

Lines changed: 1 addition & 8 deletions
@@ -25,7 +25,7 @@ use crate::util::str::lower_ascii_letter;
 
 use log::{debug, trace};
 use mac::format_if;
-use markup5ever::{namespace_url, ns, small_char_set};
+use markup5ever::{namespace_url, ns, small_char_set, TokenizerResult};
 use std::borrow::Cow::{self, Borrowed};
 use std::cell::{Cell, RefCell, RefMut};
 use std::collections::BTreeMap;
@@ -45,13 +45,6 @@ pub enum ProcessResult<Handle> {
     Script(Handle),
 }
 
-#[must_use]
-#[derive(Debug)]
-pub enum TokenizerResult<Handle> {
-    Done,
-    Script(Handle),
-}
-
 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
     match *opt_str {
         Some(ref mut s) => s.push_char(c),

html5ever/src/tree_builder/mod.rs

Lines changed: 1 addition & 3 deletions
@@ -9,9 +9,7 @@
 
 //! The HTML5 tree builder.
 
-pub use crate::interface::{
-    create_element, ElemName, ElementFlags, NextParserState, Tracer, TreeSink,
-};
+pub use crate::interface::{create_element, ElemName, ElementFlags, Tracer, TreeSink};
 pub use crate::interface::{AppendNode, AppendText, Attribute, NodeOrText};
 pub use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};

markup5ever/interface/mod.rs

Lines changed: 8 additions & 1 deletion
@@ -13,7 +13,7 @@ use std::fmt;
 use tendril::StrTendril;
 
 pub use self::tree_builder::{create_element, AppendNode, AppendText, ElementFlags, NodeOrText};
-pub use self::tree_builder::{ElemName, NextParserState, Tracer, TreeSink};
+pub use self::tree_builder::{ElemName, Tracer, TreeSink};
 pub use self::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
 use super::{LocalName, Namespace, Prefix};
 
@@ -60,6 +60,13 @@ impl fmt::Debug for ExpandedName<'_> {
     }
 }
 
+#[must_use]
+#[derive(Debug)]
+pub enum TokenizerResult<Handle> {
+    Done,
+    Script(Handle),
+}
+
 /// Helper to quickly create an expanded name.
 ///
 /// Can be used with no namespace as `expanded_name!("", "some_name")`

markup5ever/interface/tree_builder.rs

Lines changed: 0 additions & 16 deletions
@@ -43,17 +43,6 @@ pub enum QuirksMode {
     NoQuirks,
 }
 
-/// Whether to interrupt further parsing of the current input until
-/// the next explicit resumption of the tokenizer, or continue without
-/// any interruption.
-#[derive(PartialEq, Eq, Copy, Clone, Hash, Debug)]
-pub enum NextParserState {
-    /// Stop further parsing.
-    Suspend,
-    /// Continue without interruptions.
-    Continue,
-}
-
 /// Special properties of an element, useful for tagging elements with this information.
 #[derive(Default)]
 #[non_exhaustive]
@@ -256,11 +245,6 @@ pub trait TreeSink {
     /// Called whenever the line number changes.
     fn set_current_line(&self, _line_number: u64) {}
 
-    /// Indicate that a `script` element is complete.
-    fn complete_script(&self, _node: &Self::Handle) -> NextParserState {
-        NextParserState::Continue
-    }
-
     fn allow_declarative_shadow_roots(&self, _intended_parent: &Self::Handle) -> bool {
         true
     }

markup5ever/lib.rs

Lines changed: 1 addition & 1 deletion
@@ -45,6 +45,6 @@ mod util {
     pub mod smallcharset;
 }
 
-pub use interface::{Attribute, ExpandedName, QualName};
+pub use interface::{Attribute, ExpandedName, QualName, TokenizerResult};
 pub use util::smallcharset::SmallCharSet;
 pub use util::*;

rcdom/tests/xml-tokenizer.rs

Lines changed: 12 additions & 6 deletions
@@ -14,6 +14,7 @@ use std::env;
 use std::ffi::OsStr;
 use std::io::Read;
 use std::path::Path;
+use xml5ever::tokenizer::ProcessResult;
 
 use util::find_tests::foreach_xml5lib_test;
 use util::runner::{run_all, Test};
@@ -91,7 +92,9 @@ impl TokenLogger {
 }
 
 impl TokenSink for TokenLogger {
-    fn process_token(&self, token: Token) {
+    type Handle = ();
+
+    fn process_token(&self, token: Token) -> ProcessResult<()> {
         match token {
             CharacterTokens(b) => {
                 self.current_str.borrow_mut().push_slice(&b);
@@ -123,7 +126,8 @@ impl TokenSink for TokenLogger {
             EOFToken => (),
 
             _ => self.push(token),
-        }
+        };
+        ProcessResult::Continue
     }
 }
 
@@ -134,9 +138,9 @@ fn tokenize_xml(input: Vec<StrTendril>, opts: XmlTokenizerOpts) -> Vec<Token> {
 
     for chunk in input.into_iter() {
         buf.push_back(chunk);
-        tok.feed(&buf);
+        let _ = tok.feed(&buf);
     }
-    tok.feed(&buf);
+    let _ = tok.feed(&buf);
     tok.end();
     tok.sink.get_tokens()
 }
@@ -274,9 +278,11 @@ fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> {
     for tok in js.as_array().unwrap().iter() {
         match *tok {
            Value::String(ref s) if &s[..] == "ParseError" => {
-                sink.process_token(ParseError(Borrowed("")))
+                let _ = sink.process_token(ParseError(Borrowed("")));
+            },
+            _ => {
+                let _ = sink.process_token(json_to_token(tok));
            },
-            _ => sink.process_token(json_to_token(tok)),
        }
     }
     sink.get_tokens()

xml5ever/benches/xml5ever.rs

Lines changed: 7 additions & 4 deletions
@@ -10,15 +10,18 @@ use criterion::{black_box, Criterion};
 
 use markup5ever::buffer_queue::BufferQueue;
 use xml5ever::tendril::*;
-use xml5ever::tokenizer::{Token, TokenSink, XmlTokenizer};
+use xml5ever::tokenizer::{ProcessResult, Token, TokenSink, XmlTokenizer};
 
 struct Sink;
 
 impl TokenSink for Sink {
-    fn process_token(&self, token: Token) {
+    type Handle = ();
+
+    fn process_token(&self, token: Token) -> ProcessResult<()> {
         // Don't use the token, but make sure we don't get
         // optimized out entirely.
         black_box(token);
+        ProcessResult::Continue
     }
 }
 
@@ -58,9 +61,9 @@ fn run_bench(c: &mut Criterion, name: &str) {
             // necessary since our iterator consumes the underlying buffer.
             for buf in input.clone().into_iter() {
                 buffer.push_back(buf);
-                tok.feed(&buffer);
+                let _ = tok.feed(&buffer);
             }
-            tok.feed(&buffer);
+            let _ = tok.feed(&buffer);
             tok.end();
         })
     });

xml5ever/examples/simple_xml_tokenizer.rs

Lines changed: 7 additions & 4 deletions
@@ -16,15 +16,17 @@ use std::io;
 
 use markup5ever::buffer_queue::BufferQueue;
 use xml5ever::tendril::{ByteTendril, ReadExt};
-use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
+use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, ProcessResult, TagToken};
 use xml5ever::tokenizer::{CommentToken, PIToken, Pi};
 use xml5ever::tokenizer::{Doctype, DoctypeToken, EOFToken};
 use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer};
 
 struct SimpleTokenPrinter;
 
 impl TokenSink for SimpleTokenPrinter {
-    fn process_token(&self, token: Token) {
+    type Handle = ();
+
+    fn process_token(&self, token: Token) -> ProcessResult<()> {
         match token {
             CharacterTokens(b) => {
                 println!("TEXT: {}", &*b);
@@ -55,7 +57,8 @@ impl TokenSink for SimpleTokenPrinter {
             }) => {
                 println!("<!DOCTYPE {name:?} {public_id:?}>");
             },
-        }
+        };
+        ProcessResult::Continue
     }
 }
 
@@ -76,6 +79,6 @@ fn main() {
     input_buffer.push_back(input.try_reinterpret().unwrap());
     // Here we create and run tokenizer
     let tok = XmlTokenizer::new(sink, Default::default());
-    tok.feed(&input_buffer);
+    let _ = tok.feed(&input_buffer);
     tok.end();
 }

xml5ever/examples/xml_tokenizer.rs

Lines changed: 8 additions & 4 deletions
@@ -17,7 +17,7 @@ use std::io;
 
 use markup5ever::buffer_queue::BufferQueue;
 use xml5ever::tendril::{ByteTendril, ReadExt};
-use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
+use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, ProcessResult, TagToken};
 use xml5ever::tokenizer::{EmptyTag, EndTag, ShortTag, StartTag};
 use xml5ever::tokenizer::{PIToken, Pi};
 use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer, XmlTokenizerOpts};
@@ -44,7 +44,9 @@ impl TokenPrinter {
 }
 
 impl TokenSink for TokenPrinter {
-    fn process_token(&self, token: Token) {
+    type Handle = ();
+
+    fn process_token(&self, token: Token) -> ProcessResult<()> {
         match token {
             CharacterTokens(b) => {
                 for c in b.chars() {
@@ -84,7 +86,9 @@ impl TokenSink for TokenPrinter {
                 self.is_char(false);
                 println!("OTHER: {token:?}");
             },
-        }
+        };
+
+        ProcessResult::Continue
     }
 }
 
@@ -105,7 +109,7 @@ fn main() {
             ..Default::default()
         },
     );
-    tok.feed(&input_buffer);
+    let _ = tok.feed(&input_buffer);
     tok.end();
     tok.sink.is_char(false);
 }

xml5ever/src/driver.rs

Lines changed: 2 additions & 1 deletion
@@ -63,7 +63,8 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink> {
 
     fn process(&mut self, t: StrTendril) {
         self.input_buffer.push_back(t);
-        self.tokenizer.feed(&self.input_buffer);
+        // FIXME: Properly support </script> somehow.
+        let _ = self.tokenizer.feed(&self.input_buffer);
     }
 
     // FIXME: Is it too noisy to report every character decoding error?

xml5ever/src/tokenizer/interface.rs

Lines changed: 5 additions & 10 deletions
@@ -10,14 +10,13 @@
 use std::borrow::Cow;
 
 use crate::tendril::StrTendril;
+use crate::tokenizer::ProcessResult;
 use crate::{Attribute, QualName};
 
 pub use self::TagKind::{EmptyTag, EndTag, ShortTag, StartTag};
 pub use self::Token::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
 pub use self::Token::{CommentToken, DoctypeToken, PIToken, TagToken};
 
-use super::states;
-
 /// Tag kind denotes which kind of tag did we encounter.
 #[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
 pub enum TagKind {
@@ -108,16 +107,12 @@ pub enum Token {
 
 /// Types which can receive tokens from the tokenizer.
 pub trait TokenSink {
+    /// Handle to a DOM script element
+    type Handle;
+
     /// Process a token.
-    fn process_token(&self, token: Token);
+    fn process_token(&self, token: Token) -> ProcessResult<Self::Handle>;
 
     /// Signal to the sink that parsing has ended.
     fn end(&self) {}
-
-    /// The tokenizer will call this after emitting any start tag.
-    /// This allows the tree builder to change the tokenizer's state.
-    /// By default no state changes occur.
-    fn query_state_change(&self) -> Option<states::XmlState> {
-        None
-    }
 }
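For reference, a minimal sink under the revised trait might look like the following sketch (DiscardSink is an invented name; the benchmark and example sinks updated above make the same change):

use xml5ever::tokenizer::{ProcessResult, Token, TokenSink};

// Illustrative sink that ignores every token. Implementations now declare
// a Handle type for script elements and return ProcessResult so the
// tokenizer can bubble a parser-blocking script up to its caller.
struct DiscardSink;

impl TokenSink for DiscardSink {
    type Handle = ();

    fn process_token(&self, _token: Token) -> ProcessResult<()> {
        ProcessResult::Continue
    }
}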
