diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
index 80b0f6d1..7a3da9a8 100644
--- a/html5ever/src/tokenizer/mod.rs
+++ b/html5ever/src/tokenizer/mod.rs
@@ -704,7 +704,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
match self.state.get() {
//§ data-state
states::Data => loop {
- match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
+ let set = small_char_set!('\r' '\0' '&' '<' '\n');
+
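+ // Only attempt the SIMD fast path when exact error tracking, reconsumption and
+ // LF-skipping are all inactive and the CPU supports SSE2 at runtime.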
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ let set_result = if !(self.opts.exact_errors
+ || self.reconsume.get()
+ || self.ignore_lf.get())
+ && is_x86_feature_detected!("sse2")
+ {
+ let front_buffer = input.peek_front_chunk_mut();
+ let Some(mut front_buffer) = front_buffer else {
+ return ProcessResult::Suspend;
+ };
+
+ // Special case: The fast path is not worth taking if the first character is already in the set,
+ // which is fairly common
+ let first_char = front_buffer
+ .chars()
+ .next()
+ .expect("Input buffers are never empty");
+
+ if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
+ drop(front_buffer);
+ self.pop_except_from(input, set)
+ } else {
+ // SAFETY:
+ // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
+ let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) };
+
+ if front_buffer.is_empty() {
+ drop(front_buffer);
+ input.pop_front();
+ }
+
+ result
+ }
+ } else {
+ self.pop_except_from(input, set)
+ };
+
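+ // Targets other than x86/x86_64 always use the scalar path.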
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ let set_result = self.pop_except_from(input, set);
+
+ let Some(set_result) = set_result else {
+ return ProcessResult::Suspend;
+ };
+ match set_result {
FromSet('\0') => {
self.bad_char_error();
self.emit_char('\0');
@@ -1839,6 +1884,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
}
}
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[target_feature(enable = "sse2")]
+ /// Implements the [data state] with SIMD instructions.
+ ///
+ /// The algorithm implemented is the naive SIMD approach described [here].
+ ///
+ /// # Safety
+ /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
+ ///
+ /// [data state]: https://html.spec.whatwg.org/#data-state
+ /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
+ unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
+ #[cfg(target_arch = "x86")]
+ use std::arch::x86::{
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
+ _mm_set1_epi8,
+ };
+ #[cfg(target_arch = "x86_64")]
+ use std::arch::x86_64::{
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
+ _mm_set1_epi8,
+ };
+
+ debug_assert!(!input.is_empty());
+
+ let tag_open_mask = _mm_set1_epi8('<' as i8);
+ let escape_mask = _mm_set1_epi8('&' as i8);
+ let carriage_return_mask = _mm_set1_epi8('\r' as i8);
+ let zero_mask = _mm_set1_epi8('\0' as i8);
+ let newline_mask = _mm_set1_epi8('\n' as i8);
+
+ let raw_bytes: &[u8] = input.as_bytes();
+ let start = raw_bytes.as_ptr();
+
+ const STRIDE: usize = 16;
+ let mut i = 0;
+ let mut n_newlines = 0;
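+ // Scan the input 16 bytes at a time, counting newlines and stopping at the first character from the set.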
+ while i + STRIDE <= raw_bytes.len() {
+ // Load a 16 byte chunk from the input
+ let data = _mm_loadu_si128(start.add(i) as *const __m128i);
+
+ // Compare the chunk against each mask
+ let tag_opens = _mm_cmpeq_epi8(data, tag_open_mask);
+ let escapes = _mm_cmpeq_epi8(data, escape_mask);
+ let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
+ let zeros = _mm_cmpeq_epi8(data, zero_mask);
+ let newlines = _mm_cmpeq_epi8(data, newline_mask);
+
+ // Combine all test results and create a bitmask from them.
+ // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
+ let test_result = _mm_or_si128(
+ _mm_or_si128(tag_opens, zeros),
+ _mm_or_si128(escapes, carriage_returns),
+ );
+ let bitmask = _mm_movemask_epi8(test_result);
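+ // The newline matches are kept in a separate bitmask so the line counter can be updated in bulk afterwards.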
+ let newline_bitmask = _mm_movemask_epi8(newlines);
+
+ if bitmask != 0 {
+ // We have reached one of the characters that cause the state machine to transition
+ let position = if cfg!(target_endian = "little") {
+ bitmask.trailing_zeros() as usize
+ } else {
+ bitmask.leading_zeros() as usize
+ };
+
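+ // Count only the newlines that occur before the character we stop at.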
+ n_newlines += (newline_bitmask & ((1 << position) - 1)).count_ones() as u64;
+ i += position;
+ break;
+ } else {
+ n_newlines += newline_bitmask.count_ones() as u64;
+ }
+
+ i += STRIDE;
+ }
+
+ // Process any remaining bytes (less than STRIDE)
+ while let Some(c) = raw_bytes.get(i) {
+ if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
+ break;
+ }
+ if *c == b'\n' {
+ n_newlines += 1;
+ }
+
+ i += 1;
+ }
+
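+ // If no progress was made then the very first character is in the set; emit it as FromSet.
+ // Otherwise, split off everything that was scanned past as a NotFromSet chunk.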
+ let set_result = if i == 0 {
+ let first_char = input.pop_front_char().unwrap();
+ debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
+
+ // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
+ // Still, it would be nice to not have to do that.
+ // The same is true for the unwrap call.
+ let preprocessed_char = self
+ .get_preprocessed_char(first_char, &BufferQueue::default())
+ .unwrap();
+ SetResult::FromSet(preprocessed_char)
+ } else {
+ debug_assert!(
+ input.len() >= i,
+ "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
+ i,
+ input.len()
+ );
+ let consumed_chunk = input.unsafe_subtendril(0, i as u32);
+ input.unsafe_pop_front(i as u32);
+ SetResult::NotFromSet(consumed_chunk)
+ };
+
+ self.current_line.set(self.current_line.get() + n_newlines);
+
+ Some(set_result)
+ }
}
#[cfg(test)]
diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs
index 95a571e2..dca37f43 100644
--- a/markup5ever/util/buffer_queue.rs
+++ b/markup5ever/util/buffer_queue.rs
@@ -18,7 +18,11 @@
//!
//! [`BufferQueue`]: struct.BufferQueue.html
-use std::{cell::RefCell, collections::VecDeque, mem};
+use std::{
+ cell::{RefCell, RefMut},
+ collections::VecDeque,
+ mem,
+};
use tendril::StrTendril;
@@ -246,6 +250,19 @@ impl BufferQueue {
&mut *other.buffers.borrow_mut(),
);
}
+
+ /// Return a mutable reference to the first tendril in the queue.
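+ ///
+ /// Returns `None` if the queue is empty.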
+ pub fn peek_front_chunk_mut(&self) -> Option<RefMut<'_, StrTendril>> {
+ let buffers = self.buffers.borrow_mut();
+ if buffers.is_empty() {
+ return None;
+ }
+
+ let front_buffer = RefMut::map(buffers, |buffers| {
+ buffers.front_mut().expect("there is at least one buffer")
+ });
+ Some(front_buffer)
+ }
}
#[cfg(test)]