Skip to content

Commit 61e4c45

Browse files
committed
Add a fast path for the data state using SSE2
Signed-off-by: Simon Wülker <[email protected]>
1 parent c24853e commit 61e4c45

File tree

2 files changed

+179
-2
lines changed

2 files changed

+179
-2
lines changed

html5ever/src/tokenizer/mod.rs

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
704704
match self.state.get() {
705705
//§ data-state
706706
states::Data => loop {
707-
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
707+
let set = small_char_set!('\r' '\0' '&' '<' '\n');
708+
709+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
710+
let set_result = if !(self.opts.exact_errors
711+
|| self.reconsume.get()
712+
|| self.ignore_lf.get())
713+
&& is_x86_feature_detected!("sse2")
714+
{
715+
let front_buffer = input.peek_front_chunk_mut();
716+
let Some(mut front_buffer) = front_buffer else {
717+
return ProcessResult::Suspend;
718+
};
719+
720+
// Special case: The fast path is not worth taking if the first character is already in the set,
721+
// which is fairly common
722+
let first_char = front_buffer
723+
.chars()
724+
.next()
725+
.expect("Input buffers are never empty");
726+
727+
if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
728+
drop(front_buffer);
729+
self.pop_except_from(input, set)
730+
} else {
731+
// SAFETY:
732+
// This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
733+
let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) };
734+
735+
if front_buffer.is_empty() {
736+
drop(front_buffer);
737+
input.pop_front();
738+
}
739+
740+
result
741+
}
742+
} else {
743+
self.pop_except_from(input, set)
744+
};
745+
746+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
747+
let set_result = self.pop_except_from(input, set);
748+
749+
let Some(set_result) = set_result else {
750+
return ProcessResult::Suspend;
751+
};
752+
match set_result {
708753
FromSet('\0') => {
709754
self.bad_char_error();
710755
self.emit_char('\0');
@@ -1839,6 +1884,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
18391884
states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
18401885
}
18411886
}
1887+
1888+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1889+
#[target_feature(enable = "sse2")]
1890+
/// Implements the [data state] with SIMD instructions.
1891+
///
1892+
/// The algorithm implemented is the naive SIMD approach described [here].
1893+
///
1894+
/// ### SAFETY:
1895+
/// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1896+
///
1897+
/// [data state]: https://html.spec.whatwg.org/#data-state
1898+
/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1899+
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1900+
#[cfg(target_arch = "x86")]
1901+
use std::arch::x86::{
1902+
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1903+
_mm_set1_epi8,
1904+
};
1905+
#[cfg(target_arch = "x86_64")]
1906+
use std::arch::x86_64::{
1907+
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1908+
_mm_set1_epi8,
1909+
};
1910+
1911+
debug_assert!(!input.is_empty());
1912+
1913+
let quote_mask = _mm_set1_epi8('<' as i8);
1914+
let escape_mask = _mm_set1_epi8('&' as i8);
1915+
let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1916+
let zero_mask = _mm_set1_epi8('\0' as i8);
1917+
let newline_mask = _mm_set1_epi8('\n' as i8);
1918+
1919+
let raw_bytes: &[u8] = input.as_bytes();
1920+
let start = raw_bytes.as_ptr();
1921+
1922+
const STRIDE: usize = 16;
1923+
let mut i = 0;
1924+
let mut n_newlines = 0;
1925+
while i + STRIDE <= raw_bytes.len() {
1926+
// Load a 16 byte chunk from the input
1927+
let data = _mm_loadu_si128(start.add(i) as *const __m128i);
1928+
1929+
// Compare the chunk against each mask
1930+
let quotes = _mm_cmpeq_epi8(data, quote_mask);
1931+
let escapes = _mm_cmpeq_epi8(data, escape_mask);
1932+
let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
1933+
let zeros = _mm_cmpeq_epi8(data, zero_mask);
1934+
let newlines = _mm_cmpeq_epi8(data, newline_mask);
1935+
1936+
// Combine all test results and create a bitmask from them.
1937+
// Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1938+
let test_result = _mm_or_si128(
1939+
_mm_or_si128(quotes, zeros),
1940+
_mm_or_si128(escapes, carriage_returns),
1941+
);
1942+
let bitmask = _mm_movemask_epi8(test_result);
1943+
let newline_mask = _mm_movemask_epi8(newlines);
1944+
1945+
if (bitmask != 0) {
1946+
// We have reached one of the characters that cause the state machine to transition
1947+
let position = if cfg!(target_endian = "little") {
1948+
bitmask.trailing_zeros() as usize
1949+
} else {
1950+
bitmask.leading_zeros() as usize
1951+
};
1952+
1953+
n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
1954+
i += position;
1955+
break;
1956+
} else {
1957+
n_newlines += newline_mask.count_ones() as u64;
1958+
}
1959+
1960+
i += STRIDE;
1961+
}
1962+
1963+
// Process any remaining bytes (less than STRIDE)
1964+
while let Some(c) = raw_bytes.get(i) {
1965+
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1966+
break;
1967+
}
1968+
if *c == b'\n' {
1969+
n_newlines += 1;
1970+
}
1971+
1972+
i += 1;
1973+
}
1974+
1975+
let set_result = if i == 0 {
1976+
let first_char = input.pop_front_char().unwrap();
1977+
debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1978+
1979+
// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1980+
// Still, it would be nice to not have to do that.
1981+
// The same is true for the unwrap call.
1982+
let preprocessed_char = self
1983+
.get_preprocessed_char(first_char, &BufferQueue::default())
1984+
.unwrap();
1985+
SetResult::FromSet(preprocessed_char)
1986+
} else {
1987+
debug_assert!(
1988+
input.len() >= i,
1989+
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1990+
i,
1991+
input.len()
1992+
);
1993+
let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1994+
input.unsafe_pop_front(i as u32);
1995+
SetResult::NotFromSet(consumed_chunk)
1996+
};
1997+
1998+
self.current_line.set(self.current_line.get() + n_newlines);
1999+
2000+
Some(set_result)
2001+
}
18422002
}
18432003

18442004
#[cfg(test)]

markup5ever/util/buffer_queue.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
//!
1919
//! [`BufferQueue`]: struct.BufferQueue.html
2020
21-
use std::{cell::RefCell, collections::VecDeque, mem};
21+
use std::{
22+
cell::{RefCell, RefMut},
23+
collections::VecDeque,
24+
mem,
25+
};
2226

2327
use tendril::StrTendril;
2428

@@ -246,6 +250,19 @@ impl BufferQueue {
246250
&mut *other.buffers.borrow_mut(),
247251
);
248252
}
253+
254+
/// Return a mutable reference to the first tendril in the queue.
255+
pub fn peek_front_chunk_mut(&self) -> Option<RefMut<StrTendril>> {
256+
let buffers = self.buffers.borrow_mut();
257+
if buffers.is_empty() {
258+
return None;
259+
}
260+
261+
let front_buffer = RefMut::map(buffers, |buffers| {
262+
buffers.front_mut().expect("there is at least one buffer")
263+
});
264+
Some(front_buffer)
265+
}
249266
}
250267

251268
#[cfg(test)]

0 commit comments

Comments
 (0)