Skip to content

Commit 5aa7688

Browse files
committed
Add a fast path for the data state using SSE2
Signed-off-by: Simon Wülker <[email protected]>
1 parent a1486b0 commit 5aa7688

File tree

2 files changed

+179
-2
lines changed

2 files changed

+179
-2
lines changed

html5ever/src/tokenizer/mod.rs

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
700700
match self.state.get() {
701701
//§ data-state
702702
states::Data => loop {
703-
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
703+
let set = small_char_set!('\r' '\0' '&' '<' '\n');
704+
705+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
706+
let set_result = if !(self.opts.exact_errors
707+
|| self.reconsume.get()
708+
|| self.ignore_lf.get())
709+
&& is_x86_feature_detected!("sse2")
710+
{
711+
let front_buffer = input.peek_front_chunk_mut();
712+
let Some(mut front_buffer) = front_buffer else {
713+
return ProcessResult::Suspend;
714+
};
715+
716+
// Special case: The fast path is not worth taking if the first character is already in the set,
717+
// which is fairly common
718+
let first_char = front_buffer
719+
.chars()
720+
.next()
721+
.expect("Input buffers are never empty");
722+
723+
if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
724+
drop(front_buffer);
725+
self.pop_except_from(input, set)
726+
} else {
727+
// SAFETY:
728+
// This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
729+
let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) };
730+
731+
if front_buffer.is_empty() {
732+
drop(front_buffer);
733+
input.pop_front();
734+
}
735+
736+
result
737+
}
738+
} else {
739+
self.pop_except_from(input, set)
740+
};
741+
742+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
743+
let set_result = self.pop_except_from(input, set);
744+
745+
let Some(set_result) = set_result else {
746+
return ProcessResult::Suspend;
747+
};
748+
match set_result {
704749
FromSet('\0') => {
705750
self.bad_char_error();
706751
go!(self: emit '\0')
@@ -1752,6 +1797,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
17521797
states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
17531798
}
17541799
}
1800+
1801+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1802+
#[target_feature(enable = "sse2")]
1803+
/// Implements the [data state] with SIMD instructions.
1804+
///
1805+
/// The algorithm implemented is the naive SIMD approach described [here].
1806+
///
1807+
/// ### SAFETY:
1808+
/// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1809+
///
1810+
/// [data state]: https://html.spec.whatwg.org/#data-state
1811+
/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1812+
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1813+
#[cfg(target_arch = "x86")]
1814+
use std::arch::x86::{
1815+
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1816+
_mm_set1_epi8,
1817+
};
1818+
#[cfg(target_arch = "x86_64")]
1819+
use std::arch::x86_64::{
1820+
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1821+
_mm_set1_epi8,
1822+
};
1823+
1824+
debug_assert!(!input.is_empty());
1825+
1826+
let quote_mask = _mm_set1_epi8('<' as i8);
1827+
let escape_mask = _mm_set1_epi8('&' as i8);
1828+
let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1829+
let zero_mask = _mm_set1_epi8('\0' as i8);
1830+
let newline_mask = _mm_set1_epi8('\n' as i8);
1831+
1832+
let raw_bytes: &[u8] = input.as_bytes();
1833+
let start = raw_bytes.as_ptr();
1834+
1835+
const STRIDE: usize = 16;
1836+
let mut i = 0;
1837+
let mut n_newlines = 0;
1838+
while i + STRIDE <= raw_bytes.len() {
1839+
// Load a 16 byte chunk from the input
1840+
let data = _mm_loadu_si128(start.add(i) as *const __m128i);
1841+
1842+
// Compare the chunk against each mask
1843+
let quotes = _mm_cmpeq_epi8(data, quote_mask);
1844+
let escapes = _mm_cmpeq_epi8(data, escape_mask);
1845+
let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
1846+
let zeros = _mm_cmpeq_epi8(data, zero_mask);
1847+
let newlines = _mm_cmpeq_epi8(data, newline_mask);
1848+
1849+
// Combine all test results and create a bitmask from them.
1850+
// Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1851+
let test_result = _mm_or_si128(
1852+
_mm_or_si128(quotes, zeros),
1853+
_mm_or_si128(escapes, carriage_returns),
1854+
);
1855+
let bitmask = _mm_movemask_epi8(test_result);
1856+
let newline_mask = _mm_movemask_epi8(newlines);
1857+
1858+
if (bitmask != 0) {
1859+
// We have reached one of the characters that cause the state machine to transition
1860+
let position = if cfg!(target_endian = "little") {
1861+
bitmask.trailing_zeros() as usize
1862+
} else {
1863+
bitmask.leading_zeros() as usize
1864+
};
1865+
1866+
n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
1867+
i += position;
1868+
break;
1869+
} else {
1870+
n_newlines += newline_mask.count_ones() as u64;
1871+
}
1872+
1873+
i += STRIDE;
1874+
}
1875+
1876+
// Process any remaining bytes (less than STRIDE)
1877+
while let Some(c) = raw_bytes.get(i) {
1878+
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1879+
break;
1880+
}
1881+
if *c == b'\n' {
1882+
n_newlines += 1;
1883+
}
1884+
1885+
i += 1;
1886+
}
1887+
1888+
let set_result = if i == 0 {
1889+
let c = input.pop_front_char().unwrap();
1890+
debug_assert!(matches!(c, '<' | '&' | '\r' | '\0'));
1891+
1892+
// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1893+
// Still, it would be nice to not have to do that.
1894+
// The same is true for the unwrap call.
1895+
let preprocessed_char = self
1896+
.get_preprocessed_char(c, &BufferQueue::default())
1897+
.unwrap();
1898+
SetResult::FromSet(preprocessed_char)
1899+
} else {
1900+
debug_assert!(
1901+
input.len() >= i,
1902+
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1903+
i,
1904+
input.len()
1905+
);
1906+
let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1907+
input.unsafe_pop_front(i as u32);
1908+
SetResult::NotFromSet(consumed_chunk)
1909+
};
1910+
1911+
self.current_line.set(self.current_line.get() + n_newlines);
1912+
1913+
Some(set_result)
1914+
}
17551915
}
17561916

17571917
#[cfg(test)]

markup5ever/util/buffer_queue.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
//!
1919
//! [`BufferQueue`]: struct.BufferQueue.html
2020
21-
use std::{cell::RefCell, collections::VecDeque, mem};
21+
use std::{
22+
cell::{RefCell, RefMut},
23+
collections::VecDeque,
24+
mem,
25+
};
2226

2327
use tendril::StrTendril;
2428

@@ -246,6 +250,19 @@ impl BufferQueue {
246250
&mut *other.buffers.borrow_mut(),
247251
);
248252
}
253+
254+
/// Return a mutable reference to the first tendril in the queue.
255+
pub fn peek_front_chunk_mut(&self) -> Option<RefMut<StrTendril>> {
256+
let buffers = self.buffers.borrow_mut();
257+
if buffers.is_empty() {
258+
return None;
259+
}
260+
261+
let front_buffer = RefMut::map(buffers, |buffers| {
262+
buffers.front_mut().expect("there is at least one buffer")
263+
});
264+
Some(front_buffer)
265+
}
249266
}
250267

251268
#[cfg(test)]

0 commit comments

Comments
 (0)