Skip to content

Commit 8d6c79d

Browse files
committed
Add a fast path for the data state using SSE2
Signed-off-by: Simon Wülker <[email protected]>
1 parent a1486b0 commit 8d6c79d

File tree

2 files changed

+180
-2
lines changed

2 files changed

+180
-2
lines changed

html5ever/src/tokenizer/mod.rs

Lines changed: 162 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,53 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
700700
match self.state.get() {
701701
//§ data-state
702702
states::Data => loop {
703-
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
703+
let set = small_char_set!('\r' '\0' '&' '<' '\n');
704+
705+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
706+
let set_result = if !(self.opts.exact_errors
707+
|| self.reconsume.get()
708+
|| self.ignore_lf.get())
709+
&& is_x86_feature_detected!("sse2")
710+
{
711+
let front_buffer = input.peek_front_chunk_mut();
712+
let Some(mut front_buffer) = front_buffer else {
713+
return ProcessResult::Suspend;
714+
};
715+
716+
// Special case: The fast path is not worth taking if the first character is already in the set,
717+
// which is fairly common
718+
let first_char = front_buffer
719+
.chars()
720+
.next()
721+
.expect("Input buffers are never empty");
722+
let result = if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
723+
drop(front_buffer);
724+
self.pop_except_from(input, set)
725+
} else {
726+
// SAFETY:
727+
// This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
728+
let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) };
729+
730+
if front_buffer.is_empty() {
731+
drop(front_buffer);
732+
input.pop_front();
733+
}
734+
735+
result
736+
};
737+
738+
result
739+
} else {
740+
self.pop_except_from(input, set)
741+
};
742+
743+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
744+
let set_result = self.pop_except_from(input, set);
745+
746+
let Some(set_result) = set_result else {
747+
return ProcessResult::Suspend;
748+
};
749+
match set_result {
704750
FromSet('\0') => {
705751
self.bad_char_error();
706752
go!(self: emit '\0')
@@ -1752,6 +1798,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
17521798
states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
17531799
}
17541800
}
1801+
1802+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1803+
#[target_feature(enable = "sse2")]
1804+
/// Implements the [data state] with SIMD instructions.
1805+
///
1806+
/// The algorithm implemented is the naive SIMD approach described [here].
1807+
///
1808+
/// ### SAFETY:
1809+
/// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1810+
///
1811+
/// [data state]: https://html.spec.whatwg.org/#data-state
1812+
/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1813+
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1814+
#[cfg(target_arch = "x86")]
1815+
use std::arch::x86::{
1816+
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1817+
_mm_set1_epi8,
1818+
};
1819+
#[cfg(target_arch = "x86_64")]
1820+
use std::arch::x86_64::{
1821+
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1822+
_mm_set1_epi8,
1823+
};
1824+
1825+
debug_assert!(!input.is_empty());
1826+
1827+
let quote_mask = _mm_set1_epi8('<' as i8);
1828+
let escape_mask = _mm_set1_epi8('&' as i8);
1829+
let carriage_return_mask = _mm_set1_epi8('\r' as i8);
1830+
let zero_mask = _mm_set1_epi8('\0' as i8);
1831+
let newline_mask = _mm_set1_epi8('\n' as i8);
1832+
1833+
let raw_bytes: &[u8] = &input.as_bytes();
1834+
let start = raw_bytes.as_ptr();
1835+
1836+
const STRIDE: usize = 16;
1837+
let mut i = 0;
1838+
let mut n_newlines = 0;
1839+
while i + STRIDE <= raw_bytes.len() {
1840+
// Load a 16 byte chunk from the input
1841+
let data = _mm_loadu_si128(start.offset(i as isize) as *const __m128i);
1842+
1843+
// Compare the chunk against each mask
1844+
let quotes = _mm_cmpeq_epi8(data, quote_mask);
1845+
let escapes = _mm_cmpeq_epi8(data, escape_mask);
1846+
let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
1847+
let zeros = _mm_cmpeq_epi8(data, zero_mask);
1848+
let newlines = _mm_cmpeq_epi8(data, newline_mask);
1849+
1850+
// Combine all test results and create a bitmask from them.
1851+
// Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1852+
let test_result = _mm_or_si128(
1853+
_mm_or_si128(quotes, zeros),
1854+
_mm_or_si128(escapes, carriage_returns),
1855+
);
1856+
let bitmask = _mm_movemask_epi8(test_result);
1857+
let newline_mask = _mm_movemask_epi8(newlines);
1858+
1859+
if (bitmask != 0) {
1860+
// We have reached one of the characters that cause the state machine to transition
1861+
let position = if cfg!(target_endian = "little") {
1862+
bitmask.trailing_zeros() as usize
1863+
} else {
1864+
bitmask.leading_zeros() as usize
1865+
};
1866+
1867+
n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
1868+
i += position;
1869+
break;
1870+
} else {
1871+
n_newlines += newline_mask.count_ones() as u64;
1872+
}
1873+
1874+
i += STRIDE;
1875+
}
1876+
1877+
// Process any remaining bytes (less than STRIDE)
1878+
while let Some(c) = raw_bytes.get(i) {
1879+
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1880+
break;
1881+
}
1882+
if *c == b'\n' {
1883+
n_newlines += 1;
1884+
}
1885+
1886+
i += 1;
1887+
}
1888+
1889+
let set_result = if i == 0 {
1890+
let c = input.pop_front_char().unwrap();
1891+
debug_assert!(matches!(c, '<' | '&' | '\r' | '\0'));
1892+
1893+
// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1894+
// Still, it would be nice to not have to do that.
1895+
// The same is true for the unwrap call.
1896+
let preprocessed_char = self
1897+
.get_preprocessed_char(c, &BufferQueue::default())
1898+
.unwrap();
1899+
SetResult::FromSet(preprocessed_char)
1900+
} else {
1901+
debug_assert!(
1902+
input.len() >= i,
1903+
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1904+
i,
1905+
input.len()
1906+
);
1907+
let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1908+
input.unsafe_pop_front(i as u32);
1909+
SetResult::NotFromSet(consumed_chunk)
1910+
};
1911+
1912+
self.current_line.set(self.current_line.get() + n_newlines);
1913+
1914+
Some(set_result)
1915+
}
17551916
}
17561917

17571918
#[cfg(test)]

markup5ever/util/buffer_queue.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
//!
1919
//! [`BufferQueue`]: struct.BufferQueue.html
2020
21-
use std::{cell::RefCell, collections::VecDeque, mem};
21+
use std::{
22+
cell::{RefCell, RefMut},
23+
collections::VecDeque,
24+
mem,
25+
};
2226

2327
use tendril::StrTendril;
2428

@@ -246,6 +250,19 @@ impl BufferQueue {
246250
&mut *other.buffers.borrow_mut(),
247251
);
248252
}
253+
254+
/// Return a mutable reference to the first tendril in the queue.
255+
pub fn peek_front_chunk_mut(&self) -> Option<RefMut<StrTendril>> {
256+
let buffers = self.buffers.borrow_mut();
257+
if buffers.is_empty() {
258+
return None;
259+
}
260+
261+
let front_buffer = RefMut::map(buffers, |buffers| {
262+
buffers.front_mut().expect("there is at least one buffer")
263+
});
264+
Some(front_buffer)
265+
}
249266
}
250267

251268
#[cfg(test)]

0 commit comments

Comments
 (0)