Skip to content

Add a fast path for the data state using SSE2 instructions #601

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 161 additions & 1 deletion html5ever/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -704,7 +704,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
match self.state.get() {
//§ data-state
states::Data => loop {
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
let set = small_char_set!('\r' '\0' '&' '<' '\n');

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
let set_result = if !(self.opts.exact_errors
|| self.reconsume.get()
|| self.ignore_lf.get())
&& is_x86_feature_detected!("sse2")
{
let front_buffer = input.peek_front_chunk_mut();
let Some(mut front_buffer) = front_buffer else {
return ProcessResult::Suspend;
};

// Special case: The fast path is not worth taking if the first character is already in the set,
// which is fairly common
let first_char = front_buffer
.chars()
.next()
.expect("Input buffers are never empty");

if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') {
drop(front_buffer);
self.pop_except_from(input, set)
} else {
// SAFETY:
// This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) };

if front_buffer.is_empty() {
drop(front_buffer);
input.pop_front();
}

result
}
} else {
self.pop_except_from(input, set)
};

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
let set_result = self.pop_except_from(input, set);

let Some(set_result) = set_result else {
return ProcessResult::Suspend;
};
match set_result {
FromSet('\0') => {
self.bad_char_error();
self.emit_char('\0');
Expand Down Expand Up @@ -1839,6 +1884,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
}
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
/// Implements the [data state] with SIMD instructions.
///
/// The algorithm implemented is the naive SIMD approach described [here].
///
/// ### SAFETY:
/// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
///
/// [data state]: https://html.spec.whatwg.org/#data-state
/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

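SSE2 operates on 16-byte registers, so the SIMD code can only process the input in full 16-byte chunks. Instead of scanning the whole input character by character, we split the loop into a SIMD main loop and a scalar tail loop like this: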

// This is pseudocode
while remaining_input.len() >= SIMD_CHUNK_SIZE {
    // use SIMD algorithm
}

while !remaining_input.is_empty() {
     // use scalar algorithm
}

#[cfg(target_arch = "x86")]
use std::arch::x86::{
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
_mm_set1_epi8,
};
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
_mm_set1_epi8,
};

debug_assert!(!input.is_empty());

let quote_mask = _mm_set1_epi8('<' as i8);
let escape_mask = _mm_set1_epi8('&' as i8);
let carriage_return_mask = _mm_set1_epi8('\r' as i8);
let zero_mask = _mm_set1_epi8('\0' as i8);
let newline_mask = _mm_set1_epi8('\n' as i8);
Comment on lines +1913 to +1917
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_mm_set1_epi8 creates a SIMD vector where each element has the same value.

For example _mm_set1_epi8('<' as i8) returns a __m128i, a 128 bit vector consisting of 16 8-bit integers. Each of those integers has the value 60.


let raw_bytes: &[u8] = input.as_bytes();
let start = raw_bytes.as_ptr();

const STRIDE: usize = 16;
let mut i = 0;
let mut n_newlines = 0;
while i + STRIDE <= raw_bytes.len() {
// Load a 16 byte chunk from the input
let data = _mm_loadu_si128(start.add(i) as *const __m128i);
Comment on lines +1926 to +1927
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_mm_loadu_si128 takes a pointer and reads 128 bits at the given address into a SIMD register.


// Compare the chunk against each mask
let quotes = _mm_cmpeq_epi8(data, quote_mask);
let escapes = _mm_cmpeq_epi8(data, escape_mask);
let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask);
let zeros = _mm_cmpeq_epi8(data, zero_mask);
let newlines = _mm_cmpeq_epi8(data, newline_mask);
Comment on lines +1930 to +1934
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_mm_cmpeq_epi8 takes two SIMD vectors and compares them element-wise. Each byte in the result is 0xFF (all bits set) if the two operands match and 0x00 otherwise.

Therefore, quotes, escapes etc now contain the test results for the 16-byte input chunk that we just loaded.


// Combine all test results and create a bitmask from them.
// Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
let test_result = _mm_or_si128(
_mm_or_si128(quotes, zeros),
_mm_or_si128(escapes, carriage_returns),
);
Comment on lines +1938 to +1941
Copy link
Contributor Author

@simonwuelker simonwuelker May 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_mm_or_si128 does exactly what it sounds like: It computes the elementwise OR of two SIMD vectors.

Therefore, test_result now contains 16 8-bit integers that are each either 0x00 or 0xFF - 0x00 if the character at that position did not match any element in the set and 0xFF otherwise.

let bitmask = _mm_movemask_epi8(test_result);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_mm_movemask_epi8 throws away all the bits we don't need anymore. It creates an integer whose low 16 bits are the most significant bits of the 16 SIMD entries; bit 0 corresponds to the first byte of the chunk.

For example, the SIMD vector [0xFF, 0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] becomes 0b0000000000000101.

let newline_mask = _mm_movemask_epi8(newlines);

if (bitmask != 0) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If at least one bit in the mask is set, then one of the 16 input characters was in the set that we were looking for.

// We have reached one of the characters that cause the state machine to transition
let position = if cfg!(target_endian = "little") {
bitmask.trailing_zeros() as usize
} else {
bitmask.leading_zeros() as usize
};
Comment on lines +1947 to +1951
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To find the exact position of the character we can just count the trailing zeros of the bitmask (bit 0 corresponds to the first byte of the chunk, so this is the number of leading characters that were not in the set) and add them to the offset of the 16-byte chunk that we loaded.


n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64;
i += position;
break;
} else {
n_newlines += newline_mask.count_ones() as u64;
Comment on lines +1953 to +1957
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additionally, html5ever emits the line number for each token.
In SIMD this is implemented as follows:

  1. Compute the bitmask like before, but only for \n characters
  2. If the 16 byte chunk contains a character in the set:
    2.1 Keep only the newline-bitmask bits that come before the first character in the set (masking out the rest), and let the number of newlines be the number of set bits that remain
  3. Otherwise:
    3.1 Let the number of newlines be the set bits in the newline-bitmask.

Unfortunately, this makes the algorithm significantly slower than it could be.

}

i += STRIDE;
}

// Process any remaining bytes (less than STRIDE)
while let Some(c) = raw_bytes.get(i) {
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
break;
}
if *c == b'\n' {
n_newlines += 1;
}

i += 1;
}
Comment on lines +1963 to +1973
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block takes care of any input chunks that are too small for SIMD.


let set_result = if i == 0 {
let first_char = input.pop_front_char().unwrap();
debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));

// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
// Still, it would be nice to not have to do that.
// The same is true for the unwrap call.
let preprocessed_char = self
.get_preprocessed_char(first_char, &BufferQueue::default())
.unwrap();
SetResult::FromSet(preprocessed_char)
} else {
debug_assert!(
input.len() >= i,
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
i,
input.len()
);
let consumed_chunk = input.unsafe_subtendril(0, i as u32);
input.unsafe_pop_front(i as u32);
SetResult::NotFromSet(consumed_chunk)
};

self.current_line.set(self.current_line.get() + n_newlines);

Some(set_result)
}
}

#[cfg(test)]
Expand Down
19 changes: 18 additions & 1 deletion markup5ever/util/buffer_queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
//!
//! [`BufferQueue`]: struct.BufferQueue.html
use std::{cell::RefCell, collections::VecDeque, mem};
use std::{
cell::{RefCell, RefMut},
collections::VecDeque,
mem,
};

use tendril::StrTendril;

Expand Down Expand Up @@ -246,6 +250,19 @@ impl BufferQueue {
&mut *other.buffers.borrow_mut(),
);
}

/// Mutably borrow the tendril at the front of the queue.
///
/// Returns `None` when the queue contains no buffers. The returned guard keeps
/// `self.buffers` mutably borrowed until it is dropped.
///
/// # Panics
///
/// Panics if `self.buffers` is already borrowed.
pub fn peek_front_chunk_mut(&self) -> Option<RefMut<StrTendril>> {
    // On an empty queue `filter_map` hands the original borrow back as `Err`;
    // `.ok()` drops it so that `None` is returned without holding the borrow.
    RefMut::filter_map(self.buffers.borrow_mut(), |queue| queue.front_mut()).ok()
}
}

#[cfg(test)]
Expand Down