-
Notifications
You must be signed in to change notification settings - Fork 235
Add a fast path for the data state using SSE2 instructions #601
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -704,7 +704,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> { | |
match self.state.get() { | ||
//§ data-state | ||
states::Data => loop { | ||
match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { | ||
let set = small_char_set!('\r' '\0' '&' '<' '\n'); | ||
|
||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] | ||
let set_result = if !(self.opts.exact_errors | ||
|| self.reconsume.get() | ||
|| self.ignore_lf.get()) | ||
&& is_x86_feature_detected!("sse2") | ||
{ | ||
let front_buffer = input.peek_front_chunk_mut(); | ||
let Some(mut front_buffer) = front_buffer else { | ||
return ProcessResult::Suspend; | ||
}; | ||
|
||
// Special case: The fast path is not worth taking if the first character is already in the set, | ||
// which is fairly common | ||
let first_char = front_buffer | ||
.chars() | ||
.next() | ||
.expect("Input buffers are never empty"); | ||
|
||
if matches!(first_char, '\r' | '\0' | '&' | '<' | '\n') { | ||
drop(front_buffer); | ||
self.pop_except_from(input, set) | ||
} else { | ||
// SAFETY: | ||
// This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above | ||
let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) }; | ||
|
||
if front_buffer.is_empty() { | ||
drop(front_buffer); | ||
input.pop_front(); | ||
} | ||
|
||
result | ||
} | ||
} else { | ||
self.pop_except_from(input, set) | ||
}; | ||
|
||
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] | ||
let set_result = self.pop_except_from(input, set); | ||
|
||
let Some(set_result) = set_result else { | ||
return ProcessResult::Suspend; | ||
}; | ||
match set_result { | ||
FromSet('\0') => { | ||
self.bad_char_error(); | ||
self.emit_char('\0'); | ||
|
@@ -1839,6 +1884,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> { | |
states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection), | ||
} | ||
} | ||
|
||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] | ||
#[target_feature(enable = "sse2")] | ||
/// Implements the [data state] with SIMD instructions. | ||
/// | ||
/// The algorithm implemented is the naive SIMD approach described [here]. | ||
/// | ||
/// ### SAFETY: | ||
/// Calling this function on a CPU that does not support SSE2 causes undefined behaviour. | ||
/// | ||
/// [data state]: https://html.spec.whatwg.org/#data-state | ||
/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/ | ||
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> { | ||
#[cfg(target_arch = "x86")] | ||
use std::arch::x86::{ | ||
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, | ||
_mm_set1_epi8, | ||
}; | ||
#[cfg(target_arch = "x86_64")] | ||
use std::arch::x86_64::{ | ||
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, | ||
_mm_set1_epi8, | ||
}; | ||
|
||
debug_assert!(!input.is_empty()); | ||
|
||
let quote_mask = _mm_set1_epi8('<' as i8); | ||
let escape_mask = _mm_set1_epi8('&' as i8); | ||
let carriage_return_mask = _mm_set1_epi8('\r' as i8); | ||
let zero_mask = _mm_set1_epi8('\0' as i8); | ||
let newline_mask = _mm_set1_epi8('\n' as i8); | ||
Comment on lines
+1913
to
+1917
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
For example, `_mm_set1_epi8('<' as i8)` creates a vector in which all 16 bytes are `<`. |
||
|
||
let raw_bytes: &[u8] = input.as_bytes(); | ||
let start = raw_bytes.as_ptr(); | ||
|
||
const STRIDE: usize = 16; | ||
let mut i = 0; | ||
let mut n_newlines = 0; | ||
while i + STRIDE <= raw_bytes.len() { | ||
// Load a 16 byte chunk from the input | ||
let data = _mm_loadu_si128(start.add(i) as *const __m128i); | ||
Comment on lines
+1926
to
+1927
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
// Compare the chunk against each mask | ||
let quotes = _mm_cmpeq_epi8(data, quote_mask); | ||
let escapes = _mm_cmpeq_epi8(data, escape_mask); | ||
let carriage_returns = _mm_cmpeq_epi8(data, carriage_return_mask); | ||
let zeros = _mm_cmpeq_epi8(data, zero_mask); | ||
let newlines = _mm_cmpeq_epi8(data, newline_mask); | ||
Comment on lines
+1930
to
+1934
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Therefore, |
||
|
||
// Combine all test results and create a bitmask from them. | ||
// Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise. | ||
let test_result = _mm_or_si128( | ||
_mm_or_si128(quotes, zeros), | ||
_mm_or_si128(escapes, carriage_returns), | ||
); | ||
Comment on lines
+1938
to
+1941
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Therefore, |
||
let bitmask = _mm_movemask_epi8(test_result); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
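For example, the SIMD vector `[0x00, 0xFF, 0x00, …, 0x00]` (only the second byte matched) is turned into the bitmask `0b0000_0000_0000_0010`. |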
||
let newline_mask = _mm_movemask_epi8(newlines); | ||
|
||
if (bitmask != 0) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If at least one bit in the mask is set, then one of the 16 input characters was in the set that we were looking for. |
||
// We have reached one of the characters that cause the state machine to transition | ||
let position = if cfg!(target_endian = "little") { | ||
bitmask.trailing_zeros() as usize | ||
} else { | ||
bitmask.leading_zeros() as usize | ||
}; | ||
Comment on lines
+1947
to
+1951
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To find the exact position of the character, we count the trailing zeros of the bitmask (that is, the number of leading characters in the chunk that were not in the set) and add them to the offset of the 16-byte chunk that we loaded. |
||
|
||
n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64; | ||
i += position; | ||
break; | ||
} else { | ||
n_newlines += newline_mask.count_ones() as u64; | ||
Comment on lines
+1953
to
+1957
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Additionally, we have to count the newlines among the consumed bytes so that the tokenizer's current line number stays correct.
Unfortunately, this makes the algorithm significantly slower than it could be. |
||
} | ||
|
||
i += STRIDE; | ||
} | ||
|
||
// Process any remaining bytes (less than STRIDE) | ||
while let Some(c) = raw_bytes.get(i) { | ||
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') { | ||
break; | ||
} | ||
if *c == b'\n' { | ||
n_newlines += 1; | ||
} | ||
|
||
i += 1; | ||
} | ||
Comment on lines
+1963
to
+1973
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This block takes care of any input chunks that are too small for SIMD. |
||
|
||
let set_result = if i == 0 { | ||
let first_char = input.pop_front_char().unwrap(); | ||
debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0')); | ||
|
||
// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case. | ||
// Still, it would be nice to not have to do that. | ||
// The same is true for the unwrap call. | ||
let preprocessed_char = self | ||
.get_preprocessed_char(first_char, &BufferQueue::default()) | ||
.unwrap(); | ||
SetResult::FromSet(preprocessed_char) | ||
} else { | ||
debug_assert!( | ||
input.len() >= i, | ||
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long", | ||
i, | ||
input.len() | ||
); | ||
let consumed_chunk = input.unsafe_subtendril(0, i as u32); | ||
input.unsafe_pop_front(i as u32); | ||
SetResult::NotFromSet(consumed_chunk) | ||
}; | ||
|
||
self.current_line.set(self.current_line.get() + n_newlines); | ||
|
||
Some(set_result) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SIMD can only process 16-byte chunks. So instead of scanning the input character-by-character we unroll the loop like this: