@@ -700,7 +700,53 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
700
700
match self . state . get ( ) {
701
701
//§ data-state
702
702
states:: Data => loop {
703
- match pop_except_from ! ( self , input, small_char_set!( '\r' '\0' '&' '<' '\n' ) ) {
703
+ let set = small_char_set ! ( '\r' '\0' '&' '<' '\n' ) ;
704
+
705
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
706
+ let set_result = if !( self . opts . exact_errors
707
+ || self . reconsume . get ( )
708
+ || self . ignore_lf . get ( ) )
709
+ && is_x86_feature_detected ! ( "sse2" )
710
+ {
711
+ let front_buffer = input. peek_front_chunk_mut ( ) ;
712
+ let Some ( mut front_buffer) = front_buffer else {
713
+ return ProcessResult :: Suspend ;
714
+ } ;
715
+
716
+ // Special case: The fast path is not worth taking if the first character is already in the set,
717
+ // which is fairly common
718
+ let first_char = front_buffer
719
+ . chars ( )
720
+ . next ( )
721
+ . expect ( "Input buffers are never empty" ) ;
722
+ let result = if matches ! ( first_char, '\r' | '\0' | '&' | '<' | '\n' ) {
723
+ drop ( front_buffer) ;
724
+ self . pop_except_from ( input, set)
725
+ } else {
726
+ // SAFETY:
727
+ // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
728
+ let result = unsafe { self . data_state_sse2_fast_path ( & mut front_buffer) } ;
729
+
730
+ if front_buffer. is_empty ( ) {
731
+ drop ( front_buffer) ;
732
+ input. pop_front ( ) ;
733
+ }
734
+
735
+ result
736
+ } ;
737
+
738
+ result
739
+ } else {
740
+ self . pop_except_from ( input, set)
741
+ } ;
742
+
743
+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
744
+ let set_result = self . pop_except_from ( input, set) ;
745
+
746
+ let Some ( set_result) = set_result else {
747
+ return ProcessResult :: Suspend ;
748
+ } ;
749
+ match set_result {
704
750
FromSet ( '\0' ) => {
705
751
self . bad_char_error ( ) ;
706
752
go ! ( self : emit '\0' )
@@ -1752,6 +1798,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
1752
1798
states:: CdataSectionEnd => go ! ( self : push_temp ']' ; push_temp ']' ; to CdataSection ) ,
1753
1799
}
1754
1800
}
1801
+
1802
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1803
+ #[ target_feature( enable = "sse2" ) ]
1804
+ /// Implements the [data state] with SIMD instructions.
1805
+ ///
1806
+ /// The algorithm implemented is the naive SIMD approach described [here].
1807
+ ///
1808
+ /// ### SAFETY:
1809
+ /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1810
+ ///
1811
+ /// [data state]: https://html.spec.whatwg.org/#data-state
1812
+ /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1813
+ unsafe fn data_state_sse2_fast_path ( & self , input : & mut StrTendril ) -> Option < SetResult > {
1814
+ #[ cfg( target_arch = "x86" ) ]
1815
+ use std:: arch:: x86:: {
1816
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1817
+ _mm_set1_epi8,
1818
+ } ;
1819
+ #[ cfg( target_arch = "x86_64" ) ]
1820
+ use std:: arch:: x86_64:: {
1821
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1822
+ _mm_set1_epi8,
1823
+ } ;
1824
+
1825
+ debug_assert ! ( !input. is_empty( ) ) ;
1826
+
1827
+ let quote_mask = _mm_set1_epi8 ( '<' as i8 ) ;
1828
+ let escape_mask = _mm_set1_epi8 ( '&' as i8 ) ;
1829
+ let carriage_return_mask = _mm_set1_epi8 ( '\r' as i8 ) ;
1830
+ let zero_mask = _mm_set1_epi8 ( '\0' as i8 ) ;
1831
+ let newline_mask = _mm_set1_epi8 ( '\n' as i8 ) ;
1832
+
1833
+ let raw_bytes: & [ u8 ] = & input. as_bytes ( ) ;
1834
+ let start = raw_bytes. as_ptr ( ) ;
1835
+
1836
+ const STRIDE : usize = 16 ;
1837
+ let mut i = 0 ;
1838
+ let mut n_newlines = 0 ;
1839
+ while i + STRIDE <= raw_bytes. len ( ) {
1840
+ // Load a 16 byte chunk from the input
1841
+ let data = _mm_loadu_si128 ( start. offset ( i as isize ) as * const __m128i ) ;
1842
+
1843
+ // Compare the chunk against each mask
1844
+ let quotes = _mm_cmpeq_epi8 ( data, quote_mask) ;
1845
+ let escapes = _mm_cmpeq_epi8 ( data, escape_mask) ;
1846
+ let carriage_returns = _mm_cmpeq_epi8 ( data, carriage_return_mask) ;
1847
+ let zeros = _mm_cmpeq_epi8 ( data, zero_mask) ;
1848
+ let newlines = _mm_cmpeq_epi8 ( data, newline_mask) ;
1849
+
1850
+ // Combine all test results and create a bitmask from them.
1851
+ // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1852
+ let test_result = _mm_or_si128 (
1853
+ _mm_or_si128 ( quotes, zeros) ,
1854
+ _mm_or_si128 ( escapes, carriage_returns) ,
1855
+ ) ;
1856
+ let bitmask = _mm_movemask_epi8 ( test_result) ;
1857
+ let newline_mask = _mm_movemask_epi8 ( newlines) ;
1858
+
1859
+ if ( bitmask != 0 ) {
1860
+ // We have reached one of the characters that cause the state machine to transition
1861
+ let position = if cfg ! ( target_endian = "little" ) {
1862
+ bitmask. trailing_zeros ( ) as usize
1863
+ } else {
1864
+ bitmask. leading_zeros ( ) as usize
1865
+ } ;
1866
+
1867
+ n_newlines += ( newline_mask & ( ( 1 << position) - 1 ) ) . count_ones ( ) as u64 ;
1868
+ i += position;
1869
+ break ;
1870
+ } else {
1871
+ n_newlines += newline_mask. count_ones ( ) as u64 ;
1872
+ }
1873
+
1874
+ i += STRIDE ;
1875
+ }
1876
+
1877
+ // Process any remaining bytes (less than STRIDE)
1878
+ while let Some ( c) = raw_bytes. get ( i) {
1879
+ if matches ! ( * c, b'<' | b'&' | b'\r' | b'\0' ) {
1880
+ break ;
1881
+ }
1882
+ if * c == b'\n' {
1883
+ n_newlines += 1 ;
1884
+ }
1885
+
1886
+ i += 1 ;
1887
+ }
1888
+
1889
+ let set_result = if i == 0 {
1890
+ let c = input. pop_front_char ( ) . unwrap ( ) ;
1891
+ debug_assert ! ( matches!( c, '<' | '&' | '\r' | '\0' ) ) ;
1892
+
1893
+ // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1894
+ // Still, it would be nice to not have to do that.
1895
+ // The same is true for the unwrap call.
1896
+ let preprocessed_char = self
1897
+ . get_preprocessed_char ( c, & BufferQueue :: default ( ) )
1898
+ . unwrap ( ) ;
1899
+ SetResult :: FromSet ( preprocessed_char)
1900
+ } else {
1901
+ debug_assert ! (
1902
+ input. len( ) >= i,
1903
+ "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" ,
1904
+ i,
1905
+ input. len( )
1906
+ ) ;
1907
+ let consumed_chunk = input. unsafe_subtendril ( 0 , i as u32 ) ;
1908
+ input. unsafe_pop_front ( i as u32 ) ;
1909
+ SetResult :: NotFromSet ( consumed_chunk)
1910
+ } ;
1911
+
1912
+ self . current_line . set ( self . current_line . get ( ) + n_newlines) ;
1913
+
1914
+ Some ( set_result)
1915
+ }
1755
1916
}
1756
1917
1757
1918
#[ cfg( test) ]
0 commit comments