@@ -700,7 +700,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
700
700
match self . state . get ( ) {
701
701
//§ data-state
702
702
states:: Data => loop {
703
- match pop_except_from ! ( self , input, small_char_set!( '\r' '\0' '&' '<' '\n' ) ) {
703
+ let set = small_char_set ! ( '\r' '\0' '&' '<' '\n' ) ;
704
+
705
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
706
+ let set_result = if !( self . opts . exact_errors
707
+ || self . reconsume . get ( )
708
+ || self . ignore_lf . get ( ) )
709
+ && is_x86_feature_detected ! ( "sse2" )
710
+ {
711
+ let front_buffer = input. peek_front_chunk_mut ( ) ;
712
+ let Some ( mut front_buffer) = front_buffer else {
713
+ return ProcessResult :: Suspend ;
714
+ } ;
715
+
716
+ // Special case: The fast path is not worth taking if the first character is already in the set,
717
+ // which is fairly common
718
+ let first_char = front_buffer
719
+ . chars ( )
720
+ . next ( )
721
+ . expect ( "Input buffers are never empty" ) ;
722
+
723
+ if matches ! ( first_char, '\r' | '\0' | '&' | '<' | '\n' ) {
724
+ drop ( front_buffer) ;
725
+ self . pop_except_from ( input, set)
726
+ } else {
727
+ // SAFETY:
728
+ // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
729
+ let result = unsafe { self . data_state_sse2_fast_path ( & mut front_buffer) } ;
730
+
731
+ if front_buffer. is_empty ( ) {
732
+ drop ( front_buffer) ;
733
+ input. pop_front ( ) ;
734
+ }
735
+
736
+ result
737
+ }
738
+ } else {
739
+ self . pop_except_from ( input, set)
740
+ } ;
741
+
742
+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
743
+ let set_result = self . pop_except_from ( input, set) ;
744
+
745
+ let Some ( set_result) = set_result else {
746
+ return ProcessResult :: Suspend ;
747
+ } ;
748
+ match set_result {
704
749
FromSet ( '\0' ) => {
705
750
self . bad_char_error ( ) ;
706
751
go ! ( self : emit '\0' )
@@ -1752,6 +1797,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
1752
1797
states:: CdataSectionEnd => go ! ( self : push_temp ']' ; push_temp ']' ; to CdataSection ) ,
1753
1798
}
1754
1799
}
1800
+
1801
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1802
+ #[ target_feature( enable = "sse2" ) ]
1803
+ /// Implements the [data state] with SIMD instructions.
1804
+ ///
1805
+ /// The algorithm implemented is the naive SIMD approach described [here].
1806
+ ///
1807
+ /// ### SAFETY:
1808
+ /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1809
+ ///
1810
+ /// [data state]: https://html.spec.whatwg.org/#data-state
1811
+ /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1812
+ unsafe fn data_state_sse2_fast_path ( & self , input : & mut StrTendril ) -> Option < SetResult > {
1813
+ #[ cfg( target_arch = "x86" ) ]
1814
+ use std:: arch:: x86:: {
1815
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1816
+ _mm_set1_epi8,
1817
+ } ;
1818
+ #[ cfg( target_arch = "x86_64" ) ]
1819
+ use std:: arch:: x86_64:: {
1820
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1821
+ _mm_set1_epi8,
1822
+ } ;
1823
+
1824
+ debug_assert ! ( !input. is_empty( ) ) ;
1825
+
1826
+ let quote_mask = _mm_set1_epi8 ( '<' as i8 ) ;
1827
+ let escape_mask = _mm_set1_epi8 ( '&' as i8 ) ;
1828
+ let carriage_return_mask = _mm_set1_epi8 ( '\r' as i8 ) ;
1829
+ let zero_mask = _mm_set1_epi8 ( '\0' as i8 ) ;
1830
+ let newline_mask = _mm_set1_epi8 ( '\n' as i8 ) ;
1831
+
1832
+ let raw_bytes: & [ u8 ] = input. as_bytes ( ) ;
1833
+ let start = raw_bytes. as_ptr ( ) ;
1834
+
1835
+ const STRIDE : usize = 16 ;
1836
+ let mut i = 0 ;
1837
+ let mut n_newlines = 0 ;
1838
+ while i + STRIDE <= raw_bytes. len ( ) {
1839
+ // Load a 16 byte chunk from the input
1840
+ let data = _mm_loadu_si128 ( start. add ( i) as * const __m128i ) ;
1841
+
1842
+ // Compare the chunk against each mask
1843
+ let quotes = _mm_cmpeq_epi8 ( data, quote_mask) ;
1844
+ let escapes = _mm_cmpeq_epi8 ( data, escape_mask) ;
1845
+ let carriage_returns = _mm_cmpeq_epi8 ( data, carriage_return_mask) ;
1846
+ let zeros = _mm_cmpeq_epi8 ( data, zero_mask) ;
1847
+ let newlines = _mm_cmpeq_epi8 ( data, newline_mask) ;
1848
+
1849
+ // Combine all test results and create a bitmask from them.
1850
+ // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1851
+ let test_result = _mm_or_si128 (
1852
+ _mm_or_si128 ( quotes, zeros) ,
1853
+ _mm_or_si128 ( escapes, carriage_returns) ,
1854
+ ) ;
1855
+ let bitmask = _mm_movemask_epi8 ( test_result) ;
1856
+ let newline_mask = _mm_movemask_epi8 ( newlines) ;
1857
+
1858
+ if ( bitmask != 0 ) {
1859
+ // We have reached one of the characters that cause the state machine to transition
1860
+ let position = if cfg ! ( target_endian = "little" ) {
1861
+ bitmask. trailing_zeros ( ) as usize
1862
+ } else {
1863
+ bitmask. leading_zeros ( ) as usize
1864
+ } ;
1865
+
1866
+ n_newlines += ( newline_mask & ( ( 1 << position) - 1 ) ) . count_ones ( ) as u64 ;
1867
+ i += position;
1868
+ break ;
1869
+ } else {
1870
+ n_newlines += newline_mask. count_ones ( ) as u64 ;
1871
+ }
1872
+
1873
+ i += STRIDE ;
1874
+ }
1875
+
1876
+ // Process any remaining bytes (less than STRIDE)
1877
+ while let Some ( c) = raw_bytes. get ( i) {
1878
+ if matches ! ( * c, b'<' | b'&' | b'\r' | b'\0' ) {
1879
+ break ;
1880
+ }
1881
+ if * c == b'\n' {
1882
+ n_newlines += 1 ;
1883
+ }
1884
+
1885
+ i += 1 ;
1886
+ }
1887
+
1888
+ let set_result = if i == 0 {
1889
+ let c = input. pop_front_char ( ) . unwrap ( ) ;
1890
+ debug_assert ! ( matches!( c, '<' | '&' | '\r' | '\0' ) ) ;
1891
+
1892
+ // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1893
+ // Still, it would be nice to not have to do that.
1894
+ // The same is true for the unwrap call.
1895
+ let preprocessed_char = self
1896
+ . get_preprocessed_char ( c, & BufferQueue :: default ( ) )
1897
+ . unwrap ( ) ;
1898
+ SetResult :: FromSet ( preprocessed_char)
1899
+ } else {
1900
+ debug_assert ! (
1901
+ input. len( ) >= i,
1902
+ "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" ,
1903
+ i,
1904
+ input. len( )
1905
+ ) ;
1906
+ let consumed_chunk = input. unsafe_subtendril ( 0 , i as u32 ) ;
1907
+ input. unsafe_pop_front ( i as u32 ) ;
1908
+ SetResult :: NotFromSet ( consumed_chunk)
1909
+ } ;
1910
+
1911
+ self . current_line . set ( self . current_line . get ( ) + n_newlines) ;
1912
+
1913
+ Some ( set_result)
1914
+ }
1755
1915
}
1756
1916
1757
1917
#[ cfg( test) ]
0 commit comments