From c15230eb369c40fe3cfb53b191a49f0b161ee2a1 Mon Sep 17 00:00:00 2001 From: arya dradjica Date: Thu, 26 Sep 2024 14:15:32 +0200 Subject: [PATCH] Begin working on an AVX2 parser I've implemented the parser thus far as an example program so that I can execute it directly. At the moment, it operates on a 64K chunk, splits it into four segments, and finds every word and line boundary within it. I will benchmark it soon to evaluate how effective this paradigm can be. --- examples/parser.rs | 393 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 examples/parser.rs diff --git a/examples/parser.rs b/examples/parser.rs new file mode 100644 index 00000000..318dbdae --- /dev/null +++ b/examples/parser.rs @@ -0,0 +1,393 @@ +use core::arch::x86_64::*; +use std::io::{self, Read}; +use std::{env, fs::File}; + +#[derive(Debug)] +struct Frame<'a> { + /// The input data for a frame. + /// + /// There must be at least 8 bytes of additional memory lying beyond the end + /// of this array that are safe to read. + input: &'a [u8; 65536], + + /// Entries parsed from this frame. + /// + /// A frame can contain at most 8K entries, since entries with fewer than + /// 8 bytes are virtually impossible. Nicely, this means another 64KiB of + /// data storage. + entries: &'a mut [Entry; 8192], +} + +impl<'a> Frame<'a> { + /// Lex the frame into entries. + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn lex(self) -> Result<(), Error> { + // The start and limit of the segment. + let (start, limit) = self.segment()?; + + // The current offset in the segment. + let mut offset = start; + + // Whether the segment is still working. + let mut working = _mm256_set1_epi64x(!0); + + let mut count = 0usize; + + loop { + // Load the next 8 bytes of the segment. + let view = self.view(offset, working); + + // Move to the next view. + offset = self.next_offset(view, offset, working); + + // Stop processing if the segment has ended. + working = _mm256_cmpgt_epi64(limit, offset); + + count += 1; + + // If all processing is finished, stop. + if _mm256_testz_si256(working, working) != 0 { + break; + } + } + + println!("Performed {count} iterations"); + + Ok(()) + } + + /// The offset of the next view. + /// + /// The next view will begin at a new word (if one was found within the + /// current view), or at the next 8 bytes of the current word. + /// + /// Latency (view -> return): 17c + /// Latency (offset -> return): 1c + /// Latency (working -> return): 2c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn next_offset( + &self, + view: __m256i, + offset: __m256i, + working: __m256i, + ) -> __m256i { + // Mark bytes in the current word. + let w = self.word_mask(view); + + // Increment the number of set bytes, saturating to 8. + let w = _mm256_or_si256(w, _mm256_set1_epi64x((0xFFu64 << 56) as _)); + + // Count the number of set bytes. + let c = Self::count_bytes(w); + + // Zero the count if the segment is finished working. + let c = _mm256_and_si256(c, working); + + // Add the count to the current offset. + _mm256_add_epi64(offset, c) + } + + /// Whether this is the last token on the line. + /// + /// This is all-ones if the whitespace terminating the current token is a + /// newline (so that the next token will follow it). + /// + /// Latency (view -> return): 7c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn end_of_line(&self, view: __m256i) -> __m256i { + // If the mid-line word mask contains bits that the line mask does not, + // then the line mask is shorter than the mid-line word mask, and thus + // the current word terminates on that newline. If any such bits exist, + // the MSB will be one of them, thus it is a negative integer. + + // Mark bytes in the same word (ignoring newlines). + let mlw = self.mid_line_word_mask(view); + + // Mark bytes in the same line. + let l = self.line_mask(view); + + // Mark bytes that are not on this line (including line terminator). + let nl = _mm256_andnot_si256(l, mlw); + + // Check that at least one byte is not on this line. + // If any bytes are not on this line, the MSB will be set. + _mm256_cmpgt_epi64(_mm256_setzero_si256(), nl) + } + + /// Whether the end of the word was found. + /// + /// This is all-ones if whitespace was found in the current block, as this + /// delimits the current token. + /// + /// Latency (view -> return): 7c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn end_of_word(&self, view: __m256i) -> __m256i { + // If the MSB is not set, the word mask did not include the last byte. + _mm256_cmpgt_epi64(self.word_mask(view), _mm256_set1_epi64x(-1)) + } + + /// A mask of the current word. + /// + /// Bytes are all-ones if they are within the current word (excluding the + /// terminating whitespace). + /// + /// Latency (view -> return): 6c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn word_mask(&self, view: __m256i) -> __m256i { + // Truncate the mid-line word mask if a newline is within it. + _mm256_and_si256(self.mid_line_word_mask(view), self.line_mask(view)) + } + + /// A mask of the current line. + /// + /// Bytes are all-ones if they are within the current line (excluding the + /// line terminator itself). + /// + /// Latency (view -> return): 4c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn line_mask(&self, view: __m256i) -> __m256i { + // Mark ASCII newlines (0x0A). + let lf = _mm256_cmpeq_epi8(view, _mm256_set1_epi8(b'\n' as i8)); + + // Mark newline bytes by their LSB. + let lf = _mm256_and_si256(lf, _mm256_set1_epi8(1)); + + // Toggle bytes up to (and including) the first newline. + let mask = _mm256_sub_epi64(lf, _mm256_set1_epi64x(1)); + + // Mark all bytes up to (but excluding) the first newline. + _mm256_andnot_si256(lf, mask) + } + + /// A mask of the current word, assuming newlines are absent. + /// + /// Bytes are all-ones if they are within the current word (excluding the + /// terminating whitespace). Newlines are not considered as terminators, + /// instead being treated as parts of words. + /// + /// Latency (view -> return): 5c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn mid_line_word_mask(&self, view: __m256i) -> __m256i { + // Mark mid-line spaces. + let sp = self.mid_line_space(view); + + // Mark mid-line bytes by their LSB. + let sp = _mm256_and_si256(sp, _mm256_set1_epi8(1)); + + // Toggle bytes up to (and including) the first mid-line space. + let mask = _mm256_sub_epi64(sp, _mm256_set1_epi64x(1)); + + // Mark all bytes up to (but excluding) the first mid-line space. + _mm256_andnot_si256(sp, mask) + } + + /// A mask of mid-line whitespace (spaces and tabs). + /// + /// Bytes are all-ones if they are ASCII spaces ('0x20') or horizontal tabs + /// ('0x09'), and are otherwise all-zeros. + /// + /// Latency (view -> return): 2c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn mid_line_space(&self, view: __m256i) -> __m256i { + // Mark ASCII spaces (0x20). + let sp = _mm256_cmpeq_epi8(view, _mm256_set1_epi8(b'=' as i8)); + + // Mark ASCII horizontal tabs (0x09). + let ht = _mm256_cmpeq_epi8(view, _mm256_set1_epi8(b'\t' as i8)); + + // Combine marks. + _mm256_or_si256(sp, ht) + } + + /// The current view into the segment. + /// + /// The view consists of the next 8 bytes of the segment. If the segment's + /// work has been completed, all-zeros is returned. + /// + /// Latency (offset -> return): 16c + /// Latency (working -> return): 15c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn view(&self, offset: __m256i, working: __m256i) -> __m256i { + // The value returned if the segment is not working. + let fallback = _mm256_setzero_si256(); + + // The base address the segment is loaded from. + let addr = self.input.as_ptr() as *const i64; + + // Load the segment's bytes if it is working. + _mm256_mask_i64gather_epi64(fallback, addr, offset, working, 1) + } + + /// Divide the frame into segments. + /// + /// A segment is a slice of the frame that begins with a newline. This is + /// important since lexing cannot start in the middle of a record. + /// + /// Initially, 4 segments are drawn by dividing the frame equally. Each of + /// the frames is then skipped forward until a newline. + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn segment(&self) -> Result<(__m256i, __m256i), Error> { + // The offset of the segment within the frame. + let mut offset = _mm256_set_epi64x(4, 3, 2, 1); + offset = _mm256_slli_epi64(offset, 16 - 2 - 3); + offset = _mm256_sub_epi64(offset, _mm256_set1_epi64x(1)); + + // The first 8 bytes in the segment. + let mut input = _mm256_setzero_si256(); + + // A mask of newlines in the input. + let mut lines = _mm256_setzero_si256(); + + // Whether a newline needs to be searched for. + let mut search = _mm256_set1_epi64x(!0); + + // Search for a newline. + // + // Latency (offset -> input): 16c + // Latency (search -> input): 15c + // Latency (input -> search): 4c + // Latency (input -> next offset): 4c + // Latency (offset -> next offset): 1c + // Latency (input -> break): 14c + for _ in 0..2048 { + // Load the next 8 bytes of the segment. + input = _mm256_mask_i64gather_epi64( + input, + self.input.as_ptr() as *const _, + offset, + search, + 8, + ); + + // Mark newline bytes (all bits set if byte is equal). + lines = _mm256_cmpeq_epi8(input, _mm256_set1_epi8(b'\n' as _)); + + // Check whether the search needs to continue. + search = _mm256_cmpeq_epi64(lines, _mm256_setzero_si256()); + + // Push the offset backward if the search needs to continue. + offset = _mm256_add_epi64(offset, search); + + // If all searches are finished, stop. + if _mm256_testz_si256(search, search) != 0 { + break; + } + } + + // If a newline wasn't found, return an error. + if _mm256_testz_si256(search, search) == 0 { + return Err(Error); + } + + // Mark newline bytes (MSB is set for newlines). + lines = _mm256_and_si256(lines, _mm256_set1_epi8(!0x7F)); + + // Switch the offsets to units of bytes. + offset = _mm256_slli_epi64(offset, 3); + + // Select all bytes up to (and including) the first newline byte. + let to_skip = _mm256_sub_epi64(lines, _mm256_set1_epi64x(1)); + let to_skip = _mm256_xor_si256(lines, to_skip); + + // Update the offset to skip past the newline. + offset = _mm256_add_epi64(offset, Self::count_bytes(to_skip)); + + // Calculate the start of each segment. + let start = _mm256_permute4x64_epi64(offset, 0b10010011); + let start = _mm256_and_si256(start, _mm256_set_epi64x(!0, !0, !0, 0)); + + Ok((start, offset)) + } + + /// Count the number of set bytes. + /// + /// The input must contain bytes that are all-ones or all-zeros. The total + /// number of all-ones bytes (between 0 and 8, inclusive) is returned. + /// + /// Latency (bytes -> result): 8c + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn count_bytes(mut bytes: __m256i) -> __m256i { + // Negate bytes so that each one is 0 or 1. + bytes = _mm256_sub_epi8(_mm256_setzero_si256(), bytes); + + // Pair bytes in the low and high dwords and add them. + bytes = _mm256_add_epi8(bytes, _mm256_srli_epi64(bytes, 32)); + + // Pair bytes in the low and high words and add them. + bytes = _mm256_add_epi8(bytes, _mm256_srli_epi64(bytes, 16)); + + // Pair the low and high bytes and add them. + bytes = _mm256_add_epi8(bytes, _mm256_srli_epi64(bytes, 8)); + + // Mask out intermediate results. + bytes = _mm256_and_si256(bytes, _mm256_set1_epi64x(0xFF)); + + bytes + } +} + +/// A lexical entry. +/// +/// Entries are lines in a zone file providing DNS information. This type is an +/// efficient and compact representation for an entry that can be produced by a +/// vectorized lexer. +#[derive(Copy, Clone, Debug, Default)] +#[repr(C)] +struct Entry { + /// The address of this record. + /// + /// This is an offset in bytes relative to the frame. + addr: u32, + + /// The location of the record data. + /// + /// The most-significant bit is set if an explicit domain name was present + /// in the record. + /// + /// The remaining bits are an offset in bytes from the start of the record. + data: u16, + + /// The location of the TTL. + /// + /// This is an offset in bytes from the start of the record. If it is zero, + /// an implicit TTL was used. + ttl: u8, + + /// The record type. + /// + /// This is a unique identifier for a record or control entry type, derived + /// by a perfect hash of the first few bytes of its name. + hash: u8, +} + +#[derive(Copy, Clone, Debug)] +struct Error; + +fn main() -> io::Result<()> { + let path = env::args().nth(1).unwrap(); + let mut input = Vec::with_capacity(4096 * 16 + 8); + File::open(path)?.take(4096 * 16).read_to_end(&mut input)?; + input.resize(input.capacity(), 0u8); + + let mut entries = vec![Entry::default(); 512 * 16]; + + let frame = Frame { + input: (&input[..65536]).try_into().unwrap(), + entries: (&mut entries[..8192]).try_into().unwrap(), + }; + + unsafe { frame.lex() }.unwrap(); + + Ok(()) +}