From a65ad0d7dd46f7b61205687cba5050ac35ddaff0 Mon Sep 17 00:00:00 2001
From: Elijah Mirecki <elimirks@gmail.com>
Date: Fri, 17 Dec 2021 20:05:47 -0500
Subject: [PATCH] Improve ws lexing SIMD

---
 src/tokenizer.rs   | 108 +++++++++++++++++++++++++++++----------------
 src/util.rs        |  14 ++++++
 test/dereference.b |   2 +
 test/fib_rec.b     |   6 +--
 4 files changed, 89 insertions(+), 41 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 66d44d2..b80d748 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,5 +1,6 @@
 use crate::parser::ParseContext;
 use crate::ast::{Pos, CompErr};
+use crate::util::lsb_number;
 
 #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
 use std::arch::x86_64::*;
@@ -349,73 +350,103 @@ fn get_tok_word(c: &mut ParseContext) -> Result<(Pos, Token), CompErr> {
 }
 
 /**
- * Extract an alphanumeric slice at the given offset
+ * Extract an alphanumeric (and underscore) slice at the given offset
  * @return An empty slice if the offset is out of bounds,
  *         or if there are no alphanumeric characters at that position
  */
 fn alphanumeric_slice<'a>(
     pos: &Pos, slice: &'a [u8], offset: usize
 ) -> Result<&'a str, CompErr> {
+    let len = alphanumeric_len(slice, offset);
+    match std::str::from_utf8(&slice[offset..offset + len]) {
+        Ok(s) => Ok(s),
+        _     => CompErr::err(pos, "Only ASCII is supported".to_string()),
+    }
+}
+
+pub fn is_alpha(c: u8) -> bool {
+    (c >= 97 && c <= 122) | (c >= 65 && c <= 90) | (c >= 48 && c <= 57) | (c == 95)
+}
+
+fn alphanumeric_len(
+    slice: &[u8], offset: usize
+) -> usize {
     let mut len = 0;
-    // TODO: SIMD
+
     while offset + len < slice.len() {
-        let c = slice[offset + len] as char;
-        if c.is_alphanumeric() || c == '_' {
+        if is_alpha(slice[offset + len]) {
             len += 1;
         } else {
             break;
         }
     }
-    match std::str::from_utf8(&slice[offset..offset + len]) {
-        Ok(s) => Ok(s),
-        _     => CompErr::err(pos, "Only ASCII is supported".to_string()),
-    }
+    len
 }
 
+/// Returns true when it hit the end of the ws
 #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-unsafe fn simd_consume_ws(c: &mut ParseContext) {
-    let space = ' ' as i8;
-    let space_vec = _mm_set_epi8(
-        space, space, space, space,
-        space, space, space, space,
-        space, space, space, space,
-        space, space, space, space
-    );
-    // Bitmask that covers both newlines & tabs.
-    // It also covers a bunch of other chars that we don't care about
-    let nl_tab = 0b00001000i8;
-    let nl_tab_vec = _mm_set_epi8(
-        nl_tab, nl_tab, nl_tab, nl_tab,
-        nl_tab, nl_tab, nl_tab, nl_tab,
-        nl_tab, nl_tab, nl_tab, nl_tab,
-        nl_tab, nl_tab, nl_tab, nl_tab
-    );
+#[allow(overflowing_literals)]
+unsafe fn simd_consume_ws(c: &mut ParseContext) -> bool {
+    let space_vec = _mm_set1_epi8(' ' as i8);
+    let tab_nl_vec = _mm_set1_epi8(0b11111000);
+    let tab_nl_stat_vec = _mm_set1_epi8(0b00001000);
+
     while c.offset + 16 < c.content.len() {
         let values = _mm_loadu_si128(&c.content[c.offset] as *const u8 as *const _);
-        let result = _mm_or_si128(
+
+        // Values will be 255 if they're whitespace
+        // andnot(a, b) does ((NOT a) AND b)
+        let result = _mm_andnot_si128(
             _mm_cmpeq_epi8(values, space_vec),
-            _mm_cmpeq_epi8(values, nl_tab_vec)
+            // Negated since it's gt instead of eq
+            _mm_cmpgt_epi8(
+                _mm_and_si128(values, tab_nl_vec),
+                tab_nl_stat_vec
+            )
         );
 
-        let p = &result as *const _ as *const u8;
+        // Compute bitmask of which values are 255
+        // Mask is zeros going from from right to left
+        let mask = _mm_movemask_epi8(result) as u32;
 
-        // TODO: Is there a better way than a filthy for loop?
-        for i in 0..16 {
-            if *p.add(i) == 0 {
-                // We aren't at a whitespace char anymore
-                return;
-            } else {
-                c.offset += 1;
-            }
+        //println!("mask: {:016b}, {}", mask, std::str::from_utf8(&c.content[c.offset..c.offset + 10]).unwrap());
+        //println!("values: {:016b}", mask);
+
+        if mask == 0 {
+            c.offset += 16;
+        } else {
+            let lsb = lsb_number(mask);
+            //println!("lsb:  {}", lsb);
+
+            c.offset += lsb as usize;
+            return true;
         }
     }
+    false
 }
 
 // Parse any amount of whitespace, including comments
 fn consume_ws(c: &mut ParseContext) {
     #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
     unsafe {
-        simd_consume_ws(c);
+        if c.offset < c.content.len() {
+            match c.content[c.offset] as char {
+                ' ' | '\n' | '\t' => {},
+                '/' => {
+                    consume_comment(c);
+                },
+                _ => return,
+            };
+        }
+
+        while simd_consume_ws(c) {
+            match c.content[c.offset] as char {
+                '/' => if !consume_comment(c) {
+                    return
+                },
+                _ => return,
+            }
+        }
     }
 
     while c.offset < c.content.len() {
@@ -443,10 +474,11 @@ fn consume_comment(c: &mut ParseContext) -> bool {
     }
     c.offset += 2;
 
+    // TODO: SIMD goes here
+
     let mut one;
     let mut two = 0;
 
-    // TODO: SIMD to search for */
     while c.offset < c.content.len() {
         one = two;
         two = c.content[c.offset];
diff --git a/src/util.rs b/src/util.rs
index 1cf1106..7197941 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -19,3 +19,17 @@ fn try_logical_cpu_count() -> Result<usize, io::Error> {
     }
     Ok(logical_cpus)
 }
+
+const DE_BRUIJN_LBS_BIT_POS: [u32; 32] = [
+    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 
+    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+];
+/// TL;DR Fucking fast algos to find the least significant bit and most significant
+/// It will find the value in 4 arithmetic operations and a memory lookup.
+/// # References
+/// http://supertech.csail.mit.edu/papers/debruijn.pdf
+pub fn lsb_number(v: u32) -> u32 {
+    let vi = v as i32;
+    let index = ((((vi & -vi) as u32).wrapping_mul(0x077CB531u32))).wrapping_shr(27);
+    return DE_BRUIJN_LBS_BIT_POS[index as usize];
+}
diff --git a/test/dereference.b b/test/dereference.b
index 4819d20..5c001cf 100644
--- a/test/dereference.b
+++ b/test/dereference.b
@@ -1,3 +1,5 @@
+
+    /* test comment */
 doubleplus1(xref) {
     *xref = *xref + *xref;
     *xref = 1 + **&xref;
diff --git a/test/fib_rec.b b/test/fib_rec.b
index 73e5dba..c8ab94d 100644
--- a/test/fib_rec.b
+++ b/test/fib_rec.b
@@ -1,6 +1,3 @@
-  
-
-
 /* Returns nonsense for n < 0 */
 fib(n) {
   return(
@@ -11,6 +8,9 @@ fib(n) {
 }
 
 main() {
+    auto x;
+    x = 2;
+    /* Should return "55" */
     return(fib(10));
 }