Skip to content

Commit

Permalink
Use SIMD for whitespace parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
elimirks committed Dec 17, 2021
1 parent bc5e9f0 commit 1e4f5ad
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 13 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,11 @@ In C, this expression would be true: `2 && 1`. But this would be false: `2 & 1`.
The `+=`, `-=`, ..., operators in C are written as `=+`, `=-`, ...

This leads to some whitespace dependence to avoid ambiguity. Specifically, `a =*b` is the same as `a =* b`, but is different from `a = *b`. The same applies to `=-`.

## Compiler notes
### UTF-8
- It will mostly work inside comments (except for UTF-8 sequences with trailing `*/` bytes!).
- It won't work anywhere else.

### SIMD
Compile with `RUSTFLAGS='-C target-feature=+avx2'` to leverage SIMD in the lexer. It slightly improves performance.
3 changes: 2 additions & 1 deletion src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ impl ParseState {
fn pop_path_to_parse(&mut self) -> Option<(usize, PathBuf)> {
self.parse_stack.pop().map(|path| {
self.running_parsers += 1;
let res = (self.file_id, path);
self.file_id += 1;
(self.file_id, path)
res
})
}
}
Expand Down
78 changes: 66 additions & 12 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use crate::parser::ParseContext;
use crate::ast::{Pos, CompErr};

#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
use std::arch::x86_64::*;

#[derive(Debug, PartialEq)]
pub enum Token {
Id(String),
Expand Down Expand Up @@ -85,7 +88,7 @@ pub fn pop_tok(c: &mut ParseContext) -> Result<(Pos, Token), CompErr> {
},
Some(ch) => {
if ch == '_' || ch.is_alphabetic() {
Ok(get_tok_word(c))
get_tok_word(c)
} else if ch.is_numeric() {
get_tok_int(c, 10)
} else if ch == '\'' {
Expand Down Expand Up @@ -168,13 +171,12 @@ fn get_tok_symbol(c: &mut ParseContext) -> Result<(Pos, Token), CompErr> {
// Assumes the @ token has been parsed
// Returns a metaprogramming token
fn get_tok_meta(c: &mut ParseContext) -> Result<(Pos, Token), CompErr> {
let next_chars = alphanumeric_slice(&c.content, c.offset + 1);
let next_word = std::str::from_utf8(next_chars).unwrap();
let pos = c.pos();
let next_word = alphanumeric_slice(&pos, &c.content, c.offset + 1)?;

match next_word {
"import" => {
c.offset += 1 + next_chars.len();
c.offset += 1 + next_word.len();
Ok((c.pos(), Token::Import))
},
other => {
Expand Down Expand Up @@ -225,9 +227,9 @@ fn get_tok_int(
c: &mut ParseContext, radix: u32
) -> Result<(Pos, Token), CompErr> {
let pos = c.pos();
let current_word = alphanumeric_slice(&c.content, c.offset);
let current_word = alphanumeric_slice(&pos, &c.content, c.offset)?;
// TODO: No need to allocate a new string here. Reimplement from radix!
let str_word: String = std::str::from_utf8(current_word).unwrap().to_string();
let str_word: String = current_word.to_string();

match i64::from_str_radix(&str_word, radix) {
Ok(num) => {
Expand Down Expand Up @@ -314,13 +316,13 @@ fn get_inside_quotes(
}

// Parsed word-like tokens. Includes keywords and IDs
fn get_tok_word(c: &mut ParseContext) -> (Pos, Token) {
fn get_tok_word(c: &mut ParseContext) -> Result<(Pos, Token), CompErr> {
let pos = c.pos();
let slice = alphanumeric_slice(&c.content, c.offset);
let slice = alphanumeric_slice(&pos, &c.content, c.offset)?;
c.offset += slice.len();

// Safe to assume it's valid utf8 since we enforce ASCII
let tok = match std::str::from_utf8(slice).unwrap() {
let tok = match slice {
"auto" => Token::Auto,
"break" => Token::Break,
"else" => Token::Else,
Expand All @@ -343,16 +345,19 @@ fn get_tok_word(c: &mut ParseContext) -> (Pos, Token) {
},
};

(pos, tok)
Ok((pos, tok))
}

/**
* Extract an alphanumeric slice at the given offset
* @return An empty slice if the offset is out of bounds,
* or if there are no alphanumeric characters at that position
*/
fn alphanumeric_slice(slice: &[u8], offset: usize) -> &[u8] {
fn alphanumeric_slice<'a>(
pos: &Pos, slice: &'a [u8], offset: usize
) -> Result<&'a str, CompErr> {
let mut len = 0;
// TODO: SIMD
while offset + len < slice.len() {
let c = slice[offset + len] as char;
if c.is_alphanumeric() || c == '_' {
Expand All @@ -361,15 +366,63 @@ fn alphanumeric_slice(slice: &[u8], offset: usize) -> &[u8] {
break;
}
}
&slice[offset..offset + len]
match std::str::from_utf8(&slice[offset..offset + len]) {
Ok(s) => Ok(s),
_ => CompErr::err(pos, "Only ASCII is supported".to_string()),
}
}

/// Advances `c.offset` past leading whitespace, 16 bytes at a time.
///
/// Each iteration loads the 16 bytes starting at `c.offset` and checks them
/// against `' '`, `'\n'` and `'\t'`. The offset is moved past the run of
/// whitespace at the front of the chunk; the function returns as soon as a
/// chunk contains a non-whitespace byte, or when fewer than 16 bytes remain.
/// Whatever is left is expected to be handled by the caller's scalar loop.
///
/// # Safety
///
/// The CPU executing this function must support the instructions it was
/// compiled with (guaranteed by the `cfg` gate when the binary is run on the
/// hardware it was built for). The unaligned load never reads past the end of
/// `c.content`, because the loop condition keeps `c.offset + 16` within
/// `c.content.len()`.
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
unsafe fn simd_consume_ws(c: &mut ParseContext) {
    // `_mm_cmpeq_epi8` tests for byte *equality*, not for shared bits, so each
    // whitespace character needs its own comparison vector.
    let space_vec = _mm_set1_epi8(b' ' as i8);
    let newline_vec = _mm_set1_epi8(b'\n' as i8);
    let tab_vec = _mm_set1_epi8(b'\t' as i8);

    while c.offset + 16 <= c.content.len() {
        let values = _mm_loadu_si128(&c.content[c.offset] as *const u8 as *const __m128i);
        let is_ws = _mm_or_si128(
            _mm_cmpeq_epi8(values, space_vec),
            _mm_or_si128(
                _mm_cmpeq_epi8(values, newline_vec),
                _mm_cmpeq_epi8(values, tab_vec),
            ),
        );

        // One bit per byte lane (bit 0 = first byte), set where the byte is whitespace.
        let mask = _mm_movemask_epi8(is_ws) as u32;
        // Bits 16..32 of `!mask` are always set, so the count is at most 16.
        let leading_ws = (!mask).trailing_zeros() as usize;
        c.offset += leading_ws;

        if leading_ws < 16 {
            // We aren't at a whitespace char anymore
            return;
        }
    }
}

// Parse any amount of whitespace, including comments
fn consume_ws(c: &mut ParseContext) {
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
unsafe {
simd_consume_ws(c);
}

while c.offset < c.content.len() {
match c.content[c.offset] as char {
' ' => c.offset += 1,
'\n' => c.offset += 1,
'\t' => c.offset += 1,
'/' => if !consume_comment(c) {
break
},
Expand All @@ -393,6 +446,7 @@ fn consume_comment(c: &mut ParseContext) -> bool {
let mut one;
let mut two = 0;

// TODO: SIMD to search for */
while c.offset < c.content.len() {
one = two;
two = c.content[c.offset];
Expand Down
5 changes: 5 additions & 0 deletions test/fib_rec.b
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@



/* Returns nonsense for n < 0 */
fib(n) {
return(
Expand All @@ -10,3 +13,5 @@ fib(n) {
main() {
return(fib(10));
}

/* Some utf-8 Ω */

0 comments on commit 1e4f5ad

Please sign in to comment.