Skip to content

Commit

Permalink
Add a dynamic array to our scanner
Browse files Browse the repository at this point in the history
  • Loading branch information
tgross35 committed Jan 5, 2024
1 parent f1381cd commit f3b88d9
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 44 deletions.
14 changes: 13 additions & 1 deletion grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,19 @@ function array(item) {

module.exports = grammar({
name: "just",
externals: ($) => [$._indent, $._dedent, $._newline],
externals: (
$,
) => [
$._indent,
$._dedent,
$._newline,
$._string_start,
$._string_end,
$._raw_string_start,
$._raw_string_end,
$._command_start,
$._command_end,
],
inline: (
$,
) => [
Expand Down
226 changes: 183 additions & 43 deletions src/scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,22 @@
// Our scanner is always stored in a buffer of length
// TREE_SITTER_SERIALIZATION_BUFFER_SIZE. We use the rest of the data as the
// buffer for our dynamic array.
// Pointer to the first byte of the dynamic array, which lives SCAN_SIZE bytes
// past the start of the scanner state. Arguments are parenthesized so that
// expression arguments expand safely.
#define ARR_PTR(state_ptr) ((uint8_t *)(state_ptr) + SCAN_SIZE)
// The array element at `index`; usable as an lvalue.
#define ARR_ITEM(state_ptr, index) (ARR_PTR(state_ptr)[(index)])

/// External tokens handled by this scanner.
///
/// Tree-sitter indexes `valid_symbols` by position in the grammar's
/// `externals` list, so the enumerators that have an entry there must appear
/// in exactly the same order.
enum TokenType {
  INDENT,           // start of indentation
  DEDENT,           // end of indentation
  NEWLINE,          // EOL or EOF, excluding backslashes
  STRING_START,     // `"`, or its tripled version
  STRING_END,       // same as the above
  RAW_STRING_START, // `'`, or its tripled version
  RAW_STRING_END,   // same as the above
  COMMAND_START,    // `` ` ``, or its tripled version
  COMMAND_END,      // same as the above
  // NOTE(review): the body tokens below have no matching entry in the
  // grammar's `externals` list yet. They are kept last so that every token
  // above lines up with its external; add them to `externals` in this order
  // before the scanner emits them.
  STRING_BODY,      // contents of a string, excludes escapes
  RAW_STRING_BODY,  // contents of a raw string
  COMMAND_BODY,     // contents of a command, excluding interpolations
};

// A lot of this was taken from the Julia scanner,
Expand All @@ -42,58 +48,56 @@ typedef struct Scanner {
bool has_seen_eof;
} Scanner;

/// Push an opening delimiter onto the delimiter stack stored after the
/// scanner state.
///
/// `c` is the delimiter character. A tripled delimiter is stored as twice the
/// character's value, a single one as the character itself.
void push(Scanner *state, char c, bool is_triple) {
  assert(c < SCHAR_MAX);
  // Strictly less-than: the slot at index `len` is about to be written, so a
  // full stack must be rejected (assumes ARR_SIZE is the number of slots).
  assert(state->len < ARR_SIZE);
  ARR_ITEM(state, state->len) = (uint8_t)(is_triple ? (c * 2) : c);
  state->len++;
}

/// Remove and return the delimiter on top of the delimiter stack.
///
/// The stack must not be empty. The returned value is the raw stored byte,
/// i.e. still in the encoding written by `push`.
Delimiter pop(Scanner *state) {
  assert(state->len > 0);
  // Decrement first: the top element lives at index `len - 1`.
  state->len--;
  return ARR_ITEM(state, state->len);
}

/// Initialize our struct. We allocate once and store the array after our
/// struct. This makes serializing and deserializing much easier.
///
/// Returns a buffer of TREE_SITTER_SERIALIZATION_BUFFER_SIZE bytes whose
/// leading Scanner fields are zeroed; release it with `free`.
void *tree_sitter_just_external_scanner_create(void) {
  Scanner *state = malloc(TREE_SITTER_SERIALIZATION_BUFFER_SIZE);
  assert(state);

  state->len = 0;
  state->prev_indent = 0;
  state->has_seen_eof = false;

  return state;
}

/// Release the scanner buffer passed in as `payload`.
void tree_sitter_just_external_scanner_destroy(void *payload) {
  free(payload);
}

/// Copy the scanner state, followed by the used part of its delimiter array,
/// into `buffer` and return the number of bytes written.
unsigned tree_sitter_just_external_scanner_serialize(void *payload,
                                                     char *buffer) {
  const Scanner *scanner_state = payload;
  const size_t byte_count = SCAN_SIZE + scanner_state->len;

  // The state and its array must fit in tree-sitter's serialization buffer
  assert(byte_count < TREE_SITTER_SERIALIZATION_BUFFER_SIZE);
  memcpy(buffer, payload, byte_count);

  return byte_count;
}

void tree_sitter_just_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
Scanner *scanner = (Scanner *)payload;
Scanner *state = (Scanner *)payload;

// No state to deserialize, just reset ourselves
if (length == 0) {
scanner->prev_indent = 0;
scanner->has_seen_eof = false;
state->prev_indent = 0;
state->has_seen_eof = false;
return;
}
memcpy(scanner, buffer, length);
memcpy(state, buffer, length);
}

// Continue and include the preceding character in the token
Expand All @@ -103,29 +107,150 @@ void advance(TSLexer *lexer) { return lexer->advance(lexer, false); }
// Advance with the lexer's `skip` flag set to true.
// No `return` keyword: a void function may not return an expression.
void skip(TSLexer *lexer) { lexer->advance(lexer, true); }

// Forward to the lexer's `mark_end` callback.
// No `return` keyword: a void function may not return an expression.
void mark_end(TSLexer *lexer) { lexer->mark_end(lexer); }
// Forward to the lexer's `eof` callback and return its result.
bool eof(TSLexer *lexer) {
  return lexer->eof(lexer);
}

/// Recognise an opening delimiter made of `start_char`, written either once or
/// three times in a row (`'`, `"`, or `` ` ``).
///
/// On a match the delimiter is pushed onto the delimiter stack and the token
/// end is marked after the first character (single) or after the third
/// (tripled). Returns false when the lookahead is not `start_char`.
bool scan_string_start(TSLexer *lexer, Scanner *state, const char start_char) {
  if (lexer->lookahead != start_char) {
    return false;
  }

  // Consume the first delimiter and provisionally end the token right after it
  advance(lexer);
  mark_end(lexer);

  // Look for up to two more delimiters directly following the first one
  unsigned seen = 1;
  while (seen < 3 && lexer->lookahead == start_char) {
    advance(lexer);
    ++seen;
  }

  const bool tripled = (seen == 3);
  if (tripled) {
    // Extend the token to cover all three delimiter characters
    mark_end(lexer);
  }

  push(state, start_char, tripled);
  return true;
}

/// Scan the body or the closing delimiter of the innermost open string, raw
/// string, or command.
///
/// The delimiter to look for is peeked from the top of the delimiter stack.
/// Scanning stops at the closing delimiter, at a backslash inside a `"`
/// string, or at a `{` inside a command. On success `lexer->result_symbol` is
/// the matching `*_BODY` token, or the matching `*_END` token (with the
/// delimiter popped) when the closing delimiter is found before any content.
///
/// Returns false when no delimiter is open, when nothing was consumed before
/// a backslash or `{`, or when the input ends before a closing delimiter.
bool scan_string_content(TSLexer *lexer, Scanner *state) {
  if (state->len == 0) {
    return false; // Stack is empty. We're not in a string.
  }

  // Peek at the innermost open delimiter. `push` stores a single delimiter as
  // the character itself and a tripled one as twice that value. The three
  // delimiters yield six distinct bytes, so the tripled form is recognised by
  // comparing against the doubled values directly; a `> SCHAR_MAX` test would
  // miss `"` and `'`, whose doubled values are still below 128.
  const uint8_t top = ARR_ITEM(state, state->len - 1);
  bool is_triple = false;
  int32_t end_char = top;

  if (top == '"' * 2 || top == '\'' * 2 || top == '`' * 2) {
    is_triple = true;
    end_char = top / 2;
  }

  TSSymbol end_symbol;
  TSSymbol end_body;

  switch (end_char) {
  case '"':
    end_symbol = STRING_END;
    end_body = STRING_BODY;
    break;
  case '\'':
    end_symbol = RAW_STRING_END;
    end_body = RAW_STRING_BODY;
    break;
  case '`':
    end_symbol = COMMAND_END;
    end_body = COMMAND_BODY;
    break;
  default:
    assert(false);
    // With assertions compiled out, bail instead of using unset symbols
    return false;
  }

  const bool is_str = (end_char == '"');
  const bool is_cmd = (end_char == '`');
  bool has_content = false;

  while (!eof(lexer)) {
    const bool at_escape = is_str && lexer->lookahead == '\\';
    const bool at_interpolation = is_cmd && lexer->lookahead == '{';

    if (at_escape || at_interpolation) {
      // End the body here; whatever follows is not scanned by this function
      mark_end(lexer);
      lexer->result_symbol = end_body;
      return has_content;
    }

    if (lexer->lookahead == end_char) {
      if (is_triple) {
        // Remember where the body ends in case this is the real terminator
        mark_end(lexer);

        for (unsigned count = 1; count < 3; ++count) {
          advance(lexer);

          if (lexer->lookahead != end_char) {
            // Fewer than three delimiters: they are part of the body
            mark_end(lexer);
            lexer->result_symbol = end_body;
            return true;
          }
        }
      }

      if (has_content) {
        // Emit the body first; the closing delimiter is left for the next call
        lexer->result_symbol = end_body;
      } else {
        pop(state);
        advance(lexer);
        mark_end(lexer);
        lexer->result_symbol = end_symbol;
      }

      return true;
    }

    advance(lexer);
    has_content = true;
  }

  return false;
}

// An EOF works as a dedent
bool handle_eof(TSLexer *lexer, Scanner *state, const bool *valid_symbols) {
assert(lexer->eof(lexer));
assert(eof(lexer));
lexer->mark_end(lexer);

if (valid_symbols[DEDENT]) {
Expand All @@ -151,12 +276,27 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *state = (Scanner *)(payload);
int32_t *lookahead = &lexer->lookahead;
bool (*eof)(const TSLexer *) = lexer->eof;

if (eof(lexer)) {
return handle_eof(lexer, state, valid_symbols);
}

if (valid_symbols[STRING_START] && scan_string_start(lexer, state, '"')) {
lexer->result_symbol = STRING_START;
return true;
}

if (valid_symbols[RAW_STRING_START] &&
scan_string_start(lexer, state, '\'')) {
lexer->result_symbol = RAW_STRING_START;
return true;
}

if (valid_symbols[COMMAND_START] && scan_string_start(lexer, state, '`')) {
lexer->result_symbol = COMMAND_START;
return true;
}

// Handle backslash escaping for newlines
if (valid_symbols[NEWLINE]) {
bool escape = false;
Expand All @@ -178,7 +318,7 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer,
}

if (valid_symbols[INDENT] || valid_symbols[DEDENT]) {
while (!lexer->eof(lexer) && isspace(*lookahead)) {
while (!eof(lexer) && isspace(*lookahead)) {
switch (*lookahead) {
case '\n':
return false;
Expand All @@ -190,7 +330,7 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer,
}
}

if (lexer->eof(lexer)) {
if (eof(lexer)) {
return handle_eof(lexer, state, valid_symbols);
}

Expand Down

0 comments on commit f3b88d9

Please sign in to comment.