Skip to content

Commit

Permalink
Everything almost working, just duplicate bodies for backtick escape
Browse files Browse the repository at this point in the history
  • Loading branch information
tgross35 committed Jan 7, 2024
1 parent ac72cf3 commit 4460a10
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 59 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ jobs:
echo '::warning::run `npm run gen` and commit the changes'
fi
- run: npm test
- name: Check C files with args
shell: bash
run: >
find src/ -name '*.c' ! -name "parser.c" |
xargs -IFNAME sh -c
'echo "\nCHECKING FILE FNAME" &&
clang FNAME -c -Wall -Werror --pedantic
-Wno-format-pedantic
-o/dev/null'
static-validation:
runs-on: ubuntu-latest
Expand Down
134 changes: 76 additions & 58 deletions src/scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@
#error "expected assertions to be enabled"
#endif

// #define DEBUG_PRINT
#define DEBUG_PRINT
#ifdef DEBUG_PRINT
#define printdbg(...) printf(__VA_ARGS__)
#define dbg_print(...) \
do { \
printf(" \033[96;1mparse: \033[0m"); \
printf(__VA_ARGS__); \
} while (0)
#else
#define printdbg(...)
#define dbg_print(...)
#endif

#define SCAN_SIZE sizeof(Scanner)
Expand Down Expand Up @@ -59,16 +63,18 @@ typedef struct Scanner {
/// Address of the first byte after the `Scanner` header, i.e. `SCAN_SIZE`
/// bytes past `state`.
char *arr_ptr(Scanner *state) {
  char *base = (char *)state;
  return &base[SCAN_SIZE];
}

void print_arr(Scanner *state, char *cmt) {
printdbg(" %s: ", cmt);
#ifdef DEBUG_PRINT
dbg_print("%s: ", cmt);
for (int i = 0; i < 20; ++i) {
printdbg("%02x ", ((char *)state)[i]);
printf("%02x ", ((char *)state)[i]);
}
printdbg(" \n");
printf("\n");
#endif
}

void push(Scanner *state, char c, IsTriple is_triple) {
printdbg(" pushing %c triple %d len %zu ptr %p\n", c, is_triple,
state->len, ARR_PTR(state));
dbg_print("pushing %c triple %d len %zu ptr %p\n", c, is_triple, state->len,
ARR_PTR(state));
assert(c < SCHAR_MAX);
assert(state->len <= ARR_SIZE);
// print_arr(state, "before push");
Expand All @@ -80,15 +86,22 @@ Delimiter pop(Scanner *state) {
assert(state->len > 0);
// print_arr(state, "before pop ");
Delimiter popped = arr_ptr(state)[--state->len];
printdbg(" popped %c triple %d len %zu ptr %p\n", popped,
popped > SCHAR_MAX, state->len, arr_ptr(state));

bool is_triple = false;
Delimiter pop_tmp = popped;
if (popped > SCHAR_MAX) {
is_triple = true;
pop_tmp -= SCHAR_MAX;
}
dbg_print("popped %c triple %d len %zu ptr %p\n", pop_tmp, is_triple,
state->len, arr_ptr(state));
// print_arr(state, "after pop ");
return popped;
}

/// Initialize our struct. We allocate once and store the array after our
/// struct. This makes serializing and deserializing much easier.
void *tree_sitter_just_external_scanner_create() {
void *tree_sitter_just_external_scanner_create(void) {
char *buf = malloc(TREE_SITTER_SERIALIZATION_BUFFER_SIZE);
assert(buf);

Expand Down Expand Up @@ -128,13 +141,14 @@ void tree_sitter_just_external_scanner_deserialize(void *payload,
}

// Continue and include the preceding character in the token
void advance(TSLexer *lexer) { return lexer->advance(lexer, false); }
void advance(TSLexer *lexer) { lexer->advance(lexer, false); }

// Continue and discard the preceding character
void skip(TSLexer *lexer) { return lexer->advance(lexer, true); }
void skip(TSLexer *lexer) { lexer->advance(lexer, true); }

void mark_end(TSLexer *lexer) { return lexer->mark_end(lexer); }
void mark_end(TSLexer *lexer) { lexer->mark_end(lexer); }
/// Forward to the lexer's own `eof` callback and hand back its answer.
bool eof(TSLexer *lexer) {
  const bool at_end = lexer->eof(lexer);
  return at_end;
}
/// Forward to the lexer's own `get_column` callback and hand back its result.
uint32_t get_column(TSLexer *lexer) {
  const uint32_t column = lexer->get_column(lexer);
  return column;
}

/// Identify the start of a string. Detects the beginning of single or tripled
/// `'`, `"`, and `` ` `` block.
Expand Down Expand Up @@ -171,11 +185,11 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) {
end_char -= SCHAR_MAX;
}

printdbg(" end char %c is triple %d \n", end_char, is_triple);
dbg_print("end char %c is triple %d \n", end_char, is_triple);

bool backslash_escapes = false;
bool brace_escapees = false;
TSSymbol end_symbol;
bool brace_escapes = false;
TSSymbol end_symbol = 0;
TSSymbol end_body = STRING_BODY;

switch (end_char) {
Expand All @@ -188,44 +202,47 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) {
break;
case '`':
end_symbol = COMMAND_END;
brace_escapees = true;
brace_escapes = true;
break;
default:
printdbg(" end char: %c len: %zu\n", end_char, state->len);
dbg_print("end char: %c len: %zu\n", end_char, state->len);

assert(false);
}

bool has_content = false;

while (!eof(lexer)) {
dbg_print("check char '%c' %d\n", lexer->lookahead, get_column(lexer));
if (backslash_escapes && lexer->lookahead == '\\') {
// detected escape, stop parsing
printdbg(" return escape\n");
dbg_print("return escape\n");
mark_end(lexer);
lexer->result_symbol = end_body;
return has_content;
}

if (brace_escapees && lexer->lookahead == '{') {
if (brace_escapes && lexer->lookahead == '{') {
mark_end(lexer);
unsigned braces_count = 0;

unsigned count = 0;
while (lexer->lookahead == '{') {
dbg_print("braces check char '%c' %d count %d\n", lexer->lookahead,
get_column(lexer), braces_count);
++braces_count;
advance(lexer);
if (count == 2) {
mark_end(lexer);
}
++count;
}

if (count >= 2 && count < 4) {
if (braces_count >= 2 && braces_count < 4) {
// 2 or 3 braces start an interpolation; 4 braces escape one
printdbg(" return at the start of interop\n");
dbg_print("return at the start of interop, found %d braces\n",
braces_count);
lexer->result_symbol = end_body;
return has_content;
}

dbg_print("continuing with count at %d\n", braces_count);

// no escape found, continue matching a string
mark_end(lexer);
}
Expand All @@ -238,7 +255,7 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) {
advance(lexer);

if (lexer->lookahead != end_char) {
printdbg(" return found triple");
dbg_print("return found triple\n");
pop(state);
mark_end(lexer);
lexer->result_symbol = end_body;
Expand All @@ -256,14 +273,15 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) {
lexer->result_symbol = end_symbol;
}

printdbg(" return found single");
dbg_print("return found single\n");
return true;
}

advance(lexer);
has_content = true;
}
printdbg(" return nothing");

dbg_print(" return nothing");
return false;
}

Expand Down Expand Up @@ -295,49 +313,48 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *state = (Scanner *)(payload);
int32_t *lookahead = &lexer->lookahead;
printdbg(
" valid indent %d dedent %d newline %d str_start %d str_end %d \
str_body %d raw_start %d raw_end %d cmd_start %d cmd_end %d \n",
valid_symbols[INDENT], valid_symbols[DEDENT], valid_symbols[NEWLINE],
valid_symbols[STRING_START], valid_symbols[STRING_END],
valid_symbols[STRING_BODY], valid_symbols[RAW_STRING_START],
valid_symbols[RAW_STRING_END], valid_symbols[COMMAND_START],
valid_symbols[COMMAND_END]);
dbg_print("valid indent %d dedent %d newline %d str_start %d str_end %d "
"str_body %d raw_start %d raw_end %d cmd_start %d cmd_end %d \n",
valid_symbols[INDENT], valid_symbols[DEDENT],
valid_symbols[NEWLINE], valid_symbols[STRING_START],
valid_symbols[STRING_END], valid_symbols[STRING_BODY],
valid_symbols[RAW_STRING_START], valid_symbols[RAW_STRING_END],
valid_symbols[COMMAND_START], valid_symbols[COMMAND_END]);

if (eof(lexer)) {
return handle_eof(lexer, state, valid_symbols);
}

// Handle backslash escaping for newlines
if (valid_symbols[NEWLINE]) {
printdbg(" newline valid\n");
dbg_print("newline valid\n");
bool escape = false;
if (*lookahead == '\\') {
escape = true;
printdbg(" skip backslash\n");
dbg_print("skip backslash\n");
skip(lexer);
}

bool eol = false;
while (*lookahead == '\n' || *lookahead == '\r') {
eol = true;
printdbg(" skip newline\n");
dbg_print("skip newline\n");
skip(lexer);
}

if (eol && !escape) {
printdbg(" returning newline\n");
dbg_print("returning newline\n");
lexer->result_symbol = NEWLINE;
return true;
}
}

if (valid_symbols[INDENT] || valid_symbols[DEDENT]) {
printdbg(" indent dedent valid\n");
dbg_print("indent dedent valid\n");
while (!eof(lexer) && isspace(*lookahead)) {
switch (*lookahead) {
case '\n':
printdbg(" return false\n");
dbg_print("return false\n");
return false;

case '\t':
Expand All @@ -356,13 +373,13 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer,
state->prev_indent == 0) {
lexer->result_symbol = INDENT;
state->prev_indent = indent;
printdbg(" return indent true\n");
dbg_print("return indent true\n");
return true;
} else if (indent < state->prev_indent && valid_symbols[DEDENT] &&
indent == 0) {
lexer->result_symbol = DEDENT;
state->prev_indent = indent;
printdbg(" return dedent true\n");
dbg_print("return dedent true\n");
return true;
}
}
Expand All @@ -380,37 +397,38 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer,
valid_symbols[RAW_STRING_END] || valid_symbols[COMMAND_END]) &&
scan_string_content(lexer, payload)) {
if (!valid_symbols[lexer->result_symbol]) {
printdbg(
" valid se %d sb %d rse %d ce %d result %d vals %d %d %d "
"%d \n",
valid_symbols[STRING_END], valid_symbols[STRING_BODY],
valid_symbols[RAW_STRING_END], valid_symbols[COMMAND_END],
lexer->result_symbol, STRING_END, STRING_BODY, RAW_STRING_END,
COMMAND_END);
dbg_print(" valid se %d sb %d rse %d ce %d result %d vals %d %d "
"%d %d\n",
valid_symbols[STRING_END], valid_symbols[STRING_BODY],
valid_symbols[RAW_STRING_END], valid_symbols[COMMAND_END],
lexer->result_symbol, STRING_END, STRING_BODY, RAW_STRING_END,
COMMAND_END);
}
// Just check the precondition: `scan_string_content` assumes that either the
// body or the end symbol is valid.
assert(valid_symbols[lexer->result_symbol]);
return true;
}

if (valid_symbols[STRING_START] && scan_string_start(lexer, state, '"')) {
printdbg(" scanned string start\n");
dbg_print("scanned string start\n");
lexer->result_symbol = STRING_START;
return true;
}

if (valid_symbols[RAW_STRING_START] &&
scan_string_start(lexer, state, '\'')) {
printdbg(" scanned raw start\n");
dbg_print("scanned raw start\n");
lexer->result_symbol = RAW_STRING_START;
return true;
}

if (valid_symbols[COMMAND_START] && scan_string_start(lexer, state, '`')) {
printdbg(" scanned cmd start\n");
dbg_print("scanned cmd start\n");
lexer->result_symbol = COMMAND_START;
return true;
}

printdbg(" returning false\n");
dbg_print("returning false\n");
return false;
}
1 change: 0 additions & 1 deletion test/corpus/injections.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ foo := `echo "foo {{{{ bar }}"`
(expression
(value
(external_command
(command_body)
(command_body))))
(eol)))
(item
Expand Down

0 comments on commit 4460a10

Please sign in to comment.