From 4460a10a9d307d85632225e02bd72541b7842d94 Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Sat, 6 Jan 2024 19:44:44 -0600 Subject: [PATCH] Everything almost working, just duplicate bodies for backtick escape --- .github/workflows/ci.yaml | 9 +++ src/scanner.c | 134 +++++++++++++++++++++---------------- test/corpus/injections.txt | 1 - 3 files changed, 85 insertions(+), 59 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b313105..17ad524 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -42,6 +42,15 @@ jobs: echo '::warning::run `npm run gen` and commit the changes' fi - run: npm test + - name: Check C files with args + shell: bash + run: > + find src/ -name '*.c' ! -name "parser.c" | + xargs -IFNAME sh -c + 'echo "\nCHECKING FILE FNAME" && + clang FNAME -c -Wall -Werror --pedantic + -Wno-format-pedantic + -o/dev/null' static-validation: runs-on: ubuntu-latest diff --git a/src/scanner.c b/src/scanner.c index c5eca1a..8f98d1f 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -9,11 +9,15 @@ #error "expected assertions to be enabled" #endif -// #define DEBUG_PRINT +#define DEBUG_PRINT #ifdef DEBUG_PRINT -#define printdbg(...) printf(__VA_ARGS__) +#define dbg_print(...) \ + do { \ + printf(" \033[96;1mparse: \033[0m"); \ + printf(__VA_ARGS__); \ + } while (0) #else -#define printdbg(...) +#define dbg_print(...) #endif #define SCAN_SIZE sizeof(Scanner) @@ -59,16 +63,18 @@ typedef struct Scanner { char *arr_ptr(Scanner *state) { return ((char *)state + SCAN_SIZE); } void print_arr(Scanner *state, char *cmt) { - printdbg(" %s: ", cmt); +#ifdef DEBUG_PRINT + dbg_print("%s: ", cmt); for (int i = 0; i < 20; ++i) { - printdbg("%02x ", ((char *)state)[i]); + printf("%02x ", ((char *)state)[i]); } - printdbg(" \n"); + printf("\n"); +#endif } void push(Scanner *state, char c, IsTriple is_triple) { - printdbg(" pushing %c triple %d len %zu ptr %p\n", c, is_triple, - state->len, ARR_PTR(state)); + dbg_print("pushing %c triple %d len %zu ptr %p\n", c, is_triple, state->len, + ARR_PTR(state)); assert(c < SCHAR_MAX); assert(state->len <= ARR_SIZE); // print_arr(state, "before push"); @@ -80,15 +86,22 @@ Delimiter pop(Scanner *state) { assert(state->len > 0); // print_arr(state, "before pop "); Delimiter popped = arr_ptr(state)[--state->len]; - printdbg(" popped %c triple %d len %zu ptr %p\n", popped, - popped > SCHAR_MAX, state->len, arr_ptr(state)); + + bool is_triple = false; + Delimiter pop_tmp = popped; + if (popped > SCHAR_MAX) { + is_triple = true; + pop_tmp -= SCHAR_MAX; + } + dbg_print("popped %c triple %d len %zu ptr %p\n", pop_tmp, is_triple, + state->len, arr_ptr(state)); // print_arr(state, "after pop "); return popped; } /// Initialize our struct. We allocate once and store the array after our /// struct. This makes serializing and deserializing much easier. -void *tree_sitter_just_external_scanner_create() { +void *tree_sitter_just_external_scanner_create(void) { char *buf = malloc(TREE_SITTER_SERIALIZATION_BUFFER_SIZE); assert(buf); @@ -128,13 +141,14 @@ void tree_sitter_just_external_scanner_deserialize(void *payload, } // Continue and include the preceding character in the token -void advance(TSLexer *lexer) { return lexer->advance(lexer, false); } +void advance(TSLexer *lexer) { lexer->advance(lexer, false); } // Continue and discard the preceding character -void skip(TSLexer *lexer) { return lexer->advance(lexer, true); } +void skip(TSLexer *lexer) { lexer->advance(lexer, true); } -void mark_end(TSLexer *lexer) { return lexer->mark_end(lexer); } +void mark_end(TSLexer *lexer) { lexer->mark_end(lexer); } bool eof(TSLexer *lexer) { return lexer->eof(lexer); } +uint32_t get_column(TSLexer *lexer) { return lexer->get_column(lexer); } /// Identify the start of a string. Detects the beginning of single or tripled /// `'`, `"`, and `` ` `` block. @@ -171,11 +185,11 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) { end_char -= SCHAR_MAX; } - printdbg(" end char %c is triple %d \n", end_char, is_triple); + dbg_print("end char %c is triple %d \n", end_char, is_triple); bool backslash_escapes = false; - bool brace_escapees = false; - TSSymbol end_symbol; + bool brace_escapes = false; + TSSymbol end_symbol = 0; TSSymbol end_body = STRING_BODY; switch (end_char) { @@ -188,10 +202,10 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) { break; case '`': end_symbol = COMMAND_END; - brace_escapees = true; + brace_escapes = true; break; default: - printdbg(" end char: %c len: %zu\n", end_char, state->len); + dbg_print("end char: %c len: %zu\n", end_char, state->len); assert(false); } @@ -199,33 +213,36 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) { bool has_content = false; while (!eof(lexer)) { + dbg_print("check char '%c' %d\n", lexer->lookahead, get_column(lexer)); if (backslash_escapes && lexer->lookahead == '\\') { // detected escape, stop parsing - printdbg(" return escape\n"); + dbg_print("return escape\n"); mark_end(lexer); lexer->result_symbol = end_body; return has_content; } - if (brace_escapees && lexer->lookahead == '{') { + if (brace_escapes && lexer->lookahead == '{') { mark_end(lexer); + unsigned braces_count = 0; - unsigned count = 0; while (lexer->lookahead == '{') { + dbg_print("braces check char '%c' %d count %d\n", lexer->lookahead, + get_column(lexer), braces_count); + ++braces_count; advance(lexer); - if (count == 2) { - mark_end(lexer); - } - ++count; } - if (count >= 2 && count < 4) { + if (braces_count >= 2 && braces_count < 4) { // 2 brackets make an interpolation, 4 escapes one - printdbg(" return at the start of interop\n"); + dbg_print("return at the start of interop, found %d braces\n", + braces_count); lexer->result_symbol = end_body; return has_content; } + dbg_print("continuing with count at %d\n", braces_count); + // no escape found, continue matching a string mark_end(lexer); } @@ -238,7 +255,7 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) { advance(lexer); if (lexer->lookahead != end_char) { - printdbg(" return found triple"); + dbg_print("return found triple\n"); pop(state); mark_end(lexer); lexer->result_symbol = end_body; @@ -256,14 +273,15 @@ bool scan_string_content(TSLexer *lexer, Scanner *state) { lexer->result_symbol = end_symbol; } - printdbg(" return found single"); + dbg_print("return found single\n"); return true; } advance(lexer); has_content = true; } - printdbg(" return nothing"); + + dbg_print(" return nothing"); return false; } @@ -295,14 +313,13 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { Scanner *state = (Scanner *)(payload); int32_t *lookahead = &lexer->lookahead; - printdbg( - " valid indent %d dedent %d newline %d str_start %d str_end %d \ - str_body %d raw_start %d raw_end %d cmd_start %d cmd_end %d \n", - valid_symbols[INDENT], valid_symbols[DEDENT], valid_symbols[NEWLINE], - valid_symbols[STRING_START], valid_symbols[STRING_END], - valid_symbols[STRING_BODY], valid_symbols[RAW_STRING_START], - valid_symbols[RAW_STRING_END], valid_symbols[COMMAND_START], - valid_symbols[COMMAND_END]); + dbg_print("valid indent %d dedent %d newline %d str_start %d str_end %d " + "str_body %d raw_start %d raw_end %d cmd_start %d cmd_end %d \n", + valid_symbols[INDENT], valid_symbols[DEDENT], + valid_symbols[NEWLINE], valid_symbols[STRING_START], + valid_symbols[STRING_END], valid_symbols[STRING_BODY], + valid_symbols[RAW_STRING_START], valid_symbols[RAW_STRING_END], + valid_symbols[COMMAND_START], valid_symbols[COMMAND_END]); if (eof(lexer)) { return handle_eof(lexer, state, valid_symbols); @@ -310,34 +327,34 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer, // Handle backslash escaping for newlines if (valid_symbols[NEWLINE]) { - printdbg(" newline valid\n"); + dbg_print("newline valid\n"); bool escape = false; if (*lookahead == '\\') { escape = true; - printdbg(" skip backslash\n"); + dbg_print("skip backslash\n"); skip(lexer); } bool eol = false; while (*lookahead == '\n' || *lookahead == '\r') { eol = true; - printdbg(" skip newline\n"); + dbg_print("skip newline\n"); skip(lexer); } if (eol && !escape) { - printdbg(" returning newline\n"); + dbg_print("returning newline\n"); lexer->result_symbol = NEWLINE; return true; } } if (valid_symbols[INDENT] || valid_symbols[DEDENT]) { - printdbg(" indent dedent valid\n"); + dbg_print("indent dedent valid\n"); while (!eof(lexer) && isspace(*lookahead)) { switch (*lookahead) { case '\n': - printdbg(" return false\n"); + dbg_print("return false\n"); return false; case '\t': @@ -356,13 +373,13 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer, state->prev_indent == 0) { lexer->result_symbol = INDENT; state->prev_indent = indent; - printdbg(" return indent true\n"); + dbg_print("return indent true\n"); return true; } else if (indent < state->prev_indent && valid_symbols[DEDENT] && indent == 0) { lexer->result_symbol = DEDENT; state->prev_indent = indent; - printdbg(" return dedent true\n"); + dbg_print("return dedent true\n"); return true; } } @@ -380,37 +397,38 @@ bool tree_sitter_just_external_scanner_scan(void *payload, TSLexer *lexer, valid_symbols[RAW_STRING_END] || valid_symbols[COMMAND_END]) && scan_string_content(lexer, payload)) { if (!valid_symbols[lexer->result_symbol]) { - printdbg( - " valid se %d sb %d rse %d ce %d result %d vals %d %d %d " - "%d \n", - valid_symbols[STRING_END], valid_symbols[STRING_BODY], - valid_symbols[RAW_STRING_END], valid_symbols[COMMAND_END], - lexer->result_symbol, STRING_END, STRING_BODY, RAW_STRING_END, - COMMAND_END); + dbg_print(" valid se %d sb %d rse %d ce %d result %d vals %d %d " + "%d %d\n", + valid_symbols[STRING_END], valid_symbols[STRING_BODY], + valid_symbols[RAW_STRING_END], valid_symbols[COMMAND_END], + lexer->result_symbol, STRING_END, STRING_BODY, RAW_STRING_END, + COMMAND_END); } + // Just check the precondition, since `scan_string_content` always assumes + // either body or end is always valid. assert(valid_symbols[lexer->result_symbol]); return true; } if (valid_symbols[STRING_START] && scan_string_start(lexer, state, '"')) { - printdbg(" scanned string start\n"); + dbg_print("scanned string start\n"); lexer->result_symbol = STRING_START; return true; } if (valid_symbols[RAW_STRING_START] && scan_string_start(lexer, state, '\'')) { - printdbg(" scanned raw start\n"); + dbg_print("scanned raw start\n"); lexer->result_symbol = RAW_STRING_START; return true; } if (valid_symbols[COMMAND_START] && scan_string_start(lexer, state, '`')) { - printdbg(" scanned cmd start\n"); + dbg_print("scanned cmd start\n"); lexer->result_symbol = COMMAND_START; return true; } - printdbg(" returning false\n"); + dbg_print("returning false\n"); return false; } diff --git a/test/corpus/injections.txt b/test/corpus/injections.txt index 18ed623..2baa57e 100644 --- a/test/corpus/injections.txt +++ b/test/corpus/injections.txt @@ -42,7 +42,6 @@ foo := `echo "foo {{{{ bar }}"` (expression (value (external_command - (command_body) (command_body)))) (eol))) (item