From 9a04980accfe29a708e738376af50b96d8b562b7 Mon Sep 17 00:00:00 2001 From: Alan Zimmerman Date: Wed, 3 Jul 2024 03:34:15 -0700 Subject: [PATCH] Improve white space matching in external scanner Summary: Factor out the `is_whitespace` test, and match the one used in `grammar.js` Reviewed By: jcpetruzza Differential Revision: D59276662 fbshipit-source-id: 49b4f10d71e61b9b80f051fc7d27e46b0bd678d2 --- grammar.js | 3 ++- src/scanner.c | 18 +++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/grammar.js b/grammar.js index 917fc8b..27188f5 100644 --- a/grammar.js +++ b/grammar.js @@ -114,7 +114,8 @@ module.exports = grammar({ extras: $ => [ // $.whitespace causes issues with error recovery, - // emulate it manually when traversing the tree + // emulate it manually when traversing the tree. + // The regexp is based on the ?WHITE_SPACE/1 macro in elp_scan.erl /[\x01-\x20\x80-\xA0]/, $.comment, ], diff --git a/src/scanner.c b/src/scanner.c index 303d55f..092899a 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -62,6 +62,14 @@ static inline void skip(TSLexer* lexer) { /* fprintf(stderr, "Scanner lookahead: '%c'.\n", lexer->lookahead); */ /* } */ +static inline bool is_whitespace(TSLexer* lexer) { + return ( + /* The test is based on the ?WHITE_SPACE/1 macro in + elp_scan.erl, and matches the one in grammar.js */ + (lexer->lookahead >= 0x01 && lexer->lookahead <= 0x20) || + (lexer->lookahead >= 0x80 && lexer->lookahead <= 0xA0)); +} + bool tree_sitter_erlang_external_scanner_scan( void* unused_payload, TSLexer* lexer, @@ -70,9 +78,7 @@ bool tree_sitter_erlang_external_scanner_scan( if (valid_symbols[TQ_STRING] || valid_symbols[TQ_SIGIL_STRING]) { /* Skip any leading whitespace */ - while (lexer->lookahead == ' ' || lexer->lookahead == '\t' || - lexer->lookahead == '\f' || lexer->lookahead == '\r' || - lexer->lookahead == '\n') { + while (is_whitespace(lexer)) { skip(lexer); } bool is_sigil_string = false; @@ -112,8 +118,7 @@ bool tree_sitter_erlang_external_scanner_scan( advance(lexer); } /* skip whitespace to end of line */ - while (lexer->lookahead == ' ' || lexer->lookahead == '\t' || - lexer->lookahead == '\f' || lexer->lookahead == '\r') { + while (lexer->lookahead != '\n' && is_whitespace(lexer)) { advance(lexer); } @@ -129,8 +134,7 @@ bool tree_sitter_erlang_external_scanner_scan( if (lexer->lookahead == '\n') { advance(lexer); /* skip whitespace to first '"' */ - while (lexer->lookahead == ' ' || lexer->lookahead == '\t' || - lexer->lookahead == '\f' || lexer->lookahead == '\r') { + while (lexer->lookahead != '\n' && is_whitespace(lexer)) { advance(lexer); }