From 884d18b0a4ab727bff51822b7e255af8789c4118 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 20 Jun 2024 14:58:59 +0200 Subject: [PATCH] Address review comments --- lark/lark.py | 4 ++-- lark/lexer.py | 12 ++++++------ lark/parser_frontends.py | 17 +++++++++++------ lark/tools/standalone.py | 2 +- tests/test_parser.py | 2 ++ tests/test_scan.py | 35 +++++++++++++++++++++++++++++++++++ 6 files changed, 57 insertions(+), 15 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 9d897ffb..8b2ba356 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -16,7 +16,7 @@ from typing import Literal else: from typing_extensions import Literal - from .parser_frontends import ParsingFrontend + from .parser_frontends import ParsingFrontend, ScanMatch from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger @@ -661,7 +661,7 @@ def parse(self, text: str, start: Optional[str] = None, return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos) def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]: + end_pos: Optional[int] = None) -> Iterable['ScanMatch']: """ Scans the input text for non-overlapping matches of the rule specified by 'start' and yields the start and end position as well as the resulting tree. diff --git a/lark/lexer.py b/lark/lexer.py index e4aa28e2..4a574b70 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -404,17 +404,17 @@ def match(self, text, pos, *, end_pos=sys.maxsize): return m.group(0), m.lastgroup def search(self, text, start_pos, end_pos): - best = None, float("inf") + best = None for mre in self._mres: mre: re.Pattern m = mre.search(text, start_pos, end_pos) if m: - if m.start() < best[1]: - best = (m.group(0), m.lastgroup), m.start() - if best[0] is None: - return None - else: + if best is None or m.start() < best.start(): + best = m + if best is None: return best + else: + return (best.group(0), best.lastgroup), best.start() def _regexp_has_newline(r: str): diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 0712ccfa..fd582e1a 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING +from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, NamedTuple, Iterable, Tuple from .exceptions import ConfigurationError, GrammarError, assert_config, UnexpectedInput from .utils import get_regexp_width, Serialize @@ -14,6 +14,12 @@ ###{standalone + +class ScanMatch(NamedTuple): + range: Tuple[int, int] + tree: Tree + + def _wrap_lexer(lexer_class): future_interface = getattr(lexer_class, '__future_interface__', False) if future_interface: @@ -128,13 +134,13 @@ def parse_interactive(self, text: Optional[str]=None, start=None, def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None): + end_pos: Optional[int] = None) -> Iterable[ScanMatch]: """ In contrast to the other functions here, this one actually does work. See `Lark.scan` for a description of what this function is for. """ if self.options.parser != 'lalr': - raise ValueError("scan requires parser='lalr' and lexer='contextual'") + raise ValueError("scan requires parser='lalr'") start_states = self.parser._parse_table.start_states chosen_start = self._verify_start(start) start_state = start_states[chosen_start] @@ -143,8 +149,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] if pos < 0: pos += len(text) if end_pos < 0: - pos += len(text) - del start_pos + end_pos += len(text) while True: # Find the next candidate location found = self.lexer.search_start(text, start_state, pos, end_pos) @@ -175,7 +180,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] except UnexpectedInput: continue else: - yield ((found.start_pos, last.end_pos), res) + yield ScanMatch((found.start_pos, last.end_pos), res) pos = last.end_pos break else: diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 9940ccbf..92b9cf9a 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -30,7 +30,7 @@ from typing import ( TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, Union, Iterable, IO, TYPE_CHECKING, overload, Sequence, - Pattern as REPattern, ClassVar, Set, Mapping + Pattern as REPattern, ClassVar, Set, Mapping, NamedTuple ) ###} diff --git a/tests/test_parser.py b/tests/test_parser.py index a4627134..c0437830 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2599,6 +2599,8 @@ def test_subset_parse(self): parser = _Lark(grammar) self.assertEqual(parser.parse(" abc def ", start_pos=1, end_pos=-1), Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + self.assertEqual(parser.parse(" abc def ", start_pos=1-9, end_pos=-1+9), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) self.assertEqual(parser.parse("xabc def ", start_pos=1, end_pos=-1), Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')])) diff --git a/tests/test_scan.py b/tests/test_scan.py index 830c1d1f..cbfaf5de 100644 --- a/tests/test_scan.py +++ b/tests/test_scan.py @@ -18,6 +18,20 @@ def test_scan(self): ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])), ]) + def test_scan_basic_lexer(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore / +/ + WORD: /\w+/ + """, parser='lalr', start="expr", lexer='basic') + + text = "|() | (a) | ((//)) | (c ((d))) |" + finds = list(parser.scan(text)) + self.assertEqual(finds, [((1, 3), Tree('expr', [])), + ((6, 9), Tree('expr', ['a'])), + ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])), + ]) + def test_scan_meta(self): parser = Lark(r""" expr: "(" (WORD|expr)* ")" @@ -70,3 +84,24 @@ def test_scan_backtrack(self): ((15, 18), Tree('start', [Tree('expr', ['e'])])), ((22, 25), Tree('start', [Tree('expr', ['f'])])), ]) + + def test_scan_subset(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore /\s+/ + WORD: /\w+/ + """, parser='lalr', start="expr", propagate_positions=True) + + text = "()\n()(a)\n(b)\n (\n) | \n(\n)" + finds = list(parser.scan(text, start_pos=5, end_pos=-1)) + self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), + ((9, 12), Tree('expr', ['b'])), + ((14, 17), Tree('expr', []))]) + self.assertEqual(2, finds[0][1].meta.line) + + text = "()\n()(a)\n(b)\n (\n) | \n(\n)" + finds = list(parser.scan(text, start_pos=5-len(text), end_pos=-1+len(text))) + self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), + ((9, 12), Tree('expr', ['b'])), + ((14, 17), Tree('expr', []))]) + self.assertEqual(2, finds[0][1].meta.line)