Skip to content

Commit

Permalink
Address review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
MegaIng committed Jun 20, 2024
1 parent ca0cd55 commit 884d18b
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 15 deletions.
4 changes: 2 additions & 2 deletions lark/lark.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from typing import Literal
else:
from typing_extensions import Literal
from .parser_frontends import ParsingFrontend
from .parser_frontends import ParsingFrontend, ScanMatch

from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, isascii, logger
Expand Down Expand Up @@ -661,7 +661,7 @@ def parse(self, text: str, start: Optional[str] = None,
return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos)

def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None,
end_pos: Optional[int] = None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]:
end_pos: Optional[int] = None) -> Iterable['ScanMatch']:
"""
Scans the input text for non-overlapping matches of the rule specified by 'start' and
yields the start and end position as well as the resulting tree.
Expand Down
12 changes: 6 additions & 6 deletions lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,17 +404,17 @@ def match(self, text, pos, *, end_pos=sys.maxsize):
return m.group(0), m.lastgroup

def search(self, text, start_pos, end_pos):
    """Find the earliest match of any compiled pattern inside a slice of ``text``.

    Every pattern in ``self._mres`` is searched between ``start_pos`` and
    ``end_pos`` and the match with the smallest start index is kept.

    Parameters:
        text: the string to search.
        start_pos: index at which each pattern starts searching.
        end_pos: index at which each pattern stops searching.

    Returns:
        ``((matched_text, lastgroup), match_start)`` for the earliest match,
        where ``lastgroup`` is the match object's ``lastgroup`` attribute and
        ``match_start`` is an index into ``text``; ``None`` when no pattern
        matches in the given range.
    """
    earliest = None
    for pattern in self._mres:
        found = pattern.search(text, start_pos, end_pos)
        if found is None:
            continue
        # Strict '<' means that on equal start positions the pattern that
        # comes first in self._mres wins.
        if earliest is None or found.start() < earliest.start():
            earliest = found
    if earliest is None:
        return None
    return (earliest.group(0), earliest.lastgroup), earliest.start()


def _regexp_has_newline(r: str):
Expand Down
17 changes: 11 additions & 6 deletions lark/parser_frontends.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, NamedTuple, Iterable, Tuple

from .exceptions import ConfigurationError, GrammarError, assert_config, UnexpectedInput
from .utils import get_regexp_width, Serialize
Expand All @@ -14,6 +14,12 @@

###{standalone


class ScanMatch(NamedTuple):
    """One result yielded by ``scan``: where a match was found and what it parsed to.

    Being a NamedTuple, it still compares equal to a plain
    ``((start, end), tree)`` tuple and can be unpacked like one.
    """
    # (start, end) positions of the match in the scanned text; scan() builds it
    # from the first token's start_pos and the last token's end_pos.
    range: Tuple[int, int]
    # Parse result for the matched span.
    tree: Tree


def _wrap_lexer(lexer_class):
future_interface = getattr(lexer_class, '__future_interface__', False)
if future_interface:
Expand Down Expand Up @@ -128,13 +134,13 @@ def parse_interactive(self, text: Optional[str]=None, start=None,


def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] = None,
end_pos: Optional[int] = None):
end_pos: Optional[int] = None) -> Iterable[ScanMatch]:
"""
In contrast to the other functions here, this one actually does work. See `Lark.scan`
for a description of what this function is for.
"""
if self.options.parser != 'lalr':
raise ValueError("scan requires parser='lalr' and lexer='contextual'")
raise ValueError("scan requires parser='lalr'")
start_states = self.parser._parse_table.start_states
chosen_start = self._verify_start(start)
start_state = start_states[chosen_start]
Expand All @@ -143,8 +149,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int]
if pos < 0:
pos += len(text)
if end_pos < 0:
pos += len(text)
del start_pos
end_pos += len(text)
while True:
# Find the next candidate location
found = self.lexer.search_start(text, start_state, pos, end_pos)
Expand Down Expand Up @@ -175,7 +180,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int]
except UnexpectedInput:
continue
else:
yield ((found.start_pos, last.end_pos), res)
yield ScanMatch((found.start_pos, last.end_pos), res)
pos = last.end_pos
break
else:
Expand Down
2 changes: 1 addition & 1 deletion lark/tools/standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from typing import (
TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
Union, Iterable, IO, TYPE_CHECKING, overload, Sequence,
Pattern as REPattern, ClassVar, Set, Mapping
Pattern as REPattern, ClassVar, Set, Mapping, NamedTuple
)
###}

Expand Down
2 changes: 2 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2599,6 +2599,8 @@ def test_subset_parse(self):
parser = _Lark(grammar)
self.assertEqual(parser.parse(" abc def ", start_pos=1, end_pos=-1),
Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')]))
self.assertEqual(parser.parse(" abc def ", start_pos=1-9, end_pos=-1+9),
Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')]))
self.assertEqual(parser.parse("xabc def ", start_pos=1, end_pos=-1),
Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')]))

Expand Down
35 changes: 35 additions & 0 deletions tests/test_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ def test_scan(self):
((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])),
])

def test_scan_basic_lexer(self):
    """``scan`` with ``parser='lalr'`` and ``lexer='basic'`` finds each parsable ``expr``.

    The ``((//))`` segment is expected to produce no match.
    """
    grammar = r"""
expr: "(" (WORD|expr)* ")"
%ignore / +/
WORD: /\w+/
"""
    scanner = Lark(grammar, parser='lalr', start="expr", lexer='basic')

    haystack = "|() | (a) | ((//)) | (c ((d))) |"
    expected = [
        ((1, 3), Tree('expr', [])),
        ((6, 9), Tree('expr', ['a'])),
        ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])),
    ]
    self.assertEqual(list(scanner.scan(haystack)), expected)

def test_scan_meta(self):
parser = Lark(r"""
expr: "(" (WORD|expr)* ")"
Expand Down Expand Up @@ -70,3 +84,24 @@ def test_scan_backtrack(self):
((15, 18), Tree('start', [Tree('expr', ['e'])])),
((22, 25), Tree('start', [Tree('expr', ['f'])])),
])

def test_scan_subset(self):
    """``scan`` honours ``start_pos``/``end_pos``, including negative values.

    The same window is given twice: once with a negative ``end_pos`` and
    once with a negative ``start_pos``. Both must yield the same matches,
    and the first match must report line 2, i.e. its line in the full text.
    """
    grammar = r"""
expr: "(" (WORD|expr)* ")"
%ignore /\s+/
WORD: /\w+/
"""
    scanner = Lark(grammar, parser='lalr', start="expr", propagate_positions=True)

    haystack = "()\n()(a)\n(b)\n (\n) | \n(\n)"
    expected = [
        ((5, 8), Tree('expr', ['a'])),
        ((9, 12), Tree('expr', ['b'])),
        ((14, 17), Tree('expr', [])),
    ]
    # Two spellings of the same window over the text.
    windows = (
        (5, -1),
        (5 - len(haystack), -1 + len(haystack)),
    )
    for window_start, window_end in windows:
        matches = list(scanner.scan(haystack, start_pos=window_start, end_pos=window_end))
        self.assertEqual(matches, expected)
        self.assertEqual(2, matches[0][1].meta.line)

0 comments on commit 884d18b

Please sign in to comment.