From 26c002b97a44cf1baac0c2e234062a00fb8efc09 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 19 Aug 2024 10:37:35 +0200 Subject: [PATCH] Added TextSlice; Lark can now parse/lex a text-slice Based on previous PR by MegaIng --- lark/__init__.py | 3 +- lark/lark.py | 8 +++--- lark/lexer.py | 53 ++++++++++++++++++++++------------- lark/parser_frontends.py | 43 ++++++++++++++++++++-------- lark/parsers/lalr_parser.py | 2 +- lark/utils.py | 46 ++++++++++++++++++++++++++++++ tests/test_lexer.py | 17 +++++++++-- tests/test_parser.py | 56 +++++++++++++++++++++++++++++++++++-- 8 files changed, 187 insertions(+), 41 deletions(-) diff --git a/lark/__init__.py b/lark/__init__.py index d22cc2d9c..5b150b614 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -11,7 +11,7 @@ from .lark import Lark from .lexer import Token from .tree import ParseTree, Tree -from .utils import logger +from .utils import logger, TextSlice from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args __version__: str = "1.2.2" @@ -33,6 +33,7 @@ "Discard", "Transformer", "Transformer_NonRecursive", + "TextSlice", "Visitor", "v_args", ) diff --git a/lark/lark.py b/lark/lark.py index 7ae1f2404..0f3caf911 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -16,7 +16,7 @@ from .parser_frontends import ParsingFrontend from .exceptions import ConfigurationError, assert_config, UnexpectedInput -from .utils import Serialize, SerializeMemoizer, FS, logger +from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest from .tree import Tree from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType @@ -598,7 +598,7 @@ def __repr__(self): return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) - def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]: + def lex(self, text: TextOrSlice, dont_ignore: bool=False) -> Iterator[Token]: """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. @@ -620,7 +620,7 @@ def get_terminal(self, name: str) -> TerminalDef: """Get information about a terminal""" return self._terminals_dict[name] - def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser': + def parse_interactive(self, text: Optional[TextOrSlice]=None, start: Optional[str]=None) -> 'InteractiveParser': """Start an interactive parsing session. Parameters: @@ -634,7 +634,7 @@ def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) """ return self.parser.parse_interactive(text, start=start) - def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree': + def parse(self, text: TextOrSlice, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree': """Parse the given text, according to the options provided. 
Parameters: diff --git a/lark/lexer.py b/lark/lexer.py index 24695772a..59d9acfd1 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -17,7 +17,7 @@ from .common import LexerConf from .parsers.lalr_parser_state import ParserState -from .utils import classify, get_regexp_width, Serialize, logger +from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken from .grammar import TOKEN_DEFAULT_PRIORITY @@ -289,7 +289,7 @@ def __eq__(self, other): return self.char_pos == other.char_pos and self.newline_char == other.newline_char - def feed(self, token: Token, test_newline=True): + def feed(self, token: TextOrSlice, test_newline=True): """Consume a token and calculate the new line & column. As an optional optimization, set test_newline=False if token doesn't contain a newline. @@ -382,9 +382,9 @@ def _build_mres(self, terminals, max_size): terminals = terminals[max_size:] return mres - def match(self, text, pos): + def match(self, text: TextSlice, pos): for mre in self._mres: - m = mre.match(text, pos) + m = mre.match(text.text, pos, text.end) if m: return m.group(0), m.lastgroup @@ -394,6 +394,7 @@ def fullmatch(self, text: str) -> Optional[str]: m = mre.fullmatch(text) if m: return m.lastgroup + return None def _regexp_has_newline(r: str): r"""Expressions that may indicate newlines in a regexp: @@ -413,20 +414,31 @@ class LexerState: __slots__ = 'text', 'line_ctr', 'last_token' - text: str + text: TextSlice line_ctr: LineCounter last_token: Optional[Token] - def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None): + def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None): + if line_ctr is None: + line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n') + + if text.start > 0: + # Advance the line-count until line_ctr.char_pos == text.start + line_ctr.feed(TextSlice(text.text, 0, text.start)) + + if not (text.start <= line_ctr.char_pos <= text.end): + raise ValueError("LineCounter.char_pos is out of bounds") + self.text = text - self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n') + self.line_ctr = line_ctr self.last_token = last_token + def __eq__(self, other): if not isinstance(other, LexerState): return NotImplemented - return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token + return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token def __copy__(self): return type(self)(self.text, copy(self.line_ctr), self.last_token) @@ -436,15 +448,18 @@ class LexerThread: """A thread that ties a lexer instance and a lexer state, to be used by the parser """ - def __init__(self, lexer: 'Lexer', lexer_state: LexerState): + def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]): self.lexer = lexer self.state = lexer_state @classmethod - def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread': + def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread': + text = TextSlice.cast_from(text_or_slice) return cls(lexer, LexerState(text)) def lex(self, parser_state): + if self.state is None: + raise TypeError("Cannot lex: No text assigned to lexer state") return self.lexer.lex(self.state, parser_state) def __copy__(self): @@ -465,9 +480,9 @@ class Lexer(ABC): def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: 
return NotImplemented - def make_lexer_state(self, text): + def make_lexer_state(self, text: str): "Deprecated" - return LexerState(text) + return LexerState(TextSlice.cast_from(text)) def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8): @@ -567,9 +582,9 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None: self.use_bytes = conf.use_bytes self.terminals_by_name = conf.terminals_by_name - self._scanner = None + self._scanner: Optional[Scanner] = None - def _build_scanner(self): + def _build_scanner(self) -> Scanner: terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) assert all(self.callback.values()) @@ -580,12 +595,12 @@ def _build_scanner(self): else: self.callback[type_] = f - self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) + return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) @property - def scanner(self): + def scanner(self) -> Scanner: if self._scanner is None: - self._build_scanner() + self._scanner = self._build_scanner() return self._scanner def match(self, text, pos): @@ -593,13 +608,13 @@ def match(self, text, pos): def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: line_ctr = lex_state.line_ctr - while line_ctr.char_pos < len(lex_state.text): + while line_ctr.char_pos < lex_state.text.end: res = self.match(lex_state.text, line_ctr.char_pos) if not res: allowed = self.scanner.allowed_types - self.ignore_types if not allowed: allowed = {""} - raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, + raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], state=parser_state, terminals_by_name=self.terminals_by_name) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 186058a6b..bfe4eba98 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,7 +1,7 @@ from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING from .exceptions import ConfigurationError, GrammarError, assert_config -from .utils import get_regexp_width, Serialize +from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer from .parsers import earley, xearley, cyk from .parsers.lalr_parser import LALR_Parser @@ -15,16 +15,31 @@ ###{standalone def _wrap_lexer(lexer_class): - future_interface = getattr(lexer_class, '__future_interface__', False) - if future_interface: + future_interface = getattr(lexer_class, '__future_interface__', 0) + if future_interface == 2: return lexer_class - else: - class CustomLexerWrapper(Lexer): + elif future_interface == 1: + class CustomLexerWrapper1(Lexer): + def __init__(self, lexer_conf): + self.lexer = lexer_class(lexer_conf) + def lex(self, lexer_state, parser_state): + if not lexer_state.text.is_complete_text(): + raise TypeError("Interface=1 Custom Lexer don't support TextSlice") + lexer_state.text = lexer_state.text + return self.lexer.lex(lexer_state, parser_state) + return CustomLexerWrapper1 + elif future_interface == 0: + class CustomLexerWrapper0(Lexer): def __init__(self, lexer_conf): self.lexer = lexer_class(lexer_conf) + def lex(self, lexer_state, parser_state): - return self.lexer.lex(lexer_state.text) - return CustomLexerWrapper + if not 
lexer_state.text.is_complete_text(): + raise TypeError("Interface=0 Custom Lexer don't support TextSlice") + return self.lexer.lex(lexer_state.text.text) + return CustomLexerWrapper0 + else: + raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected") def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options): @@ -93,23 +108,27 @@ def _verify_start(self, start=None): raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) return start - def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]: + def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]: cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread - return text if self.skip_lexer else cls.from_text(self.lexer, text) + return text if self.skip_lexer else cls(self.lexer, None) if text is None else cls.from_text(self.lexer, text) + + def parse(self, text: Optional[TextOrSlice], start=None, on_error=None): + if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"): + if isinstance(text, TextSlice) and not text.is_complete_text(): + raise TypeError(f"Lexer {self.lexer_conf.lexer_type} does not support text slices.") - def parse(self, text: str, start=None, on_error=None): chosen_start = self._verify_start(start) kw = {} if on_error is None else {'on_error': on_error} stream = self._make_lexer_thread(text) return self.parser.parse(stream, chosen_start, **kw) - def parse_interactive(self, text: Optional[str]=None, start=None): + def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None): # TODO BREAK - Change text from Optional[str] to text: str = ''. # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return [] chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") - stream = self._make_lexer_thread(text) # type: ignore[arg-type] + stream = self._make_lexer_thread(text) return self.parser.parse_interactive(stream, chosen_start) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 6ae2a04fd..728753c44 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -55,7 +55,7 @@ def parse(self, lexer, start, on_error=None): if isinstance(e, UnexpectedCharacters): # If user didn't change the character position, then we should if p == s.line_ctr.char_pos: - s.line_ctr.feed(s.text[p:p+1]) + s.line_ctr.feed(s.text.text[p:p+1]) try: return e.interactive_parser.resume_parse() diff --git a/lark/utils.py b/lark/utils.py index 3767a66da..861d6b211 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -7,6 +7,8 @@ ###{standalone import sys, re import logging +from dataclasses import dataclass +from typing import Generic, AnyStr logger: logging.Logger = logging.getLogger("lark") logger.addHandler(logging.StreamHandler()) @@ -158,6 +160,49 @@ def get_regexp_width(expr: str) -> Union[Tuple[int, int], List[int]]: else: return 0, int(MAXWIDTH) + +@dataclass(frozen=True) +class TextSlice(Generic[AnyStr]): + text: AnyStr + start: int + end: int + + def __post_init__(self): + if not isinstance(self.text, (str, bytes)): + raise TypeError("text must be str or bytes") + + if self.start < 0: + object.__setattr__(self, 'start', self.start + len(self.text)) + assert self.start >=0 + + if self.end is None: + object.__setattr__(self, 'end', len(self.text)) 
+ elif self.end < 0: + object.__setattr__(self, 'end', self.end + len(self.text)) + assert self.end <= len(self.text) + + @classmethod + def cast_from(cls, text: 'TextOrSlice') -> 'TextSlice[AnyStr]': + if isinstance(text, TextSlice): + return text + + return cls(text, 0, len(text)) + + def is_complete_text(self): + return self.start == 0 and self.end == len(self.text) + + def __len__(self): + return self.end - self.start + + def count(self, substr: AnyStr): + return self.text.count(substr, self.start, self.end) + + def rindex(self, substr: AnyStr): + return self.text.rindex(substr, self.start, self.end) + + +TextOrSlice = Union[AnyStr, 'TextSlice[AnyStr]'] + ###} @@ -344,3 +389,4 @@ def __len__(self) -> int: def __repr__(self): return f"{type(self).__name__}({', '.join(map(repr,self))})" + diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 0996c8973..9234e69fd 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -1,6 +1,7 @@ from unittest import TestCase, main -from lark import Lark, Tree +from lark import Lark, Tree, TextSlice + class TestLexer(TestCase): def setUp(self): @@ -18,6 +19,18 @@ def test_basic(self): res = list(p.lex("abc cba dd", dont_ignore=True)) assert res == list('abc cba dd') + def test_subset_lex(self): + p = Lark(""" + start: "a" "b" "c" "d" + %ignore " " + """) + + res = list(p.lex(TextSlice("xxxabc cba ddxx", 3, -2))) + assert res == list('abccbadd') + + res = list(p.lex(TextSlice("aaaabc cba dddd", 3, -2))) + assert res == list('abccbadd') + if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py index 59e9a718a..f5e5d2c01 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -25,6 +25,7 @@ import lark from lark import logger from lark.lark import Lark +from lark.utils import TextSlice from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive @@ -1008,7 +1009,7 @@ def __init__(self, lexer_conf): def lex(self, lexer_state, parser_state): return self.lexer.lex(lexer_state, parser_state) - __future_interface__ = True + __future_interface__ = 2 class CustomLexerOld(Lexer): """ @@ -1021,7 +1022,7 @@ def lex(self, text): ls = self.lexer.make_lexer_state(text) return self.lexer.lex(ls, None) - __future_interface__ = False + __future_interface__ = 0 def _tree_structure_check(a, b): """ @@ -2665,6 +2666,57 @@ def test_strict(self): """ self.assertRaises(GrammarError, _Lark, grammar, strict=True) + @unittest.skipIf(LEXER in ('dynamic', 'dynamic_complete', 'custom_old'), + "start_pos and end_pos not compatible with old style custom/dynamic lexer ") + def test_parse_textslice(self): + grammar = r""" + start: (WORD|FRAG_END|FRAG_START)+ + WORD: /\b\w+\b/ # match full word + FRAG_END: /\B\w+/ # end of a word, i.e. start is not at a word boundary + FRAG_START: /\w+\B/ # start of a word, i.e. 
end is not at a word boundary + %ignore /\s+/ + """ + + parser = _Lark(grammar) + self.assertEqual(parser.parse(TextSlice(" abc def ", 1, -1)), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + self.assertEqual(parser.parse(TextSlice(" abc def ", 1-9, -1+9)), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + self.assertEqual(parser.parse(TextSlice("xabc def ", 1, -1)), + Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')])) + + # We match the behavior of python's re module here: It doesn't look ahead beyond `end_pos`, + # despite looking behind before `start_pos` + self.assertEqual(parser.parse(TextSlice(" abc defx", 1, -1)), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + + grammar = r""" + start: (_NL | ANY)+ + _NL: "\n" + ANY: /[^\n]/ + """ + parser = _Lark(grammar) + digits = "\n".join("123456789") + tree = parser.parse(TextSlice(digits, 2, 3)) + self.assertEqual(tree.children, ["2"]) + t:Token = tree.children[0] + assert t.start_pos == (2-1)*2 + assert t.line == 2 + + tree = parser.parse(TextSlice(digits, -1, None)) + self.assertEqual(tree.children, ["9"]) + t:Token = tree.children[0] + assert t.start_pos == (9-1)*2 + assert t.line == 9 + + + @unittest.skipIf(LEXER not in ('dynamic', 'dynamic_complete', 'custom_old'), + "start_pos and end_pos not compatible with old style custom/dynamic lexer ") + def test_parse_textslice_fails(self): + parser = _Lark("start: ") + s = TextSlice("hello", 2, 3) + self.assertRaises(TypeError, parser.parse, s) + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME
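
A minimal usage sketch of the TextSlice API introduced by this patch, mirroring the new tests in test_lexer.py and test_parser.py. The grammar and input strings are illustrative only, and `lexer="basic"` is an assumption made so that both `parse()` and `lex()` accept a slice (the dynamic Earley lexers reject partial slices, per the check added to `ParsingFrontend.parse`):

```python
from lark import Lark, TextSlice

# Illustrative grammar; basic lexer so both parse() and lex() work on a slice.
parser = Lark(r"""
    start: WORD+
    WORD: /\w+/
    %ignore /\s+/
""", lexer="basic")

text = "xxx abc def xxx"

# Parse only text[3:-3] without copying the substring.
# Negative start/end are normalized against len(text) in TextSlice.__post_init__.
tree = parser.parse(TextSlice(text, 3, -3))
print(tree.children)                # [Token('WORD', 'abc'), Token('WORD', 'def')]

# Positions stay absolute: LexerState feeds text[0:start] to the LineCounter
# before lexing, so start_pos/line refer to the full string.
print(tree.children[0].start_pos)   # 4

# Lexing a slice works the same way.
print([t.value for t in parser.lex(TextSlice(text, 3, -3))])   # ['abc', 'def']
```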
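
For the `__future_interface__` change in `_wrap_lexer`, a hypothetical interface-2 custom lexer, modeled on `CustomLexerNew` in test_parser.py: under the new contract, `lex()` receives the LexerState (whose `.text` is a TextSlice) together with the parser state, while the older interfaces 0 and 1 are wrapped and reject partial slices.

```python
from lark import Lark, TextSlice
from lark.lexer import Lexer, BasicLexer

class CustomLexer(Lexer):
    """Hypothetical interface-2 custom lexer that simply delegates to BasicLexer."""
    __future_interface__ = 2

    def __init__(self, lexer_conf):
        self.lexer = BasicLexer(lexer_conf)

    def lex(self, lexer_state, parser_state):
        # lexer_state.text is a TextSlice; a real implementation could inspect
        # lexer_state.text.start / .end to lex only the requested window.
        return self.lexer.lex(lexer_state, parser_state)

parser = Lark(r"""
    start: A+
    A: "a"
""", parser="lalr", lexer=CustomLexer)

tree = parser.parse(TextSlice("xxaaaxx", 2, -2))
print(tree.children)               # [Token('A', 'a'), Token('A', 'a'), Token('A', 'a')]
print(tree.children[0].start_pos)  # 2 -- absolute position within the full string
```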