From 84283060144144832f847d5b258c929696d2300b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 26 Sep 2024 15:36:01 +0800 Subject: [PATCH] Don't let section start token break unclosed parent node it shouldn't pop nodes in parser stack if it's inside an unclosed node like a template --- src/wikitextprocessor/parser.py | 59 ++++++++++++++++++--------------- tests/test_parser.py | 20 +++++++++++ 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py index b19a2e99..50562602 100644 --- a/src/wikitextprocessor/parser.py +++ b/src/wikitextprocessor/parser.py @@ -259,27 +259,27 @@ class NodeKind(enum.Flag): ) # Node types that have arguments separated by the vertical bar (|) -HAVE_ARGS_KINDS: tuple[NodeKind, ...] = ( - NodeKind.LINK, - NodeKind.TEMPLATE, - NodeKind.TEMPLATE_ARG, - NodeKind.PARSER_FN, - NodeKind.URL, +HAVE_ARGS_KIND_FLAGS = ( + NodeKind.LINK + | NodeKind.TEMPLATE + | NodeKind.TEMPLATE_ARG + | NodeKind.PARSER_FN + | NodeKind.URL ) # Node kinds that generate an error if they have not been properly closed. -MUST_CLOSE_KINDS: tuple[NodeKind, ...] = ( - NodeKind.ITALIC, - NodeKind.BOLD, - NodeKind.PRE, - NodeKind.HTML, - NodeKind.LINK, - NodeKind.TEMPLATE, - NodeKind.TEMPLATE_ARG, - NodeKind.PARSER_FN, - NodeKind.URL, - NodeKind.TABLE, +MUST_CLOSE_KIND_FLAGS = ( + NodeKind.ITALIC + | NodeKind.BOLD + | NodeKind.PRE + | NodeKind.HTML + | NodeKind.LINK + | NodeKind.TEMPLATE + | NodeKind.TEMPLATE_ARG + | NodeKind.PARSER_FN + | NodeKind.URL + | NodeKind.TABLE ) # regex for finding html-tags so that we can replace single-quotes @@ -714,7 +714,7 @@ def _parser_pop(ctx: "Wtp", warn_unclosed: bool) -> None: node = ctx.parser_stack[-1] # Warn about unclosed syntaxes. - if warn_unclosed and node.kind in MUST_CLOSE_KINDS: + if warn_unclosed and node.kind in MUST_CLOSE_KIND_FLAGS: if node.kind == NodeKind.HTML: ctx.debug( "HTML tag <{}> not properly closed".format(node.sarg), @@ -769,7 +769,7 @@ def _parser_pop(ctx: "Wtp", warn_unclosed: bool) -> None: # If the node has arguments, move remaining children to be the last # argument - if node.kind in HAVE_ARGS_KINDS: + if node.kind in HAVE_ARGS_KIND_FLAGS: node.largs.append(node.children) node.children = [] @@ -805,11 +805,11 @@ def _parser_pop(ctx: "Wtp", warn_unclosed: bool) -> None: ctx.parser_stack.pop() -def _parser_have(ctx: "Wtp", kind: NodeKind) -> bool: +def _parser_have(ctx: "Wtp", kind_flags: NodeKind) -> bool: """Returns True if any node on the stack is of the given kind.""" - assert isinstance(kind, NodeKind) + assert isinstance(kind_flags, NodeKind) for node in ctx.parser_stack: - if node.kind == kind: + if node.kind in kind_flags: return True return False @@ -1000,6 +1000,8 @@ def subtitle_start_fn(ctx, token) -> None: break if node.kind == NodeKind.HTML and node.sarg not in ("span",): break + if node.kind in MUST_CLOSE_KIND_FLAGS & ~NodeKind.HTML: + break _parser_pop(ctx, True) # Push the subtitle node. Subtitle start nodes are guaranteed to have @@ -1552,14 +1554,19 @@ def vbar_fn(ctx: "Wtp", token: str) -> None: templates, template argument references, links, etc, and it can also separate table row cells.""" node = ctx.parser_stack[-1] - if node.kind in HAVE_ARGS_KINDS and node.kind is not NodeKind.URL: + if node.kind in HAVE_ARGS_KIND_FLAGS and node.kind is not NodeKind.URL: # [http://url.com these do not use vbars, only one initial space] _parser_merge_str_children(ctx) node.largs.append(node.children) node.children = [] return - - table_cell_fn(ctx, token) + elif _parser_have(ctx, NodeKind.TABLE): + table_cell_fn(ctx, token) + elif _parser_have(ctx, HAVE_ARGS_KIND_FLAGS): + _parser_pop(ctx, True) + vbar_fn(ctx, token) + else: + text_fn(ctx, token) def double_vbar_fn(ctx: "Wtp", token: str) -> None: @@ -1570,7 +1577,7 @@ def double_vbar_fn(ctx: "Wtp", token: str) -> None: contain header cells this actually generates a new header cell in MediaWiki, so we'll do the same.""" node = ctx.parser_stack[-1] - if node.kind in HAVE_ARGS_KINDS: + if node.kind in HAVE_ARGS_KIND_FLAGS: vbar_fn(ctx, "|") vbar_fn(ctx, "|") return diff --git a/tests/test_parser.py b/tests/test_parser.py index 802f4e63..091316b7 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2977,6 +2977,26 @@ def test_clean_node_lists(self): print(cleaned) self.assertEqual(cleaned, wikitext) + def test_section_in_template(self): + # https://fr.wiktionary.org/wiki/Conjugaison:français/bayer + # GH issue #310 + self.ctx.start_page("") + root = self.ctx.parse("""{{Onglets conjugaison +| contenu1 = 1 +| contenu2 = 2 +=== section === +text +| contenu3 = 3 +}}""") + self.assertEqual(len(root.children), 1) + template = root.children[0] + second_arg_list = template.template_parameters.get("contenu2") + self.assertEqual(len(second_arg_list), 2) + heading_node = second_arg_list[1] + self.assertIsInstance(heading_node, WikiNode) + self.assertEqual(heading_node.kind, NodeKind.LEVEL3) + self.assertEqual(template.template_parameters.get("contenu3"), "3") + # XXX implement marking for links, templates # - https://en.wikipedia.org/wiki/Help:Wikitext#Nowiki