From 11ede27f7f392ac42dadf9e8e856a57e13f97d27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Fri, 15 Nov 2024 08:06:34 +0200 Subject: [PATCH] [en] Filter out LINK nodes that are files from headers Fixes #910 Files are links, so if a file link has alt-text, that alt-text would show up in heads, because we handle En heads by looking at the nodes directly and don't use clean_value. We might want to consider creating LinkNode and FileLinkNode classes, like we have for TemplateNode, so that we can more easily filter out file links. --- src/wiktextract/extractor/en/page.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index a3c47d70..4d856ccd 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -1165,8 +1165,16 @@ def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: posnode.children, lambda x: ( isinstance(x, WikiNode) - and x.kind == NodeKind.TEMPLATE - and x.largs[0][0] in FLOATING_TABLE_TEMPLATES + and ( + ( + x.kind == NodeKind.TEMPLATE + and x.largs[0][0] in FLOATING_TABLE_TEMPLATES + ) + or ( + x.kind == NodeKind.LINK + and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr] + ) + ) ), ) tempnode = WikiNode(NodeKind.LEVEL6, 0) @@ -1445,6 +1453,7 @@ def process_gloss_header( new_nodes = [] info_template_data = [] for node in header_nodes: + # print(f"{node=}") info_data, info_out = parse_info_template_node(wxr, node, "head") if info_data or info_out: if info_data: @@ -1913,7 +1922,7 @@ def extract_link_texts(item: GeneralNode) -> None: elif rawgloss == "Technical or specialized senses.": rawgloss = "" elif rawgloss.startswith("inflection of "): - parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) + parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) if parsed is not None: tags, origins = parsed if origins is not None: