From d6317feb8c7278e39e95383921a9ab1a352ebb18 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Tue, 19 Nov 2024 11:52:31 +0800
Subject: [PATCH] [nl] extract "-csadjc-comp-*" forms table templates

inflection.py: extract_inflection_template() gains a branch that sends
any template whose name starts with "-csadjc-comp-" to the new
extract_csadjc_comp_template() helper. The helper expands the template,
walks every row of every table in the expansion, remembers the text of
the row's header cell, and appends one Form to word_entry.forms per data
cell. Cells that are empty or equal to the page title are skipped. A
non-empty row header is stored as the form's raw tag before
translate_raw_tags() is called.

page.py: parse_section() drops the two no-op branches for the exact
titles "Vervoeging" and "Verbuiging". Titles that start with either
word no longer reach the "unknown title" debug message. For such
sections the inflection templates are extracted into page_data[-1] when
page_data is non-empty; in every other case forms_data is used, as
before.
---
 src/wiktextract/extractor/nl/inflection.py | 27 ++++++++++++++++++++++
 src/wiktextract/extractor/nl/page.py       | 15 +++++++-----
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/wiktextract/extractor/nl/inflection.py b/src/wiktextract/extractor/nl/inflection.py
index 9c8804db..c8ad4e08 100644
--- a/src/wiktextract/extractor/nl/inflection.py
+++ b/src/wiktextract/extractor/nl/inflection.py
@@ -22,6 +22,8 @@ def extract_inflection_template(
         extract_noun_adj_table(wxr, word_entry, t_node)
     elif t_node.template_name == "-nlstam-":
         extract_nlstam_template(wxr, word_entry, t_node)
+    elif t_node.template_name.startswith("-csadjc-comp-"):
+        extract_csadjc_comp_template(wxr, word_entry, t_node)
 
 
 def extract_noun_adj_table(
@@ -241,3 +243,28 @@ def nlverb_table_cell_is_header(node: WikiNode) -> bool:
         node.kind == NodeKind.TABLE_HEADER_CELL
         or node.attrs.get("class", "") == "infoboxrijhoofding"
     )
+
+
+def extract_csadjc_comp_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    # https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for table in expanded_node.find_child(NodeKind.TABLE):
+        for row in table.find_child(NodeKind.TABLE_ROW):
+            row_header = ""
+            for cell_node in row.find_child(
+                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
+            ):
+                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
+                    row_header = clean_node(wxr, None, cell_node)
+                elif cell_node.kind == NodeKind.TABLE_CELL:
+                    form_text = clean_node(wxr, None, cell_node)
+                    if form_text not in ["", wxr.wtp.title]:
+                        form = Form(form=form_text)
+                        if row_header != "":
+                            form.raw_tags.append(row_header)
+                        translate_raw_tags(form)
+                        word_entry.forms.append(form)
diff --git a/src/wiktextract/extractor/nl/page.py b/src/wiktextract/extractor/nl/page.py
index 296d8de6..24db9f64 100644
--- a/src/wiktextract/extractor/nl/page.py
+++ b/src/wiktextract/extractor/nl/page.py
@@ -92,10 +92,6 @@ def parse_section(
         extract_fixed_preposition_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
-    elif title_text == "Vervoeging":
-        pass  # conjugation
-    elif title_text == "Verbuiging":
-        pass  # inflection
     elif title_text in [
         "Gangbaarheid",
         "Meer informatie",
@@ -103,7 +99,7 @@ def parse_section(
         "Citaten",
     ]:
         pass  # ignore
-    else:
+    elif not title_text.startswith(("Vervoeging", "Verbuiging")):
         wxr.wtp.debug(f"unknown title: {title_text}", sortid="nl/page/60")
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
@@ -112,7 +108,14 @@ def parse_section(
         wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
     )
     for t_node in level_node.find_child(NodeKind.TEMPLATE):
-        extract_inflection_template(wxr, forms_data, t_node)
+        extract_inflection_template(
+            wxr,
+            page_data[-1]
+            if title_text.startswith(("Vervoeging", "Verbuiging"))
+            and len(page_data) > 0
+            else forms_data,
+            t_node,
+        )
     return etymology_data
 
 