Skip to content

Commit

Permalink
[nl] extract "-csadjc-comp-*" forms table templates
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Nov 19, 2024
1 parent 70c2e05 commit d6317fe
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 6 deletions.
27 changes: 27 additions & 0 deletions src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def extract_inflection_template(
extract_noun_adj_table(wxr, word_entry, t_node)
elif t_node.template_name == "-nlstam-":
extract_nlstam_template(wxr, word_entry, t_node)
elif t_node.template_name.startswith("-csadjc-comp-"):
extract_csadjc_comp_template(wxr, word_entry, t_node)


def extract_noun_adj_table(
Expand Down Expand Up @@ -241,3 +243,28 @@ def nlverb_table_cell_is_header(node: WikiNode) -> bool:
node.kind == NodeKind.TABLE_HEADER_CELL
or node.attrs.get("class", "") == "infoboxrijhoofding"
)


def extract_csadjc_comp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from an expanded "-csadjc-comp-*" table template.

    The template node is converted back to wikitext and re-parsed with
    full expansion. Every table row in the result is then scanned: the
    text of a header cell is remembered as the row label, and each data
    cell after it yields a ``Form`` that is appended to
    ``word_entry.forms``, with the row label (when not empty) added to
    its ``raw_tags``.

    Data cells whose cleaned text is empty or equal to ``wxr.wtp.title``
    are ignored.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-csadjc-comp-ý3-
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    for table_node in root.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # The label is reset per row, so it never leaks into the next.
            header_text = ""
            for cell in row_node.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    continue
                if cell.kind != NodeKind.TABLE_CELL:
                    continue

                cell_text = clean_node(wxr, None, cell)
                if cell_text == "" or cell_text == wxr.wtp.title:
                    continue

                new_form = Form(form=cell_text)
                if header_text != "":
                    new_form.raw_tags.append(header_text)
                translate_raw_tags(new_form)
                word_entry.forms.append(new_form)
15 changes: 9 additions & 6 deletions src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,14 @@ def parse_section(
extract_fixed_preposition_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "Vervoeging":
pass # conjugation
elif title_text == "Verbuiging":
pass # inflection
elif title_text in [
"Gangbaarheid",
"Meer informatie",
"Verwijzingen",
"Citaten",
]:
pass # ignore
else:
elif not title_text.startswith(("Vervoeging", "Verbuiging")):
wxr.wtp.debug(f"unknown title: {title_text}", sortid="nl/page/60")

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
Expand All @@ -112,7 +108,14 @@ def parse_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
for t_node in level_node.find_child(NodeKind.TEMPLATE):
extract_inflection_template(wxr, forms_data, t_node)
extract_inflection_template(
wxr,
page_data[-1]
if title_text.startswith(("Vervoeging", "Verbuiging"))
and len(page_data) > 0
else forms_data,
t_node,
)
return etymology_data


Expand Down

0 comments on commit d6317fe

Please sign in to comment.