Skip to content

Commit

Permalink
Merge pull request #914 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] changes for extract verb forms table code
  • Loading branch information
xxyzz authored Nov 20, 2024
2 parents 8972197 + 98434a4 commit f92d800
Show file tree
Hide file tree
Showing 4 changed files with 287 additions and 53 deletions.
137 changes: 93 additions & 44 deletions src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,30 @@
import re
from dataclasses import dataclass

from wikitextprocessor import NodeKind, TemplateNode, WikiNode
from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags

# Names of the templates that are treated as "forms table" templates.
# The section parser tests each template node's name for membership in
# this set; on the first match within a section it discards any forms
# already gathered on the shared forms entry before extracting again.
FORMS_TABLE_TEMPLATES = frozenset(
    {
        "-nlnoun-",
        "adjcomp",
        "-nlname-",
        "-denoun-",
        "-denoun1-",
        "-nlstam-",
        "-csadjc-comp-",
    }
)


def extract_inflection_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
Expand Down Expand Up @@ -72,20 +89,25 @@ def extract_nlstam_template(
# verb table
# https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
for arg in [2, 3]:
form_str = clean_node(
form_texts = clean_node(
wxr, None, t_node.template_parameters.get(arg, "")
)
if form_str != "":
form = Form(
form=form_str,
ipa=clean_node(
wxr, None, t_node.template_parameters.get(arg + 3, "")
),
)
form.tags.extend(["past"] if arg == 2 else ["past", "participle"])
word_entry.forms.append(form)
ipa_texts = clean_node(
wxr, None, t_node.template_parameters.get(arg + 3, "")
).splitlines()
for index, form_str in enumerate(form_texts.splitlines()):
if form_str != "":
form = Form(form=form_str)
if index < len(ipa_texts):
form.ipa = ipa_texts[index]
form.tags.extend(
["past"] if arg == 2 else ["past", "participle"]
)
word_entry.forms.append(form)
clean_node(wxr, word_entry, t_node)
extract_vervoeging_page(wxr, word_entry)
if not word_entry.extracted_vervoeging_page:
extract_vervoeging_page(wxr, word_entry)
word_entry.extracted_vervoeging_page = True


def extract_vervoeging_page(
Expand All @@ -95,9 +117,16 @@ def extract_vervoeging_page(
if page is None:
return
root = wxr.wtp.parse(page.body)
table_templates = ["-nlverb-", "-nlverb-reflex-", "-nlverb-onp-"]
for t_node in root.find_child(NodeKind.TEMPLATE):
if t_node.template_name in ["-nlverb-", "-nlverb-reflex-"]:
extract_nlverb_template(wxr, word_entry, t_node)
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, "")
sense = ""
for level_node in root.find_child_recursively(LEVEL_KIND_FLAGS):
sense = clean_node(wxr, None, level_node.largs)
for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, sense)


@dataclass
Expand All @@ -113,12 +142,14 @@ class TableHeader:
"vervoeging van de bedrijvende vorm van": ["active"],
"onpersoonlijke lijdende vorm": ["impersonal", "passive"],
"lijdende vorm": ["passive"],
"vervoeging van het Nederlandse werkwoord": [],
}


def extract_nlverb_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
wxr: WiktextractContext,
word_entry: WordEntry,
t_node: TemplateNode,
sense: str,
) -> None:
# https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
# Sjabloon:-nlverb-reflex-
Expand Down Expand Up @@ -170,7 +201,7 @@ def extract_nlverb_template(
if re.fullmatch(r"\d+", cell_rowspan_str):
cell_rowspan = int(cell_rowspan_str)
cell_str = clean_node(wxr, None, cell_node).strip("| ")
if cell_str in ["", wxr.wtp.title]:
if cell_str in ["", "—", wxr.wtp.title]:
col_index += cell_colspan
is_row_first_node = False
continue
Expand All @@ -183,7 +214,9 @@ def extract_nlverb_template(
shared_tags.extend(prefix_tags)
break
else:
if current_row_all_header:
if cell_str.startswith("vervoeging van "):
pass
elif current_row_all_header:
if (
is_row_first_node
and t_node.template_name == "-nlverb-"
Expand Down Expand Up @@ -213,39 +246,55 @@ def extract_nlverb_template(
cell_rowspan,
)
)
else:
else: # data cell
has_small_tag = False
for small_node in cell_node.find_html("small"):
has_small_tag = True
if has_small_tag:
small_tag = cell_str
col_index += cell_colspan
continue
form = Form(
form=cell_str,
tags=shared_tags,
raw_tags=shared_raw_tags,
source=f"{wxr.wtp.title}/vervoeging",
)
if small_tag != "":
form.raw_tags.append(small_tag)
small_tag = ""
for row_header in row_headers:
if (
row_index >= row_header.row_index
and row_index
< row_header.row_index + row_header.rowspan
):
form.raw_tags.append(row_header.text)
for col_header in col_headers:
if (
col_index >= col_header.col_index
and col_index
< col_header.col_index + col_header.colspan
):
form.raw_tags.append(col_header.text)
translate_raw_tags(form)
word_entry.forms.append(form)
form_texts = [cell_str]
if "/ " in cell_str: # "zweerde/ zwoor"
form_texts = cell_str.split("/")
elif "/" in cell_str and " " in cell_str:
# "zult/zal zweren" -> ["zult zweren", "zal zweren"]
space_index = cell_str.index(" ")
second_part = cell_str[space_index:]
form_texts = [
f_str + second_part
for f_str in cell_str[:space_index].split("/")
]
for form_str in form_texts:
form_str = form_str.strip()
if len(form_str) == 0:
continue
form = Form(
form=form_str,
tags=shared_tags,
raw_tags=shared_raw_tags,
source=f"{wxr.wtp.title}/vervoeging",
sense=sense,
)
if small_tag != "":
form.raw_tags.append(small_tag)
small_tag = ""
for row_header in row_headers:
if (
row_index >= row_header.row_index
and row_index
< row_header.row_index + row_header.rowspan
):
form.raw_tags.append(row_header.text)
for col_header in col_headers:
if (
col_index >= col_header.col_index
and col_index
< col_header.col_index + col_header.colspan
):
form.raw_tags.append(col_header.text)
translate_raw_tags(form)
word_entry.forms.append(form)

col_index += cell_colspan
is_row_first_node = False
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/nl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class Form(DutchBaseModel):
raw_tags: list[str] = []
ipa: str = ""
source: str = ""
sense: str = ""


class Descendant(DutchBaseModel):
Expand Down Expand Up @@ -125,3 +126,4 @@ class WordEntry(DutchBaseModel):
forms: list[Form] = []
notes: list[str] = []
descendants: list[Descendant] = []
extracted_vervoeging_page: bool = Field(default=False, exclude=True)
27 changes: 18 additions & 9 deletions src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ...wxr_context import WiktextractContext
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section
from .inflection import extract_inflection_template
from .inflection import FORMS_TABLE_TEMPLATES, extract_inflection_template
from .linkage import extract_fixed_preposition_section, extract_linkage_section
from .models import Etymology, Sense, WordEntry
from .pos import extract_pos_section
Expand Down Expand Up @@ -107,15 +107,22 @@ def parse_section(
extract_section_categories(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
is_first_forms_template = True
for t_node in level_node.find_child(NodeKind.TEMPLATE):
extract_inflection_template(
wxr,
page_data[-1]
if title_text.startswith(("Vervoeging", "Verbuiging"))
and len(page_data) > 0
else forms_data,
t_node,
)
if t_node.template_name in FORMS_TABLE_TEMPLATES:
if is_first_forms_template:
is_first_forms_template = False
if len(forms_data.forms) > 0:
forms_data.forms.clear()
forms_data.extracted_vervoeging_page = False
extract_inflection_template(
wxr,
page_data[-1]
if title_text.startswith(("Vervoeging", "Verbuiging"))
and len(page_data) > 0
else forms_data,
t_node,
)
return etymology_data


Expand Down Expand Up @@ -151,6 +158,8 @@ def parse_page(
forms_data = base_data.model_copy(deep=True)
extract_section_categories(wxr, base_data, level2_node)
etymology_data = []
for t_node in level2_node.find_child(NodeKind.TEMPLATE):
extract_inflection_template(wxr, forms_data, t_node)
for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
new_e_data = parse_section(
wxr, page_data, base_data, forms_data, next_level_node
Expand Down
Loading

0 comments on commit f92d800

Please sign in to comment.