Skip to content

Commit

Permalink
Merge pull request #890 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] improve gloss, linkage, translation section code
  • Loading branch information
xxyzz authored Oct 28, 2024
2 parents 7675eda + 6ad8ca9 commit 658a856
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 53 deletions.
46 changes: 38 additions & 8 deletions src/wiktextract/extractor/nl/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry
from .tags import LIST_ITEM_TAG_TEMPLATES


def extract_linkage_section(
Expand Down Expand Up @@ -73,24 +74,53 @@ def extract_linkage_list_item(
sense: str,
sense_index: str,
) -> None:
for node in list_item.children:
linkage_list = getattr(word_entry, linkage_type)
orig_len = len(linkage_list)
tags = []
for index, node in enumerate(list_item.children):
if isinstance(node, str):
m = re.search(r"\[(\d+)\]", node)
if m is not None:
sense_index = int(m.group(1))
elif node.strip().startswith("="):
sense = node.strip().removeprefix("=").strip()
linkage_list = getattr(word_entry, linkage_type)
if len(linkage_list) > 0:
elif node.strip().startswith(("=", "–")):
sense = clean_node(wxr, None, list_item.children[index:]).strip(
"=– "
)
if len(linkage_list) > orig_len:
linkage_list[-1].sense = sense
else:
word_nodes = [
n
for n in list_item.children[:index]
if not isinstance(n, TemplateNode)
]
word = clean_node(wxr, None, word_nodes)
if word != "":
linkage_list.append(
Linkage(
word=word,
sense=sense,
sense_index=sense_index,
tags=tags,
)
)
return
elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "":
getattr(word_entry, linkage_type).append(
linkage_list.append(
Linkage(word=word, sense=sense, sense_index=sense_index)
)
elif isinstance(node, TemplateNode) and node.template_name == "expr":
extract_expr_template(wxr, word_entry, node, linkage_type)
elif isinstance(node, TemplateNode):
if node.template_name == "expr":
extract_expr_template(wxr, word_entry, node, linkage_type)
elif node.template_name in LIST_ITEM_TAG_TEMPLATES:
if len(linkage_list) > orig_len:
linkage_list[-1].tags.append(
LIST_ITEM_TAG_TEMPLATES[node.template_name]
)
else:
tags.append(LIST_ITEM_TAG_TEMPLATES[node.template_name])


def extract_nld_template(
Expand Down
74 changes: 49 additions & 25 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
)
from .models import AltForm, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags
from .tags import (
GLOSS_TAG_TEMPLATES,
LIST_ITEM_TAG_TEMPLATES,
translate_raw_tags,
)


def extract_pos_section(
Expand Down Expand Up @@ -74,34 +78,46 @@ def extract_pos_section_nodes(
and len(page_data[-1].senses) > 0
):
extract_example_template(wxr, page_data[-1].senses[-1], node)
elif isinstance(node, TemplateNode) and node.template_name in [
"noun-pl",
"noun-form",
]:
elif isinstance(node, TemplateNode) and (
node.template_name
in [
"noun-pl",
"nl-advb-form",
"noun-dim",
"noun-dim-pl",
"num-form",
"ordn-form",
"prep-form",
"pronom-dem-form",
"pronom-pos-form",
"xh-pronom-pos-form",
]
or node.template_name.endswith(
("adjc-form", "adverb-form", "noun-form")
)
):
extract_noun_form_of_template(wxr, page_data[-1], node)
elif isinstance(node, TemplateNode) and node.template_name.startswith(
(
"1ps",
"2ps",
"aanv-w",
"onv-d",
"ott-",
"ovt-",
"tps",
"volt-d",
"eng-onv-d",
elif isinstance(node, TemplateNode) and (
node.template_name.startswith(
(
"1ps",
"2ps",
"aanv-w",
"onv-d",
"ott-",
"ovt-",
"tps",
"volt-d",
"eng-onv-d",
)
)
or node.template_name.endswith("verb-form")
):
extract_verb_form_of_template(
wxr, page_data, base_data, forms_data, node
)


# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
GLOSS_TAG_TEMPLATES = frozenset(["auxl", "erga", "inerg"])


def extract_gloss_list_item(
wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
Expand All @@ -111,6 +127,8 @@ def extract_gloss_list_item(
if isinstance(child, TemplateNode):
if child.template_name in GLOSS_TAG_TEMPLATES:
sense.raw_tags.append(clean_node(wxr, sense, child))
elif child.template_name in LIST_ITEM_TAG_TEMPLATES:
sense.tags.append(LIST_ITEM_TAG_TEMPLATES[child.template_name])
else:
expanded_text = clean_node(wxr, sense, child)
if expanded_text.startswith("(") and expanded_text.endswith(
Expand All @@ -133,7 +151,7 @@ def extract_gloss_list_item(
gloss_nodes.append(child)

gloss_text = clean_node(wxr, sense, gloss_nodes)
if gloss_text.startswith(","): # between qualifier templates
while gloss_text.startswith(","): # between qualifier templates
gloss_text = gloss_text.removeprefix(",").strip()
m = re.match(r"\(([^()]+)\)", gloss_text)
if m is not None: # expanded "verouderd" template in "2ps" template
Expand All @@ -153,8 +171,12 @@ def extract_pos_header_line_nodes(
m = re.search(r"\[(.+)\]", node.strip())
if m is not None:
word_entry.etymology_index = m.group(1).strip()
elif isinstance(node, TemplateNode) and node.template_name == "-l-":
extract_l_template(wxr, word_entry, node)
elif isinstance(node, TemplateNode):
if node.template_name == "-l-":
extract_l_template(wxr, word_entry, node)
elif node.template_name == "dimt":
word_entry.raw_tags.append(clean_node(wxr, word_entry, node))
translate_raw_tags(word_entry)


def extract_l_template(
Expand Down Expand Up @@ -198,8 +220,9 @@ def extract_l_template(
def extract_noun_form_of_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
# https://nl.wiktionary.org/wiki/Categorie:Vormsjablonen
sense = Sense(tags=["form-of"])
if t_node.template_name == "noun-pl":
if t_node.template_name.endswith("-pl"):
sense.tags.append("plural")
else:
num_arg = t_node.template_parameters.get("getal", "")
Expand Down Expand Up @@ -236,6 +259,7 @@ def extract_verb_form_of_template(
t_node: TemplateNode,
) -> None:
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen
from .page import extract_section_categories

orig_data_len = len(page_data)
Expand Down
69 changes: 63 additions & 6 deletions src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
from .models import WordEntry

# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
VERB_TAGS = {
"ergatief": "ergative", # Sjabloon:erga
"inergatief": "unergative", # Sjabloon:inerg
"hulpwerkwoord": "auxiliary", # Sjabloon:auxl
}
# Names of templates that extract_gloss_list_item() in pos.py treats as tag
# labels instead of gloss text: when a gloss list item contains one of these
# templates, its expanded text is appended to the sense's raw_tags.
# GLOSS_TAGS holds entries annotated with the matching "Sjabloon:" name, which
# presumably are the Dutch labels these templates expand to (verify against
# the wiki templates).
GLOSS_TAG_TEMPLATES = frozenset(
[
"absol",
"accus",
"auxl",
"copl",
"deponens",
"ditr",
"erga",
"inerg",
"intr",
"modl",
"onpr",
"ov",
"rcpq",
"refl",
"s-verb",
"plurt",
"singt",
"versterkend voorvoegsel",
]
)


# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
GLOSS_TAGS = {
"figuurlijk": "figuratively",
Expand Down Expand Up @@ -55,6 +75,23 @@
"zegswijze": "idiomatic",
"zeldzaam": "rare",
"Latijns-Amerika": "Latin-America",
"absoluut": "absolute", # Sjabloon:absol
"accusatief": "accusative", # Sjabloon:accus
"hulpwerkwoord": "auxiliary", # Sjabloon:auxl
"koppelwerkwoord": "copulative", # Sjabloon:copl
"deponens": "deponent",
"ditransitief": "ditransitive", # Sjabloon:ditr
"ergatief": "ergative", # Sjabloon:erga
"inergatief": "unergative", # Sjabloon:inerg
"onovergankelijk": "intransitive", # Sjabloon:intr
"modaal werkwoord": ["modal", "verb"], # Sjabloon:modl
"onpersoonlijk": "impersonal", # Sjabloon:onpr
"overgankelijk": "transitive", # Sjabloon:ov
"wederkerig": "reciprocal", # Sjabloon:rcpq
"wederkerend": "reflexive", # Sjabloon:refl
"alleen meervoud": "plural-only", # Sjabloon:plurt
"geen meervoud": "no-plural", # Sjabloon:singt
"versterkend voorvoegsel": ["intensifier", "prefix"],
}

TABLE_TAGS = {
Expand Down Expand Up @@ -89,7 +126,12 @@
}


TAGS = {**VERB_TAGS, **GLOSS_TAGS, **TABLE_TAGS}
# Raw labels found on the POS header line, mapped to their tags.
# extract_pos_header_line_nodes() in pos.py stores the expanded text of the
# "dimt" template in word_entry.raw_tags and then calls translate_raw_tags();
# "dim. tant." is presumably that expanded text (per the Sjabloon note below).
HEADER_LINE_TAGS = {
"dim. tant.": ["diminutive", "noun"], # Sjabloon:dimt
}


# Single lookup table built by merging the per-context tag dicts.  With dict
# unpacking, a key present in more than one dict takes the value from the
# dict listed last.
TAGS = {**GLOSS_TAGS, **TABLE_TAGS, **HEADER_LINE_TAGS}

# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
TOPICS = {
Expand Down Expand Up @@ -342,3 +384,18 @@ def translate_raw_tags(data: WordEntry) -> None:
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags


# Maps short template names to the tag each one stands for (gender, number,
# animacy and aspect values).
# Used in translation, linkage and gloss lists: the list-item extractors look
# up node.template_name in this dict and append the mapped tag.
LIST_ITEM_TAG_TEMPLATES = {
"m": "masculine",
"f": "feminine",
"n": "neuter",
"c": "common",
"s": "singular",
"p": "plural",
"a": "animate",
"i": "inanimate",
"impf": "imperfective",
"pf": "perfective",
}
22 changes: 8 additions & 14 deletions src/wiktextract/extractor/nl/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Translation, WordEntry
from .tags import LIST_ITEM_TAG_TEMPLATES


def extract_translation_section(
Expand All @@ -30,18 +31,6 @@ def extract_translation_section(
)


TR_TEMPLATES = {
"m": "masculine",
"f": "feminine",
"n": "neuter",
"c": "common",
"s": "singular",
"p": "plural",
"a": "animate",
"i": "inanimate",
}


def extract_translation_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
Expand Down Expand Up @@ -75,11 +64,11 @@ def extract_translation_list_item(
)
)
elif (
node.template_name in TR_TEMPLATES
node.template_name in LIST_ITEM_TAG_TEMPLATES
and len(word_entry.translations) > 0
):
word_entry.translations[-1].tags.append(
TR_TEMPLATES[node.template_name]
LIST_ITEM_TAG_TEMPLATES[node.template_name]
)
elif isinstance(node, str):
for c in node:
Expand All @@ -93,5 +82,10 @@ def extract_translation_list_item(
roman_str = ""
elif brackets > 0:
roman_str += c
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_translation_list_item(
wxr, word_entry, next_list_item, sense, sense_index
)
elif brackets > 0:
roman_str += clean_node(wxr, None, node)
20 changes: 20 additions & 0 deletions tests/test_nl_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,23 @@ def test_sense_text_after_link(self):
data[0]["related"],
[{"sense": "met grote passen lopen", "word": "benen"}],
)

data = parse_page(
self.wxr,
"omyl",
"""==Tsjechisch==
====Zelfstandig naamwoord====
# fout
=====Typische woordcombinaties=====
* justiční ''omyl'' {{m}}{{i}} – justitiële ''dwaling''""",
)
self.assertEqual(
data[0]["derived"],
[
{
"sense": "justitiële dwaling",
"word": "justiční omyl",
"tags": ["masculine", "inanimate"],
}
],
)
19 changes: 19 additions & 0 deletions tests/test_nl_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,22 @@ def test_plain_text_lang_name(self):
},
],
)

def test_nested_list(self):
# Translation entries nested one list level deeper ("** ..." under
# "* Chinees:") must still be extracted.
# Register the "cmn" language-name template so {{cmn}} can be expanded;
# the expected "lang" value below matches its body "Mandarijn".
self.wxr.wtp.add_page("Sjabloon:cmn", 10, "Mandarijn")
data = parse_page(
self.wxr,
"kijken",
"""==Nederlands==
====Werkwoord====
# met de ogen waarnemen
=====Vertalingen=====
* Chinees:
** {{cmn}}: {{trad|cmn|看}}""",
)
# Only the nested item yields a translation; the outer "Chinees:" item
# contributes no entry of its own.
self.assertEqual(
data[0]["translations"],
[
{"word": "看", "lang": "Mandarijn", "lang_code": "cmn"},
],
)

0 comments on commit 658a856

Please sign in to comment.