Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nl] reduce data with "no-gloss" tag #900

Merged
merged 2 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/wiktextract/extractor/nl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class WordEntry(DutchBaseModel):
etymology_index: str = Field(default="", exclude=True)
etymology_texts: list[str] = []
sounds: list[Sound] = []
abbreviations: list[Linkage] = []
anagrams: list[Linkage] = []
antonyms: list[Linkage] = []
derived: list[Linkage] = []
Expand Down
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,17 @@ def parse_section(
wxr.wtp.start_subsection(title_text)
etymology_data = []
if title_text in POS_DATA:
last_data_len = len(page_data)
extract_pos_section(
wxr, page_data, base_data, forms_data, level_node, title_text
)
if len(page_data) == last_data_len and title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
)
elif title_text == "Uitspraak":
extract_sound_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
Expand Down
24 changes: 22 additions & 2 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
extract_example_template,
)
from .models import AltForm, Sense, WordEntry
from .section_titles import POS_DATA
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .tags import (
GLOSS_TAG_TEMPLATES,
LIST_ITEM_TAG_TEMPLATES,
Expand Down Expand Up @@ -40,6 +40,8 @@ def extract_pos_section(
forms_data.forms.clear()
forms_data.categories.clear()
extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)
if len(page_data[-1].senses) == 0:
page_data.pop()


def extract_pos_section_nodes(
Expand All @@ -65,7 +67,7 @@ def extract_pos_section_nodes(
extract_gloss_list_item(wxr, page_data[-1], list_item)
elif isinstance(node, LevelNode):
title_text = clean_node(wxr, None, node.largs)
if title_text in POS_DATA:
if title_text in POS_DATA and title_text not in LINKAGE_SECTIONS:
# expanded from "eng-onv-d" form-of template
from .page import parse_section

Expand All @@ -91,10 +93,12 @@ def extract_pos_section_nodes(
"pronom-dem-form",
"pronom-pos-form",
"xh-pronom-pos-form",
"oudeschrijfwijze",
]
or node.template_name.endswith(
("adjc-form", "adverb-form", "noun-form")
)
or re.search(r"-dec\d+", node.template_name) is not None
):
extract_noun_form_of_template(wxr, page_data[-1], node)
elif isinstance(node, TemplateNode) and (
Expand Down Expand Up @@ -199,12 +203,15 @@ def extract_l_template(

# https://nl.wiktionary.org/wiki/Sjabloon:noun-pl
# https://nl.wiktionary.org/wiki/Sjabloon:noun-form
# https://nl.wiktionary.org/wiki/Sjabloon:oudeschrijfwijze
# "getal" and "gesl" args
# Single-letter template argument code -> tag appended to the sense.
# extract_noun_form_of_template() also looks up the "g" argument of
# Sjabloon:oudeschrijfwijze in this table.
NOUN_FORM_OF_TEMPLATE_NUM_TAGS: dict[str, str] = {
    "s": "singular",
    "p": "plural",
    "d": "dual",
    "c": "collective",
    "a": "animate",
    "i": "inanimate",
}
NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = {
"m": "masculine",
Expand Down Expand Up @@ -237,6 +244,19 @@ def extract_noun_form_of_template(
elif isinstance(gender_tag, list):
sense.tags.extend(gender_tag)

# Sjabloon:oudeschrijfwijze
g_arg = t_node.template_parameters.get("g", "")
for tags_dict in [
NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
]:
if g_arg in tags_dict:
tag = tags_dict[g_arg]
if isinstance(tag, str):
sense.tags.append(tag)
elif isinstance(tag, list):
sense.tags.extend(tag)

form_of = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
if form_of != "":
sense.form_of.append(AltForm(word=form_of))
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/nl/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@


LINKAGE_SECTIONS = {
"Afkorting": "abbreviations",
"Anagrammen": "anagrams",
"Antoniemen": "antonyms",
"Afgeleide begrippen": "derived",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_nl_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_nlnoun_different_pos(self):
{{-l-|m}}
#voorste deel van een [[wapen]]
====Werkwoord====
{{1ps|lopen}}""",
# gloss""",
)
self.assertEqual(len(data), 2)
self.assertEqual(
Expand Down
13 changes: 13 additions & 0 deletions tests/test_nl_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,16 @@ def test_sense_text_after_link(self):
}
],
)

def test_abbr(self):
    """Check that an "Afkorting" subsection fills ``abbreviations``.

    The page has one POS section ("Zelfstandig naamwoord") with a single
    gloss, followed by a nested "Afkorting" heading containing one link.
    """
    wikitext = """==Nederlands==
====Zelfstandig naamwoord====
# het akkoord
=====''[[WikiWoordenboek:Afkorting|Afkorting]]''=====
*[[A]]"""
    entries = parse_page(self.wxr, "A grote terts", wikitext)
    # Exactly one word entry is expected for this page.
    self.assertEqual(len(entries), 1)
    self.assertEqual(entries[0]["abbreviations"], [{"word": "A"}])