From 18d0511bf7bddceb57481f5611b57701beaec66a Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 6 Nov 2024 16:30:46 +0800 Subject: [PATCH 1/2] [nl] extract "oudeschrijfwijze" and "*-dec*" form-of templates --- src/wiktextract/extractor/nl/pos.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index c16de747..eb50a25b 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -91,10 +91,12 @@ def extract_pos_section_nodes( "pronom-dem-form", "pronom-pos-form", "xh-pronom-pos-form", + "oudeschrijfwijze", ] or node.template_name.endswith( ("adjc-form", "adverb-form", "noun-form") ) + or re.search(r"-dec\d+", node.template_name) is not None ): extract_noun_form_of_template(wxr, page_data[-1], node) elif isinstance(node, TemplateNode) and ( @@ -199,12 +201,15 @@ def extract_l_template( # https://nl.wiktionary.org/wiki/Sjabloon:noun-pl # https://nl.wiktionary.org/wiki/Sjabloon:noun-form +# https://nl.wiktionary.org/wiki/Sjabloon:oudeschrijfwijze # "getal" and "gesl" args NOUN_FORM_OF_TEMPLATE_NUM_TAGS = { "s": "singular", "p": "plural", "d": "dual", "c": "collective", + "a": "animate", + "i": "inanimate", } NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = { "m": "masculine", @@ -237,6 +242,19 @@ def extract_noun_form_of_template( elif isinstance(gender_tag, list): sense.tags.extend(gender_tag) + # Sjabloon:oudeschrijfwijze + g_arg = t_node.template_parameters.get("g", "") + for tags_dict in [ + NOUN_FORM_OF_TEMPLATE_GENDER_TAGS, + NOUN_FORM_OF_TEMPLATE_NUM_TAGS, + ]: + if g_arg in tags_dict: + tag = tags_dict[g_arg] + if isinstance(tag, str): + sense.tags.append(tag) + elif isinstance(tag, list): + sense.tags.extend(tag) + form_of = clean_node(wxr, None, t_node.template_parameters.get(1, "")) if form_of != "": sense.form_of.append(AltForm(word=form_of)) From 0b5450c57a8091b8961620f7b090f725838efa59 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 6 Nov 2024 
16:49:11 +0800 Subject: [PATCH 2/2] [nl] try to extract POS section as linkage if no gloss list is found "Afkorting" could also be a linkage heading --- src/wiktextract/extractor/nl/models.py | 1 + src/wiktextract/extractor/nl/page.py | 8 ++++++++ src/wiktextract/extractor/nl/pos.py | 6 ++++-- src/wiktextract/extractor/nl/section_titles.py | 1 + tests/test_nl_inflection.py | 2 +- tests/test_nl_linkage.py | 13 +++++++++++++ 6 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/nl/models.py b/src/wiktextract/extractor/nl/models.py index 106629b0..73af427a 100644 --- a/src/wiktextract/extractor/nl/models.py +++ b/src/wiktextract/extractor/nl/models.py @@ -106,6 +106,7 @@ class WordEntry(DutchBaseModel): etymology_index: str = Field(default="", exclude=True) etymology_texts: list[str] = [] sounds: list[Sound] = [] + abbreviations: list[Linkage] = [] anagrams: list[Linkage] = [] antonyms: list[Linkage] = [] derived: list[Linkage] = [] diff --git a/src/wiktextract/extractor/nl/page.py b/src/wiktextract/extractor/nl/page.py index ec816d12..296d8de6 100644 --- a/src/wiktextract/extractor/nl/page.py +++ b/src/wiktextract/extractor/nl/page.py @@ -44,9 +44,17 @@ def parse_section( wxr.wtp.start_subsection(title_text) etymology_data = [] if title_text in POS_DATA: + last_data_len = len(page_data) extract_pos_section( wxr, page_data, base_data, forms_data, level_node, title_text ) + if len(page_data) == last_data_len and title_text in LINKAGE_SECTIONS: + extract_linkage_section( + wxr, + page_data[-1] if len(page_data) > 0 else base_data, + level_node, + LINKAGE_SECTIONS[title_text], + ) elif title_text == "Uitspraak": extract_sound_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index eb50a25b..4ce46523 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -10,7 +10,7 @@ extract_example_template,
) from .models import AltForm, Sense, WordEntry -from .section_titles import POS_DATA +from .section_titles import LINKAGE_SECTIONS, POS_DATA from .tags import ( GLOSS_TAG_TEMPLATES, LIST_ITEM_TAG_TEMPLATES, @@ -40,6 +40,8 @@ def extract_pos_section( forms_data.forms.clear() forms_data.categories.clear() extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node) + if len(page_data[-1].senses) == 0: + page_data.pop() def extract_pos_section_nodes( @@ -65,7 +67,7 @@ def extract_pos_section_nodes( extract_gloss_list_item(wxr, page_data[-1], list_item) elif isinstance(node, LevelNode): title_text = clean_node(wxr, None, node.largs) - if title_text in POS_DATA: + if title_text in POS_DATA and title_text not in LINKAGE_SECTIONS: # expanded from "eng-onv-d" form-of template from .page import parse_section diff --git a/src/wiktextract/extractor/nl/section_titles.py b/src/wiktextract/extractor/nl/section_titles.py index 7a4cdcc1..40d842f6 100644 --- a/src/wiktextract/extractor/nl/section_titles.py +++ b/src/wiktextract/extractor/nl/section_titles.py @@ -51,6 +51,7 @@ LINKAGE_SECTIONS = { + "Afkorting": "abbreviations", "Anagrammen": "anagrams", "Antoniemen": "antonyms", "Afgeleide begrippen": "derived", diff --git a/tests/test_nl_inflection.py b/tests/test_nl_inflection.py index 700fb5f6..e4ce1072 100644 --- a/tests/test_nl_inflection.py +++ b/tests/test_nl_inflection.py @@ -52,7 +52,7 @@ def test_nlnoun_different_pos(self): {{-l-|m}} #voorste deel van een [[wapen]] ====Werkwoord==== -{{1ps|lopen}}""", +# gloss""", ) self.assertEqual(len(data), 2) self.assertEqual( diff --git a/tests/test_nl_linkage.py b/tests/test_nl_linkage.py index c3389341..3ba4815d 100644 --- a/tests/test_nl_linkage.py +++ b/tests/test_nl_linkage.py @@ -168,3 +168,16 @@ def test_sense_text_after_link(self): } ], ) + + def test_abbr(self): + data = parse_page( + self.wxr, + "A grote terts", + """==Nederlands== +====Zelfstandig naamwoord==== +# het akkoord 
+=====''[[WikiWoordenboek:Afkorting|Afkorting]]''===== +*[[A]]""", + ) + self.assertEqual(len(data), 1) + self.assertEqual(data[0]["abbreviations"], [{"word": "A"}])