tatuylonen · xxyzz · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024
diff --git a/src/wiktextract/extractor/nl/models.py b/src/wiktextract/extractor/nl/models.py
@@ -106,6 +106,7 @@ class WordEntry(DutchBaseModel):
     etymology_index: str = Field(default="", exclude=True)
     etymology_texts: list[str] = []
     sounds: list[Sound] = []
+    abbreviations: list[Linkage] = []
     anagrams: list[Linkage] = []
     antonyms: list[Linkage] = []
     derived: list[Linkage] = []

diff --git a/src/wiktextract/extractor/nl/page.py b/src/wiktextract/extractor/nl/page.py
@@ -44,9 +44,17 @@ def parse_section(
     wxr.wtp.start_subsection(title_text)
     etymology_data = []
     if title_text in POS_DATA:
+        last_data_len = len(page_data)
         extract_pos_section(
             wxr, page_data, base_data, forms_data, level_node, title_text
         )
+        if len(page_data) == last_data_len and title_text in LINKAGE_SECTIONS:
+            extract_linkage_section(
+                wxr,
+                page_data[-1] if len(page_data) > 0 else base_data,
+                level_node,
+                LINKAGE_SECTIONS[title_text],
+            )
     elif title_text == "Uitspraak":
         extract_sound_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node

diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py
@@ -10,7 +10,7 @@
     extract_example_template,
 )
 from .models import AltForm, Sense, WordEntry
-from .section_titles import POS_DATA
+from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .tags import (
     GLOSS_TAG_TEMPLATES,
     LIST_ITEM_TAG_TEMPLATES,
@@ -40,6 +40,8 @@ def extract_pos_section(
         forms_data.forms.clear()
         forms_data.categories.clear()
     extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)
+    if len(page_data[-1].senses) == 0:
+        page_data.pop()
 
 
 def extract_pos_section_nodes(
@@ -65,7 +67,7 @@ def extract_pos_section_nodes(
                 extract_gloss_list_item(wxr, page_data[-1], list_item)
         elif isinstance(node, LevelNode):
             title_text = clean_node(wxr, None, node.largs)
-            if title_text in POS_DATA:
+            if title_text in POS_DATA and title_text not in LINKAGE_SECTIONS:
                 # expanded from "eng-onv-d" form-of template
                 from .page import parse_section
 
@@ -91,10 +93,12 @@ def extract_pos_section_nodes(
                 "pronom-dem-form",
                 "pronom-pos-form",
                 "xh-pronom-pos-form",
+                "oudeschrijfwijze",
             ]
             or node.template_name.endswith(
                 ("adjc-form", "adverb-form", "noun-form")
             )
+            or re.search(r"-dec\d+", node.template_name) is not None
         ):
             extract_noun_form_of_template(wxr, page_data[-1], node)
         elif isinstance(node, TemplateNode) and (
@@ -199,12 +203,15 @@ def extract_l_template(
 
 # https://nl.wiktionary.org/wiki/Sjabloon:noun-pl
 # https://nl.wiktionary.org/wiki/Sjabloon:noun-form
+# https://nl.wiktionary.org/wiki/Sjabloon:oudeschrijfwijze
 # "getal" and "gesl" args
 NOUN_FORM_OF_TEMPLATE_NUM_TAGS = {
     "s": "singular",
     "p": "plural",
     "d": "dual",
     "c": "collective",
+    "a": "animate",
+    "i": "inanimate",
 }
 NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = {
     "m": "masculine",
@@ -237,6 +244,19 @@ def extract_noun_form_of_template(
         elif isinstance(gender_tag, list):
             sense.tags.extend(gender_tag)
 
+    # Sjabloon:oudeschrijfwijze: "g" arg may be a gender or number code
+    g_arg = t_node.template_parameters.get("g", "")
+    for tags_dict in [
+        NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
+        NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
+    ]:
+        if g_arg in tags_dict:
+            tag = tags_dict[g_arg]
+            if isinstance(tag, str):
+                sense.tags.append(tag)
+            elif isinstance(tag, list):
+                sense.tags.extend(tag)
+
     form_of = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
     if form_of != "":
         sense.form_of.append(AltForm(word=form_of))

diff --git a/src/wiktextract/extractor/nl/section_titles.py b/src/wiktextract/extractor/nl/section_titles.py
@@ -51,6 +51,7 @@
 
 
 LINKAGE_SECTIONS = {
+    "Afkorting": "abbreviations",
     "Anagrammen": "anagrams",
     "Antoniemen": "antonyms",
     "Afgeleide begrippen": "derived",

diff --git a/tests/test_nl_inflection.py b/tests/test_nl_inflection.py
@@ -52,7 +52,7 @@ def test_nlnoun_different_pos(self):
 {{-l-|m}}
 #voorste deel van een [[wapen]]
 ====Werkwoord====
-{{1ps|lopen}}""",
+# gloss""",
         )
         self.assertEqual(len(data), 2)
         self.assertEqual(

diff --git a/tests/test_nl_linkage.py b/tests/test_nl_linkage.py
@@ -168,3 +168,16 @@ def test_sense_text_after_link(self):
                 }
             ],
         )
+
+    def test_abbr(self):
+        """An "Afkorting" subsection fills the `abbreviations` field."""
+        # The heading title is a wiki link wrapped in italics markup.
+        wikitext = """==Nederlands==
+====Zelfstandig naamwoord====
+# het akkoord
+=====''[[WikiWoordenboek:Afkorting|Afkorting]]''=====
+*[[A]]"""
+        entries = parse_page(self.wxr, "A grote terts", wikitext)
+        # The "Afkorting" heading must not yield a second entry.
+        self.assertEqual(len(entries), 1)
+        self.assertEqual(entries[0]["abbreviations"], [{"word": "A"}])