Merge pull request #890 from xxyzz/nl

[nl] improve gloss, linkage, translation section code
tatuylonen · Oct 28, 2024 · 658a856
2 parents 7675eda + 6ad8ca9
commit 658a856
Show file tree

Hide file tree

Showing 6 changed files with 197 additions and 53 deletions.
diff --git a/src/wiktextract/extractor/nl/linkage.py b/src/wiktextract/extractor/nl/linkage.py
@@ -5,6 +5,7 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .models import Linkage, WordEntry
+from .tags import LIST_ITEM_TAG_TEMPLATES
 
 
 def extract_linkage_section(
@@ -73,24 +74,53 @@ def extract_linkage_list_item(
     sense: str,
     sense_index: str,
 ) -> None:
-    for node in list_item.children:
+    linkage_list = getattr(word_entry, linkage_type)
+    orig_len = len(linkage_list)
+    tags = []
+    for index, node in enumerate(list_item.children):
         if isinstance(node, str):
             m = re.search(r"\[(\d+)\]", node)
             if m is not None:
                 sense_index = int(m.group(1))
-            elif node.strip().startswith("="):
-                sense = node.strip().removeprefix("=").strip()
-                linkage_list = getattr(word_entry, linkage_type)
-                if len(linkage_list) > 0:
+            elif node.strip().startswith(("=", "–")):
+                sense = clean_node(wxr, None, list_item.children[index:]).strip(
+                    "=– "
+                )
+                if len(linkage_list) > orig_len:
                     linkage_list[-1].sense = sense
+                else:
+                    word_nodes = [
+                        n
+                        for n in list_item.children[:index]
+                        if not isinstance(n, TemplateNode)
+                    ]
+                    word = clean_node(wxr, None, word_nodes)
+                    if word != "":
+                        linkage_list.append(
+                            Linkage(
+                                word=word,
+                                sense=sense,
+                                sense_index=sense_index,
+                                tags=tags,
+                            )
+                        )
+                return
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
             word = clean_node(wxr, None, node)
             if word != "":
-                getattr(word_entry, linkage_type).append(
+                linkage_list.append(
                     Linkage(word=word, sense=sense, sense_index=sense_index)
                 )
-        elif isinstance(node, TemplateNode) and node.template_name == "expr":
-            extract_expr_template(wxr, word_entry, node, linkage_type)
+        elif isinstance(node, TemplateNode):
+            if node.template_name == "expr":
+                extract_expr_template(wxr, word_entry, node, linkage_type)
+            elif node.template_name in LIST_ITEM_TAG_TEMPLATES:
+                if len(linkage_list) > orig_len:
+                    linkage_list[-1].tags.append(
+                        LIST_ITEM_TAG_TEMPLATES[node.template_name]
+                    )
+                else:
+                    tags.append(LIST_ITEM_TAG_TEMPLATES[node.template_name])
 
 
 def extract_nld_template(

diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py
@@ -11,7 +11,11 @@
 )
 from .models import AltForm, Sense, WordEntry
 from .section_titles import POS_DATA
-from .tags import translate_raw_tags
+from .tags import (
+    GLOSS_TAG_TEMPLATES,
+    LIST_ITEM_TAG_TEMPLATES,
+    translate_raw_tags,
+)
 
 
 def extract_pos_section(
@@ -74,34 +78,46 @@ def extract_pos_section_nodes(
             and len(page_data[-1].senses) > 0
         ):
             extract_example_template(wxr, page_data[-1].senses[-1], node)
-        elif isinstance(node, TemplateNode) and node.template_name in [
-            "noun-pl",
-            "noun-form",
-        ]:
+        elif isinstance(node, TemplateNode) and (
+            node.template_name
+            in [
+                "noun-pl",
+                "nl-advb-form",
+                "noun-dim",
+                "noun-dim-pl",
+                "num-form",
+                "ordn-form",
+                "prep-form",
+                "pronom-dem-form",
+                "pronom-pos-form",
+                "xh-pronom-pos-form",
+            ]
+            or node.template_name.endswith(
+                ("adjc-form", "adverb-form", "noun-form")
+            )
+        ):
             extract_noun_form_of_template(wxr, page_data[-1], node)
-        elif isinstance(node, TemplateNode) and node.template_name.startswith(
-            (
-                "1ps",
-                "2ps",
-                "aanv-w",
-                "onv-d",
-                "ott-",
-                "ovt-",
-                "tps",
-                "volt-d",
-                "eng-onv-d",
+        elif isinstance(node, TemplateNode) and (
+            node.template_name.startswith(
+                (
+                    "1ps",
+                    "2ps",
+                    "aanv-w",
+                    "onv-d",
+                    "ott-",
+                    "ovt-",
+                    "tps",
+                    "volt-d",
+                    "eng-onv-d",
+                )
             )
+            or node.template_name.endswith("verb-form")
         ):
             extract_verb_form_of_template(
                 wxr, page_data, base_data, forms_data, node
             )
 
 
-# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
-# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
-GLOSS_TAG_TEMPLATES = frozenset(["auxl", "erga", "inerg"])
-
-
 def extract_gloss_list_item(
     wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
 ) -> None:
@@ -111,6 +127,8 @@ def extract_gloss_list_item(
         if isinstance(child, TemplateNode):
             if child.template_name in GLOSS_TAG_TEMPLATES:
                 sense.raw_tags.append(clean_node(wxr, sense, child))
+            elif child.template_name in LIST_ITEM_TAG_TEMPLATES:
+                sense.tags.append(LIST_ITEM_TAG_TEMPLATES[child.template_name])
             else:
                 expanded_text = clean_node(wxr, sense, child)
                 if expanded_text.startswith("(") and expanded_text.endswith(
@@ -133,7 +151,7 @@ def extract_gloss_list_item(
             gloss_nodes.append(child)
 
     gloss_text = clean_node(wxr, sense, gloss_nodes)
-    if gloss_text.startswith(","):  # between qualifier templates
+    while gloss_text.startswith(","):  # between qualifier templates
         gloss_text = gloss_text.removeprefix(",").strip()
     m = re.match(r"\(([^()]+)\)", gloss_text)
     if m is not None:  # expanded "verouderd" template in "2ps" template
@@ -153,8 +171,12 @@ def extract_pos_header_line_nodes(
             m = re.search(r"\[(.+)\]", node.strip())
             if m is not None:
                 word_entry.etymology_index = m.group(1).strip()
-        elif isinstance(node, TemplateNode) and node.template_name == "-l-":
-            extract_l_template(wxr, word_entry, node)
+        elif isinstance(node, TemplateNode):
+            if node.template_name == "-l-":
+                extract_l_template(wxr, word_entry, node)
+            elif node.template_name == "dimt":
+                word_entry.raw_tags.append(clean_node(wxr, word_entry, node))
+    translate_raw_tags(word_entry)
 
 
 def extract_l_template(
@@ -198,8 +220,9 @@ def extract_l_template(
 def extract_noun_form_of_template(
     wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
 ) -> None:
+    # https://nl.wiktionary.org/wiki/Categorie:Vormsjablonen
     sense = Sense(tags=["form-of"])
-    if t_node.template_name == "noun-pl":
+    if t_node.template_name.endswith("-pl"):
         sense.tags.append("plural")
     else:
         num_arg = t_node.template_parameters.get("getal", "")
@@ -236,6 +259,7 @@ def extract_verb_form_of_template(
     t_node: TemplateNode,
 ) -> None:
     # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
+    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen
     from .page import extract_section_categories
 
     orig_data_len = len(page_data)

diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py
@@ -1,12 +1,32 @@
 from .models import WordEntry
 
+# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
 # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
-VERB_TAGS = {
-    "ergatief": "ergative",  # Sjabloon:erga
-    "inergatief": "unergative",  # Sjabloon:inerg
-    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
-}
+GLOSS_TAG_TEMPLATES = frozenset(
+    [
+        "absol",
+        "accus",
+        "auxl",
+        "copl",
+        "deponens",
+        "ditr",
+        "erga",
+        "inerg",
+        "intr",
+        "modl",
+        "onpr",
+        "ov",
+        "rcpq",
+        "refl",
+        "s-verb",
+        "plurt",
+        "singt",
+        "versterkend voorvoegsel",
+    ]
+)
 
+
+# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
 # https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
 GLOSS_TAGS = {
     "figuurlijk": "figuratively",
@@ -55,6 +75,23 @@
     "zegswijze": "idiomatic",
     "zeldzaam": "rare",
     "Latijns-Amerika": "Latin-America",
+    "absoluut": "absolute",  # Sjabloon:absol
+    "accusatief": "accusative",  # Sjabloon:accus
+    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
+    "koppelwerkwoord": "copulative",  # Sjabloon:copl
+    "deponens": "deponent",
+    "ditransitief": "ditransitive",  # Sjabloon:ditr
+    "ergatief": "ergative",  # Sjabloon:erga
+    "inergatief": "unergative",  # Sjabloon:inerg
+    "onovergankelijk": "intransitive",  # Sjabloon:intr
+    "modaal werkwoord": ["modal", "verb"],  # Sjabloon:modl
+    "onpersoonlijk": "impersonal",  # Sjabloon:onpr
+    "overgankelijk": "transitive",  # Sjabloon:ov
+    "wederkerig": "reciprocal",  # Sjabloon:rcpq
+    "wederkerend": "reflexive",  # Sjabloon:refl
+    "alleen meervoud": "plural-only",  # Sjabloon:plurt
+    "geen meervoud": "no-plural",  # Sjabloon:singt
+    "versterkend voorvoegsel": ["intensifier", "prefix"],
 }
 
 TABLE_TAGS = {
@@ -89,7 +126,12 @@
 }
 
 
-TAGS = {**VERB_TAGS, **GLOSS_TAGS, **TABLE_TAGS}
+HEADER_LINE_TAGS = {
+    "dim. tant.": ["diminutive", "noun"],  # Sjabloon:dimt
+}
+
+
+TAGS = {**GLOSS_TAGS, **TABLE_TAGS, **HEADER_LINE_TAGS}
 
 # https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
 TOPICS = {
@@ -342,3 +384,18 @@ def translate_raw_tags(data: WordEntry) -> None:
         else:
             raw_tags.append(raw_tag)
     data.raw_tags = raw_tags
+
+
+# used in translation, linkage and gloss lists
+LIST_ITEM_TAG_TEMPLATES = {
+    "m": "masculine",
+    "f": "feminine",
+    "n": "neuter",
+    "c": "common",
+    "s": "singular",
+    "p": "plural",
+    "a": "animate",
+    "i": "inanimate",
+    "impf": "imperfective",
+    "pf": "perfective",
+}
diff --git a/src/wiktextract/extractor/nl/translation.py b/src/wiktextract/extractor/nl/translation.py
@@ -5,6 +5,7 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .models import Translation, WordEntry
+from .tags import LIST_ITEM_TAG_TEMPLATES
 
 
 def extract_translation_section(
@@ -30,18 +31,6 @@ def extract_translation_section(
                 )
 
 
-TR_TEMPLATES = {
-    "m": "masculine",
-    "f": "feminine",
-    "n": "neuter",
-    "c": "common",
-    "s": "singular",
-    "p": "plural",
-    "a": "animate",
-    "i": "inanimate",
-}
-
-
 def extract_translation_list_item(
     wxr: WiktextractContext,
     word_entry: WordEntry,
@@ -75,11 +64,11 @@ def extract_translation_list_item(
                         )
                     )
                 elif (
-                    node.template_name in TR_TEMPLATES
+                    node.template_name in LIST_ITEM_TAG_TEMPLATES
                     and len(word_entry.translations) > 0
                 ):
                     word_entry.translations[-1].tags.append(
-                        TR_TEMPLATES[node.template_name]
+                        LIST_ITEM_TAG_TEMPLATES[node.template_name]
                     )
             elif isinstance(node, str):
                 for c in node:
@@ -93,5 +82,10 @@ def extract_translation_list_item(
                             roman_str = ""
                     elif brackets > 0:
                         roman_str += c
+            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
+                    extract_translation_list_item(
+                        wxr, word_entry, next_list_item, sense, sense_index
+                    )
             elif brackets > 0:
                 roman_str += clean_node(wxr, None, node)
diff --git a/tests/test_nl_linkage.py b/tests/test_nl_linkage.py
@@ -148,3 +148,23 @@ def test_sense_text_after_link(self):
             data[0]["related"],
             [{"sense": "met grote passen lopen", "word": "benen"}],
         )
+
+        data = parse_page(
+            self.wxr,
+            "omyl",
+            """==Tsjechisch==
+====Zelfstandig naamwoord====
+# fout
+=====Typische woordcombinaties=====
+* justiční ''omyl'' {{m}}{{i}} –  justitiële ''dwaling''""",
+        )
+        self.assertEqual(
+            data[0]["derived"],
+            [
+                {
+                    "sense": "justitiële dwaling",
+                    "word": "justiční omyl",
+                    "tags": ["masculine", "inanimate"],
+                }
+            ],
+        )
diff --git a/tests/test_nl_translation.py b/tests/test_nl_translation.py
@@ -90,3 +90,22 @@ def test_plain_text_lang_name(self):
                 },
             ],
         )
+
+    def test_nested_list(self):
+        self.wxr.wtp.add_page("Sjabloon:cmn", 10, "Mandarijn")
+        data = parse_page(
+            self.wxr,
+            "kijken",
+            """==Nederlands==
+====Werkwoord====
+# met de ogen waarnemen
+=====Vertalingen=====
+* Chinees:
+** {{cmn}}: {{trad|cmn|看}}""",
+        )
+        self.assertEqual(
+            data[0]["translations"],
+            [
+                {"word": "看", "lang": "Mandarijn", "lang_code": "cmn"},
+            ],
+        )