Extract POS for Spanish Wiktionary

This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR-10-LABX-0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
tatuylonen · Nov 23, 2023 · a821647
1 parent e2f8b2f
commit a821647
Show file tree

Hide file tree

Showing 6 changed files with 201 additions and 10 deletions.
diff --git a/src/wiktextract/data/es/other_subtitles.json b/src/wiktextract/data/es/other_subtitles.json
@@ -0,0 +1,5 @@
+{
+  "etymology": ["Etimología"],
+  "pronunciation": ["pronunciación"],
+  "ignored_sections": ["Véase también"]
+}
diff --git a/src/wiktextract/data/es/pos_subtitles.json b/src/wiktextract/data/es/pos_subtitles.json
@@ -0,0 +1,90 @@
+{
+  "abreviatura": { "pos": "abbrev" },
+  "acrónimo": { "pos": "abbrev" },
+  "adjetivo": { "pos": "adj" },
+  "adjetivo cardinal": { "pos": "num" },
+  "adjetivo demostrativo": { "pos": "adj" },
+  "adjetivo indefinido": { "pos": "adj" },
+  "adjetivo indeterminado": { "pos": "adj" },
+  "adjetivo interrogativo": { "pos": "adj" },
+  "adjetivo numeral": { "pos": "num" },
+  "adjetivo ordinal": { "pos": "num" },
+  "adjetivo posesivo": { "pos": "adj" },
+  "adjetivo relativo": { "pos": "adj" },
+  "adverbio": { "pos": "adv" },
+  "adverbio comparativo": { "pos": "adv" },
+  "adverbio de afirmación": { "pos": "adv" },
+  "adverbio de cantidad": { "pos": "adv" },
+  "adverbio de duda": { "pos": "adv" },
+  "adverbio de lugar": { "pos": "adv" },
+  "adverbio de modo": { "pos": "adv" },
+  "adverbio de negación": { "pos": "adv" },
+  "adverbio de orden": { "pos": "adv" },
+  "adverbio de tiempo": { "pos": "adv" },
+  "adverbio demostrativo": { "pos": "adv" },
+  "adverbio interrogativo": { "pos": "adv" },
+  "adverbio relativo": { "pos": "adv" },
+  "afijo": { "pos": "affix" },
+  "artículo": { "pos": "article" },
+  "artículo determinado": { "pos": "article" },
+  "artículo indeterminado": { "pos": "article" },
+  "circunfijo": { "pos": "circumfix" },
+  "conjunción": { "pos": "conj" },
+  "conjunción adversativa": { "pos": "conj" },
+  "conjunción ilativa": { "pos": "conj" },
+  "dígrafo": { "pos": "character" },
+  "expresión": { "pos": "phrase" },
+  "forma verbal": { "pos": "verb" },
+  "interjección": { "pos": "intj" },
+  "letra": { "pos": "character" },
+  "locución": { "pos": "phrase" },
+  "locución adjetiva": { "pos": "phrase" },
+  "locución adverbial": { "pos": "phrase" },
+  "locución conjuntiva": { "pos": "phrase" },
+  "locución interjectiva": { "pos": "phrase" },
+  "locución prepositiva": { "pos": "phrase" },
+  "locución pronominal": { "pos": "phrase" },
+  "locución sustantiva": { "pos": "phrase" },
+  "locución verbal": { "pos": "phrase" },
+  "onomatopeya": { "pos": "noun" },
+  "partícula": { "pos": "particle" },
+  "postposición": { "pos": "postp" },
+  "prefijo": { "pos": "prefix" },
+  "preposición": { "pos": "prep" },
+  "preposición de ablativo": { "pos": "prep" },
+  "preposición de acusativo": { "pos": "prep" },
+  "preposición de acusativo o ablativo": { "pos": "prep" },
+  "preposición de genitivo": { "pos": "prep" },
+  "pronombre": { "pos": "pron" },
+  "pronombre demostrativo": { "pos": "pron" },
+  "pronombre indefinido": { "pos": "pron" },
+  "pronombre interrogativo": { "pos": "pron" },
+  "pronombre personal": { "pos": "pron" },
+  "pronombre posesivo": { "pos": "det" },
+  "pronombre relativo": { "pos": "pron" },
+  "refrán": { "pos": "proverb" },
+  "sigla": { "pos": "abbrev" },
+  "sufijo": { "pos": "suffix" },
+  "sufijo flexivo": { "pos": "suffix" },
+  "sustantivo": { "pos": "noun" },
+  "sustantivo ambiguo": { "pos": "noun" },
+  "sustantivo animado": { "pos": "noun" },
+  "sustantivo común": { "pos": "noun" },
+  "sustantivo femenino": { "pos": "noun" },
+  "sustantivo femenino y masculino": { "pos": "noun" },
+  "sustantivo inanimado": { "pos": "noun" },
+  "sustantivo masculino": { "pos": "noun" },
+  "sustantivo neutro": { "pos": "noun" },
+  "sustantivo neutro y masculino": { "pos": "noun" },
+  "sustantivo propio": { "pos": "name" },
+  "sustantivo propio/pruebas": { "pos": "name" },
+  "símbolo": { "pos": "symbol" },
+  "verbo": { "pos": "verb" },
+  "verbo auxiliar": { "pos": "verb" },
+  "verbo impersonal": { "pos": "verb" },
+  "verbo intransitivo": { "pos": "verb" },
+  "verbo modal": { "pos": "verb" },
+  "verbo perfectivo": { "pos": "verb" },
+  "verbo pronominal": { "pos": "verb" },
+  "verbo transitivo": { "pos": "verb" }
+}
diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py
@@ -6,6 +6,7 @@
 from collections import defaultdict
 from functools import lru_cache, partial
 from typing import Any, Dict, Iterable, List, Tuple
+from wiktextract.extractor.es.models import BaseModelWrap
 
 from wiktextract.wxr_context import WiktextractContext
 
@@ -30,7 +31,7 @@ def data_append(
     """Appends ``value`` under ``key`` in the dictionary ``data``.  The key
     is created if it does not exist."""
     assert isinstance(wxr, WiktextractContext)
-    assert isinstance(data, dict)
+    assert isinstance(data, dict) or isinstance(data, BaseModelWrap)
     assert isinstance(key, str)
 
     if key in str_keys:
@@ -69,7 +70,8 @@ def make_split_re(seps):
     """Cached helper function for split_at_comma_semi."""
 
 
-def split_at_comma_semi(text: str, separators=(",", ";", "，", "،"), extra=()
+def split_at_comma_semi(
+    text: str, separators=(",", ";", "，", "،"), extra=()
 ) -> List[str]:
     """Splits the text at commas and semicolons, unless they are inside
     parenthesis.  ``separators`` is default separators (setting it eliminates

diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
@@ -3,7 +3,7 @@
 
 import logging
 
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, model_validator, ValidationError
 from pydantic.json_schema import GenerateJsonSchema
 
 from wiktextract.wxr_context import WiktextractContext
@@ -28,16 +28,22 @@ class Config:
         validate_assignment = True
 
     def update(self, data: dict):
-        update = self.dict(exclude_defaults=True, exclude_none=True)
-        update.update(data)
-        for k, v in (
-            self.validate(update)
-            .dict(exclude_defaults=True, exclude_none=True)
-            .items()
-        ):
+        for k, v in data.items():
             setattr(self, k, v)
         return self
 
+    def get(self, key: str, _=None):
+        return getattr(self, key)
+
+    def __getitem__(self, item):
+        return getattr(self, item)
+
+    def __setitem__(self, item, value):
+        try:
+            setattr(self, item, value)
+        except ValidationError:
+            pass
+
 
 class LoggingExtraFieldsModel(BaseModelWrap):
     @model_validator(mode="before")
@@ -80,6 +86,7 @@ class WordEntry(LoggingExtraFieldsModel):
 
     word: str = Field(description="word string")
     pos: str = Field(default=None, description="Part of speech type")
+    pos_title: str = Field(default=None, description="Original POS title")
     lang_code: str = Field(
         description="Wiktionary language code", examples=["es"]
     )

diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
@@ -4,6 +4,8 @@
 from typing import Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
+from wiktextract.datautils import append_base_data
+from wiktextract.extractor.es.pronunciation import extract_pronunciation
 from wiktextract.extractor.es.models import WordEntry, PydanticLogger
 
 from wiktextract.page import clean_node
@@ -28,6 +30,58 @@ def parse_section(
     base_data: Dict,
     level_node: WikiNode,
 ) -> None:
+    # Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura
+    subtitle = clean_node(wxr, page_data[-1], level_node.largs)
+    wxr.wtp.start_subsection(subtitle)
+
+    pos_template_name = None
+    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
+        pos_template_name = level_node_template.template_name
+
+    if subtitle in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
+        pass
+
+    elif pos_template_name and pos_template_name in wxr.config.POS_SUBTITLES:
+        process_pos_block(
+            wxr, page_data, base_data, level_node, pos_template_name, subtitle
+        )
+    else:
+        wxr.wtp.debug(
+            f"Unprocessed section: {subtitle}",
+            sortid="extractor/es/page/parse_section/48",
+        )
+
+
+def process_pos_block(
+    wxr: WiktextractContext,
+    page_data: List[Dict],
+    base_data: Dict,
+    pos_level_node: WikiNode,
+    pos_template_name: str,
+    pos_title: str,
+):
+    pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"]
+    append_base_data(page_data, "pos", pos_type, base_data)
+    page_data[-1]["pos_title"] = pos_title
+    child_nodes = list(pos_level_node.filter_empty_str_child())
+
+    for child in child_nodes:
+        if (
+            isinstance(child, WikiNode)
+            and child.kind == NodeKind.TEMPLATE
+            and (
+                "inflect" in child.template_name
+                or "v.conj" in child.template_name
+            )
+        ):
+            # XXX: Extract forms
+            pass
+        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
+            # XXX: Extract data
+            pass
+        else:
+            # XXX: Extract data
+            pass
     pass
 
 
@@ -58,6 +112,12 @@ def parse_page(
             if subtitle_template.template_name == "lengua":
                 categories_and_links = defaultdict(list)
                 lang_code = subtitle_template.template_parameters.get(1)
+                if (
+                    wxr.config.capture_language_codes is not None
+                    and lang_code not in wxr.config.capture_language_codes
+                ):
+                    continue
+
                 lang_name = clean_node(
                     wxr, categories_and_links, subtitle_template
                 )
@@ -70,4 +130,22 @@ def parse_page(
                 for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                     parse_section(wxr, page_data, base_data, level3_node)
 
+                for not_level3_node in level2_node.invert_find_child(
+                    NodeKind.LEVEL3
+                ):
+                    if (
+                        isinstance(not_level3_node, WikiNode)
+                        and not_level3_node.kind == NodeKind.TEMPLATE
+                        and not_level3_node.template_name == "pron-graf"
+                    ):
+                        if wxr.config.capture_pronunciation:
+                            extract_pronunciation(
+                                wxr, page_data[-1], not_level3_node
+                            )
+                    else:
+                        wxr.wtp.debug(
+                            f"Found unexpected child in level 2 'lengua' node: {not_level3_node}",
+                            sortid="extractor/es/page/parse_page/80",
+                        )
+
     return [d.model_dump(exclude_defaults=True) for d in page_data]
diff --git a/src/wiktextract/extractor/es/pronunciation.py b/src/wiktextract/extractor/es/pronunciation.py
@@ -0,0 +1,9 @@
+from wiktextract.wxr_context import WiktextractContext
+from typing import Dict, List
+from wikitextprocessor import WikiNode
+
+
+def extract_pronunciation(
+    wxr: WiktextractContext, page_data: List[Dict], template_node: WikiNode
+) -> None:
+    pass