Skip to content

Commit

Permalink
Extract POS for Spanish Wiktionary
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
  • Loading branch information
empiriker committed Nov 23, 2023
1 parent e2f8b2f commit a821647
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 10 deletions.
5 changes: 5 additions & 0 deletions src/wiktextract/data/es/other_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"etymology": ["Etimología"],
"pronunciation": ["pronunciación"],
"ignored_sections": ["Véase también"]
}
90 changes: 90 additions & 0 deletions src/wiktextract/data/es/pos_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"abreviatura": { "pos": "abbrev" },
"acrónimo": { "pos": "abbrev" },
"adjetivo": { "pos": "adj" },
"adjetivo cardinal": { "pos": "num" },
"adjetivo demostrativo": { "pos": "adj" },
"adjetivo indefinido": { "pos": "adj" },
"adjetivo indeterminado": { "pos": "adj" },
"adjetivo interrogativo": { "pos": "adj" },
"adjetivo numeral": { "pos": "num" },
"adjetivo ordinal": { "pos": "num" },
"adjetivo posesivo": { "pos": "adj" },
"adjetivo relativo": { "pos": "adj" },
"adverbio": { "pos": "adv" },
"adverbio comparativo": { "pos": "adv" },
"adverbio de afirmación": { "pos": "adv" },
"adverbio de cantidad": { "pos": "adv" },
"adverbio de duda": { "pos": "adv" },
"adverbio de lugar": { "pos": "adv" },
"adverbio de modo": { "pos": "adv" },
"adverbio de negación": { "pos": "adv" },
"adverbio de orden": { "pos": "adv" },
"adverbio de tiempo": { "pos": "adv" },
"adverbio demostrativo": { "pos": "adv" },
"adverbio interrogativo": { "pos": "adv" },
"adverbio relativo": { "pos": "adv" },
"afijo": { "pos": "affix" },
"artículo": { "pos": "article" },
"artículo determinado": { "pos": "article" },
"artículo indeterminado": { "pos": "article" },
"circunfijo": { "pos": "circumfix" },
"conjunción": { "pos": "conj" },
"conjunción adversativa": { "pos": "conj" },
"conjunción ilativa": { "pos": "conj" },
"dígrafo": { "pos": "character" },
"expresión": { "pos": "phrase" },
"forma verbal": { "pos": "verb" },
"interjección": { "pos": "intj" },
"letra": { "pos": "character" },
"locución": { "pos": "phrase" },
"locución adjetiva": { "pos": "phrase" },
"locución adverbial": { "pos": "phrase" },
"locución conjuntiva": { "pos": "phrase" },
"locución interjectiva": { "pos": "phrase" },
"locución prepositiva": { "pos": "phrase" },
"locución pronominal": { "pos": "phrase" },
"locución sustantiva": { "pos": "phrase" },
"locución verbal": { "pos": "phrase" },
"onomatopeya": { "pos": "noun" },
"partícula": { "pos": "particle" },
"postposición": { "pos": "postp" },
"prefijo": { "pos": "prefix" },
"preposición": { "pos": "prep" },
"preposición de ablativo": { "pos": "prep" },
"preposición de acusativo": { "pos": "prep" },
"preposición de acusativo o ablativo": { "pos": "prep" },
"preposición de genitivo": { "pos": "prep" },
"pronombre": { "pos": "pron" },
"pronombre demostrativo": { "pos": "pron" },
"pronombre indefinido": { "pos": "pron" },
"pronombre interrogativo": { "pos": "pron" },
"pronombre personal": { "pos": "pron" },
"pronombre posesivo": { "pos": "det" },
"pronombre relativo": { "pos": "pron" },
"refrán": { "pos": "proverb" },
"sigla": { "pos": "abbrev" },
"sufijo": { "pos": "suffix" },
"sufijo flexivo": { "pos": "suffix" },
"sustantivo": { "pos": "noun" },
"sustantivo ambiguo": { "pos": "noun" },
"sustantivo animado": { "pos": "noun" },
"sustantivo común": { "pos": "noun" },
"sustantivo femenino": { "pos": "noun" },
"sustantivo femenino y masculino": { "pos": "noun" },
"sustantivo inanimado": { "pos": "noun" },
"sustantivo masculino": { "pos": "noun" },
"sustantivo neutro": { "pos": "noun" },
"sustantivo neutro y masculino": { "pos": "noun" },
"sustantivo propio": { "pos": "name" },
"sustantivo propio/pruebas": { "pos": "name" },
"símbolo": { "pos": "symbol" },
"verbo": { "pos": "verb" },
"verbo auxiliar": { "pos": "verb" },
"verbo impersonal": { "pos": "verb" },
"verbo intransitivo": { "pos": "verb" },
"verbo modal": { "pos": "verb" },
"verbo perfectivo": { "pos": "verb" },
"verbo pronominal": { "pos": "verb" },
"verbo transitivo": { "pos": "verb" }
}
6 changes: 4 additions & 2 deletions src/wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from collections import defaultdict
from functools import lru_cache, partial
from typing import Any, Dict, Iterable, List, Tuple
from wiktextract.extractor.es.models import BaseModelWrap

from wiktextract.wxr_context import WiktextractContext

Expand All @@ -30,7 +31,7 @@ def data_append(
"""Appends ``value`` under ``key`` in the dictionary ``data``. The key
is created if it does not exist."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(data, dict)
assert isinstance(data, dict) or isinstance(data, BaseModelWrap)
assert isinstance(key, str)

if key in str_keys:
Expand Down Expand Up @@ -69,7 +70,8 @@ def make_split_re(seps):
"""Cached helper function for split_at_comma_semi."""


def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=()
def split_at_comma_semi(
text: str, separators=(",", ";", ",", "،"), extra=()
) -> List[str]:
"""Splits the text at commas and semicolons, unless they are inside
parenthesis. ``separators`` is default separators (setting it eliminates
Expand Down
23 changes: 15 additions & 8 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import logging

from pydantic import BaseModel, Field, model_validator
from pydantic import BaseModel, Field, model_validator, ValidationError
from pydantic.json_schema import GenerateJsonSchema

from wiktextract.wxr_context import WiktextractContext
Expand All @@ -28,16 +28,22 @@ class Config:
validate_assignment = True

def update(self, data: dict):
update = self.dict(exclude_defaults=True, exclude_none=True)
update.update(data)
for k, v in (
self.validate(update)
.dict(exclude_defaults=True, exclude_none=True)
.items()
):
for k, v in data.items():
setattr(self, k, v)
return self

def get(self, key: str, _=None):
    """Return the attribute named ``key``, dict-style.

    Mirrors ``dict.get``: the second argument (kept under its original
    name ``_`` so existing callers keep working) is the fallback that is
    returned when the instance has no attribute called ``key``.
    Previously the fallback was accepted but ignored, so a missing
    attribute raised ``AttributeError`` instead.
    """
    return getattr(self, key, _)

def __getitem__(self, item):
    """Support ``obj[item]`` by delegating to attribute lookup.

    Raises ``AttributeError`` (from ``getattr``) when no attribute
    named ``item`` exists.
    """
    attribute_value = getattr(self, item)
    return attribute_value

def __setitem__(self, item, value):
    """Support ``obj[item] = value`` by delegating to ``setattr``.

    The assignment is best-effort: if it raises ``ValidationError`` the
    value is dropped and the instance is left unchanged. The rejection
    is logged at debug level rather than being discarded silently, so
    lost data can be traced.
    """
    try:
        setattr(self, item, value)
    except ValidationError as err:
        # Keep the original non-raising contract, but leave a trace.
        logging.getLogger(__name__).debug(
            "Ignored invalid assignment to %r: %s", item, err
        )


class LoggingExtraFieldsModel(BaseModelWrap):
@model_validator(mode="before")
Expand Down Expand Up @@ -80,6 +86,7 @@ class WordEntry(LoggingExtraFieldsModel):

word: str = Field(description="word string")
pos: str = Field(default=None, description="Part of speech type")
pos_title: str = Field(default=None, description="Original POS title")
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
)
Expand Down
78 changes: 78 additions & 0 deletions src/wiktextract/extractor/es/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
from wiktextract.datautils import append_base_data
from wiktextract.extractor.es.pronunciation import extract_pronunciation
from wiktextract.extractor.es.models import WordEntry, PydanticLogger

from wiktextract.page import clean_node
Expand All @@ -28,6 +30,58 @@ def parse_section(
base_data: Dict,
level_node: WikiNode,
) -> None:
# Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura
subtitle = clean_node(wxr, page_data[-1], level_node.largs)
wxr.wtp.start_subsection(subtitle)

pos_template_name = None
for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
pos_template_name = level_node_template.template_name

if subtitle in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
pass

elif pos_template_name and pos_template_name in wxr.config.POS_SUBTITLES:
process_pos_block(
wxr, page_data, base_data, level_node, pos_template_name, subtitle
)
else:
wxr.wtp.debug(
f"Unprocessed section: {subtitle}",
sortid="extractor/es/page/parse_section/48",
)


def process_pos_block(
    wxr: WiktextractContext,
    page_data: List[Dict],
    base_data: Dict,
    pos_level_node: WikiNode,
    pos_template_name: str,
    pos_title: str,
):
    """Record the part of speech of a POS section.

    The POS type is read from
    ``wxr.config.POS_SUBTITLES[pos_template_name]["pos"]`` and handed to
    ``append_base_data`` under the key ``"pos"``; ``pos_title`` is then
    stored as ``"pos_title"`` on the last element of ``page_data``.

    The section's non-empty children are walked afterwards, but no data
    is extracted from them yet: every branch is still a placeholder.
    """
    pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"]
    append_base_data(page_data, "pos", pos_type, base_data)
    page_data[-1]["pos_title"] = pos_title

    for node in list(pos_level_node.filter_empty_str_child()):
        if not isinstance(node, WikiNode):
            # XXX: Extract data
            continue

        if node.kind == NodeKind.TEMPLATE and any(
            marker in node.template_name for marker in ("inflect", "v.conj")
        ):
            # XXX: Extract forms
            continue

        if node.kind == NodeKind.LIST:
            # XXX: Extract data
            continue

        # XXX: Extract data


Expand Down Expand Up @@ -58,6 +112,12 @@ def parse_page(
if subtitle_template.template_name == "lengua":
categories_and_links = defaultdict(list)
lang_code = subtitle_template.template_parameters.get(1)
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue

lang_name = clean_node(
wxr, categories_and_links, subtitle_template
)
Expand All @@ -70,4 +130,22 @@ def parse_page(
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)

for not_level3_node in level2_node.invert_find_child(
NodeKind.LEVEL3
):
if (
isinstance(not_level3_node, WikiNode)
and not_level3_node.kind == NodeKind.TEMPLATE
and not_level3_node.template_name == "pron-graf"
):
if wxr.config.capture_pronunciation:
extract_pronunciation(
wxr, page_data[-1], not_level3_node
)
else:
wxr.wtp.debug(
f"Found unexpected child in level 2 'lengua' node: {not_level3_node}",
sortid="extractor/es/page/parse_page/80",
)

return [d.model_dump(exclude_defaults=True) for d in page_data]
9 changes: 9 additions & 0 deletions src/wiktextract/extractor/es/pronunciation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from wiktextract.wxr_context import WiktextractContext
from typing import Dict, List
from wikitextprocessor import WikiNode


def extract_pronunciation(
    wxr: WiktextractContext, page_data: List[Dict], template_node: WikiNode
) -> None:
    """Placeholder for pronunciation extraction; currently does nothing.

    NOTE(review): ``page_data`` is annotated as a list, but the visible
    call site passes ``page_data[-1]`` (a single entry) -- confirm which
    is intended before implementing this.
    """

0 comments on commit a821647

Please sign in to comment.