From 44e35cb1f5d61949bf0fe53c573638c8b1b9dfcf Mon Sep 17 00:00:00 2001 From: Empiriker Date: Tue, 28 Nov 2023 09:29:01 +0100 Subject: [PATCH] Extract linkages from Spanish Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- json_schema/es.json | 314 +++++++++++++++++- .../data/es/linkage_subtitles.json | 4 + src/wiktextract/data/es/other_subtitles.json | 6 +- src/wiktextract/extractor/es/linkage.py | 80 ++++- src/wiktextract/extractor/es/models.py | 33 +- src/wiktextract/extractor/es/page.py | 20 +- src/wiktextract/extractor/es/sense_data.py | 7 +- tests/test_es_linkage.py | 142 ++++++++ 8 files changed, 581 insertions(+), 25 deletions(-) create mode 100644 tests/test_es_linkage.py diff --git a/json_schema/es.json b/json_schema/es.json index eba0bd5c..7dc1d6f2 100644 --- a/json_schema/es.json +++ b/json_schema/es.json @@ -40,6 +40,45 @@ "title": "Example", "type": "object" }, + "Linkage": { + "additionalProperties": false, + "properties": { + "alternative_spelling": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Alternative spelling of the word", + "title": "Alternative Spelling" + }, + "note": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Note" + }, + "word": { + "title": "Word", + "type": "string" + } + }, + "required": [ + "word" + ], + "title": "Linkage", + "type": "object" + }, "Reference": { "additionalProperties": false, "properties": { @@ -193,6 +232,21 @@ "Sense": { "additionalProperties": false, "properties": { + "antonyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": 
"null" + } + ], + "default": [], + "title": "Antonyms" + }, "categories": { "default": [], "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", @@ -202,6 +256,36 @@ "title": "Categories", "type": "array" }, + "compounds": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Compounds" + }, + "derived": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Derived" + }, "examples": { "default": [], "description": "List of examples", @@ -219,6 +303,81 @@ "title": "Glosses", "type": "array" }, + "hypernyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hypernyms" + }, + "hyponyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hyponyms" + }, + "idioms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Idioms" + }, + "meronyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Meronyms" + }, + "related": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Related" + }, "senseid": { "anyOf": [ { @@ -232,14 +391,20 @@ "description": "Sense number used in Wiktionary", "title": "Senseid" }, - "subsenses": { + "synonyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], "default": [], - "description": "List of subsenses", - "items": { - "$ref": "#/$defs/Sense" - }, - 
"title": "Subsenses", - "type": "array" + "title": "Synonyms" }, "tags": { "default": [], @@ -463,6 +628,21 @@ "additionalProperties": false, "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", "properties": { + "antonyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Antonyms" + }, "categories": { "default": [], "description": "list of non-disambiguated categories for the word", @@ -472,6 +652,81 @@ "title": "Categories", "type": "array" }, + "compounds": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Compounds" + }, + "derived": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Derived" + }, + "hypernyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hypernyms" + }, + "hyponyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hyponyms" + }, + "idioms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Idioms" + }, "lang_code": { "description": "Wiktionary language code", "examples": [ @@ -488,6 +743,21 @@ "title": "Lang Name", "type": "string" }, + "meronyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Meronyms" + }, "pos": { "default": null, "description": "Part of speech type", @@ -500,6 +770,21 @@ "title": "Pos Title", "type": "string" }, + "related": { + "anyOf": [ + { + "items": { + 
"$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Related" + }, "senses": { "anyOf": [ { @@ -545,6 +830,21 @@ "default": [], "title": "Spellings" }, + "synonyms": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Linkage" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Synonyms" + }, "translations": { "anyOf": [ { diff --git a/src/wiktextract/data/es/linkage_subtitles.json b/src/wiktextract/data/es/linkage_subtitles.json index b00b03fa..f960b6a6 100644 --- a/src/wiktextract/data/es/linkage_subtitles.json +++ b/src/wiktextract/data/es/linkage_subtitles.json @@ -1,9 +1,13 @@ { "antónimo": "antonyms", + "compuestos": "compounds", "derivad": "derived", "hipónimo": "hyponyms", "hiperónimo": "hypernyms", "merónimo": "meronyms", + "locucion": "idioms", + "locuciones": "idioms", "relacionado": "related", + "refranes": "proverbs", "sinónimo": "synonyms" } diff --git a/src/wiktextract/data/es/other_subtitles.json b/src/wiktextract/data/es/other_subtitles.json index badd6f79..31e0af4f 100644 --- a/src/wiktextract/data/es/other_subtitles.json +++ b/src/wiktextract/data/es/other_subtitles.json @@ -1,5 +1,5 @@ { - "etymology": ["Etimología"], - "ignored_sections": ["Véase también"], - "translations": ["Traducciones", "Traducción"] + "etymology": ["etimología"], + "ignored_sections": ["véase también"], + "translations": ["traducciones", "traducción"] } diff --git a/src/wiktextract/extractor/es/linkage.py b/src/wiktextract/extractor/es/linkage.py index 7286790d..5c74feda 100644 --- a/src/wiktextract/extractor/es/linkage.py +++ b/src/wiktextract/extractor/es/linkage.py @@ -1,20 +1,88 @@ +from typing import Union + +from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import WikiNodeChildrenList -from wiktextract.extractor.es.models import WordEntry +from wiktextract.extractor.es.models import Linkage, Sense, WordEntry +from wiktextract.page import 
clean_node from wiktextract.wxr_context import WiktextractContext def extract_linkage( wxr: WiktextractContext, - page_data: list[WordEntry], - nodes: WikiNodeChildrenList, + data_container: Union[WordEntry, Sense], + node: WikiNode, + linkage_type: str, ): - pass + if not linkage_type in data_container.model_fields: + wxr.wtp.debug( + f"Linkage type {linkage_type} not found in pydantic model", + sortid="extractor/es/linkage/extract_linkage/20", + ) + return + + for link_node in node.find_child_recursively(NodeKind.LINK): + word = clean_node(wxr, {}, link_node) + if word: + getattr(data_container, linkage_type).append(Linkage(word=word)) + + for template_node in node.find_child_recursively(NodeKind.TEMPLATE): + if template_node.template_name == "l": + word = clean_node(wxr, {}, template_node) + if word: + getattr(data_container, linkage_type).append(Linkage(word=word)) + + +def process_linkage_template( + wxr: WiktextractContext, + data_container: Union[WordEntry, Sense], + template_node: WikiNode, +): + linkage_type = wxr.config.LINKAGE_SUBTITLES.get( + template_node.template_name.removesuffix("s") + ) + if not linkage_type in data_container.model_fields: + wxr.wtp.debug( + f"Linkage type {linkage_type} not found in pydantic model", + sortid="extractor/es/linkage/process_linkage_template/51", + ) + return + + for key, value_raw in template_node.template_parameters.items(): + value = clean_node(wxr, {}, value_raw) + if isinstance(key, int): + getattr(data_container, linkage_type).append(Linkage(word=value)) + + elif isinstance(key, str): + if key.startswith("nota"): + idx = int(key[4:]) - 1 if len(key) > 4 else 0 + + if len(getattr(data_container, linkage_type)) > idx: + getattr(data_container, linkage_type)[idx].note = value + + elif key.startswith("alt"): + idx = int(key[3:]) - 1 if len(key) > 3 else 0 + + if len(getattr(data_container, linkage_type)) > idx: + getattr(data_container, linkage_type)[ + idx + ].alternative_spelling = value def 
process_linkage_list_children( wxr: WiktextractContext, - page_data: list[WordEntry], + data_container: Union[WordEntry, Sense], nodes: WikiNodeChildrenList, + linkage_type: str, ): - pass + if not linkage_type in data_container.model_fields: + wxr.wtp.debug( + f"Linkage type {linkage_type} not found in pydantic model", + sortid="extractor/es/linkage/process_linkage_list_children/89", + ) + return + for node in nodes: + if isinstance(node, WikiNode) and node.kind == NodeKind.LINK: + word = clean_node(wxr, {}, node) + if word: + getattr(data_container, linkage_type).append(Linkage(word=word)) diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index cff79ebf..97732fec 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -7,6 +7,14 @@ class BaseModelWrap(BaseModel): model_config = ConfigDict(validate_assignment=True, extra="forbid") +class Linkage(BaseModelWrap): + word: str + note: Optional[str] = Field(default=None) + alternative_spelling: Optional[str] = Field( + default=None, description="Alternative spelling of the word" + ) + + class Translation(BaseModelWrap): word: str = Field(description="Translation term") lang_code: str = Field( @@ -71,12 +79,21 @@ class Sense(BaseModelWrap): examples: list["Example"] = Field( default=[], description="List of examples" ) - subsenses: list["Sense"] = Field( - default=[], description="List of subsenses" - ) + # subsenses: list["Sense"] = Field( + # default=[], description="List of subsenses" + # ) senseid: Optional[int] = Field( default=None, description="Sense number used in Wiktionary" ) + antonyms: Optional[list[Linkage]] = [] + compounds: Optional[list[Linkage]] = [] + derived: Optional[list[Linkage]] = [] + hyponyms: Optional[list[Linkage]] = [] + hypernyms: Optional[list[Linkage]] = [] + idioms: Optional[list[Linkage]] = [] + meronyms: Optional[list[Linkage]] = [] + related: Optional[list[Linkage]] = [] + synonyms: 
Optional[list[Linkage]] = [] class Spelling(BaseModelWrap): @@ -119,6 +136,7 @@ class WordEntry(BaseModelWrap): """ WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract. """ + model_config = ConfigDict(title="Spanish Wiktionary") word: str = Field(description="word string") @@ -138,3 +156,12 @@ class WordEntry(BaseModelWrap): sounds: Optional[list[Sound]] = [] spellings: Optional[list[Spelling]] = [] translations: Optional[list[Translation]] = [] + antonyms: Optional[list[Linkage]] = [] + compounds: Optional[list[Linkage]] = [] + derived: Optional[list[Linkage]] = [] + hyponyms: Optional[list[Linkage]] = [] + hypernyms: Optional[list[Linkage]] = [] + idioms: Optional[list[Linkage]] = [] + meronyms: Optional[list[Linkage]] = [] + related: Optional[list[Linkage]] = [] + synonyms: Optional[list[Linkage]] = [] diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py index 024bdb9a..bddf6085 100644 --- a/src/wiktextract/extractor/es/page.py +++ b/src/wiktextract/extractor/es/page.py @@ -7,7 +7,10 @@ from wiktextract.extractor.es.example import extract_example from wiktextract.extractor.es.gloss import extract_gloss -from wiktextract.extractor.es.linkage import extract_linkage +from wiktextract.extractor.es.linkage import ( + extract_linkage, + process_linkage_template, +) from wiktextract.extractor.es.models import WordEntry from wiktextract.extractor.es.pronunciation import process_pron_graf_template from wiktextract.extractor.es.sense_data import process_sense_data_list @@ -98,14 +101,14 @@ def parse_section( https://es.wiktionary.org/wiki/Wikcionario:Estructura """ - section_title = clean_node(wxr, base_data, level_node.largs) + section_title = clean_node(wxr, base_data, level_node.largs).lower() wxr.wtp.start_subsection(section_title) pos_template_name = None for level_node_template in level_node.find_content(NodeKind.TEMPLATE): pos_template_name = 
level_node_template.template_name - if re.match(r"Etimología \d+", section_title): + if re.match(r"etimología \d+", section_title): parse_entries(wxr, page_data, base_data, level_node) elif section_title in wxr.config.OTHER_SUBTITLES["ignored_sections"]: @@ -132,8 +135,13 @@ def parse_section( for template_node in level_node.find_child_recursively( NodeKind.TEMPLATE ): - if template_node.template_name == "t+" and len(page_data)>0: + if template_node.template_name == "t+" and len(page_data) > 0: extract_translation(wxr, page_data[-1], template_node) + elif section_title in wxr.config.LINKAGE_SUBTITLES: + linkage_type = wxr.config.LINKAGE_SUBTITLES[section_title] + + extract_linkage(wxr, page_data[-1], level_node, linkage_type) + else: wxr.wtp.debug( f"Unprocessed section: {section_title}", @@ -246,7 +254,9 @@ def process_group( elif ( template_name.removesuffix("s") in wxr.config.LINKAGE_SUBTITLES ): - extract_linkage(wxr, page_data, group) + process_linkage_template( + wxr, page_data[-1].senses[-1], group[0] + ) elif template_name in ["ejemplo", "ejemplos", "ejemplo_y_trad"]: extract_example(wxr, page_data[-1].senses[-1], group) elif template_name == "uso": diff --git a/src/wiktextract/extractor/es/sense_data.py b/src/wiktextract/extractor/es/sense_data.py index 21191f3b..27882550 100644 --- a/src/wiktextract/extractor/es/sense_data.py +++ b/src/wiktextract/extractor/es/sense_data.py @@ -32,7 +32,12 @@ def process_sense_data_list( if list_type == "ejemplo": process_example_list(wxr, sense_data, list_item) elif list_type in wxr.config.LINKAGE_SUBTITLES: - process_linkage_list_children(wxr, sense_data, children[1:]) + process_linkage_list_children( + wxr, + sense_data, + children[1:], + wxr.config.LINKAGE_SUBTITLES.get(list_type), + ) elif list_type == "ámbito": # XXX: Extract scope tag pass diff --git a/tests/test_es_linkage.py b/tests/test_es_linkage.py new file mode 100644 index 00000000..9ab69f6d --- /dev/null +++ b/tests/test_es_linkage.py @@ -0,0 +1,142 @@ 
+import unittest + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.es.linkage import ( + extract_linkage, + process_linkage_list_children, + process_linkage_template, +) +from wiktextract.extractor.es.models import Sense +from wiktextract.wxr_context import WiktextractContext + + +class TestESLinkage(unittest.TestCase): + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="es"), + WiktionaryConfig(dump_file_lang_code="es"), + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + + def get_default_sense_data(self) -> Sense: + return Sense(glosses=["gloss1"]) + + def test_es_extract_linkage(self): + test_cases = [ + # https://es.wiktionary.org/wiki/Fett + { + "input": "* {{l|de|Fettgewebe}}: ''tejido adiposo''", + "expected": [{"word": "Fettgewebe"}], + }, + # https://es.wiktionary.org/wiki/presunción + { + "input": "* [[presunción absoluta]]\n* [[presunción de hecho y de derecho]]", + "expected": [ + {"word": "presunción absoluta"}, + {"word": "presunción de hecho y de derecho"}, + ], + }, + ] + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + self.wxr.wtp.add_page("Plantilla:l", 10, "Fettgewebe") + sense_data = self.get_default_sense_data() + + root = self.wxr.wtp.parse(case["input"]) + + extract_linkage( + self.wxr, sense_data, root.children[0], "compounds" + ) + + linkages = [ + t.model_dump(exclude_defaults=True) + for t in sense_data.compounds + ] + self.assertEqual( + linkages, + case["expected"], + ) + + def test_es_process_linkage_template(self): + # Test cases from https://es.wiktionary.org/wiki/Plantilla:sinónimo + test_cases = [ + { + "input": "{{sinónimo|leng=la|nasus|alt=nāsus}}", + "expected": [ + {"word": "nasus", "alternative_spelling": "nāsus"} + ], + }, + { + "input": "{{sinónimo|automóvil|coche|nota2=España|carro|nota3=Colombia, Estados Unidos, México, Venezuela}}", + "expected": [ + {"word": "automóvil"}, + { +
"word": "coche", + "note": "España", + }, + { + "word": "carro", + "note": "Colombia, Estados Unidos, México, Venezuela", + }, + ], + }, + ] + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + sense_data = self.get_default_sense_data() + + root = self.wxr.wtp.parse(case["input"]) + + process_linkage_template(self.wxr, sense_data, root.children[0]) + + linkages = [ + t.model_dump(exclude_defaults=True) + for t in sense_data.synonyms + ] + self.assertEqual( + linkages, + case["expected"], + ) + + def test_process_linkage_list_children(self): + test_cases = [ + # https://es.wiktionary.org/wiki/abalanzar + { + "input": ":*'''Sinónimos:''' [[balancear]], [[contrapesar]], [[equilibrar]], [[nivelar]] [[estabilizar]]", + "expected": [ + {"word": "balancear"}, + {"word": "contrapesar"}, + {"word": "equilibrar"}, + {"word": "nivelar"}, + {"word": "estabilizar"}, + ], + }, + ] + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + sense_data = self.get_default_sense_data() + + root = self.wxr.wtp.parse(case["input"]) + + process_linkage_list_children( + self.wxr, + sense_data, + root.children[0].children[0].children[1:], + "synonyms", + ) + + linkages = [ + t.model_dump(exclude_defaults=True) + for t in sense_data.synonyms + ] + self.assertEqual( + linkages, + case["expected"], + )