# Every entry is the page title "Template:-<name>-"; only the bare names are
# spelled out below, grouped by the category page they were collected from.
SECTION_TITLE_TEMPLATES = {
    f"Template:-{name}-"
    for name in (
        # POS titles
        # https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
        "acron",
        "art",
        "avv",
        "class",
        "cong",
        "espr",
        "hanzi",
        "inter",
        "kanpr",
        "loc agg",
        "loc avv",
        "loc cong",
        "loc inter",
        "loc nom",
        "loc nom form",
        "loc prep",
        "loc verb",
        "nome form",
        "nome",
        "sost form",
        "part",
        "posp",
        "prep",
        "pron dim",
        "pron indef",
        "pron interrog",
        "pron poss",
        "pron rel",
        "pron rifl",
        "pronome",
        "pron form",
        "sost",
        "voce verb",
        # POS titles
        # https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
        "agg",
        "agg dim",
        "agg nom",
        "agg num",
        "agg poss",
        "cifr",
        "lett",
        "prefissoide",
        "suffissoide",
        "pref",
        "interp",
        "suff",
        "verb",
        # POS
        # https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
        "agg form",
        "agg num form",
        # other sections
        # https://it.wiktionary.org/wiki/Categoria:Template_sezione
        "esempio",
        "iperon",
        "ipon",
        "noconf",
        "rel",
        "sill",
        "sin",
        "uso",
        "var",
        "alter",
        "chat",  # pos
        "coni",
        "decl",
        "der",
        "fal",  # pos
        "ref",
        "pron",
        "prov",
        "trascrizione",  # pos
        # https://it.wiktionary.org/wiki/Categoria:Template_vocabolo
        "etim",
        "trad",
        "ant",
        "cod",  # pos
        "carhi",  # pos
        "quote",
    )
}


def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Tell the caller whether *page* is a section title template.

    Only templates listed in ``SECTION_TITLE_TEMPLATES`` are flagged for
    pre-expansion; language title templates such as "-it-" are deliberately
    not in that set, so they are not pre-expanded.

    Returns:
        A pair whose first element is always an empty set (presumably the
        templates called by this page - confirm against the
        wikitextprocessor hook contract) and whose second element is True
        when ``page.title`` is one of the section title templates.
    """
    needs_pre_expand = page.title in SECTION_TITLE_TEMPLATES
    return set(), needs_pre_expand
def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Turn one example list item into an ``Example`` stored on *sense*.

    Italic children supply the example text and the items of a nested list
    supply the translation. When several italic nodes or several nested list
    items are present, the last one wins. Children that are not ``WikiNode``
    objects (plain strings) are ignored, and nothing is stored when no
    example text was found.
    """
    example = Example()
    for child in list_item.children:
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.ITALIC:
            example.text = clean_node(wxr, sense, child)
        elif child.kind == NodeKind.LIST:
            for translation_item in child.find_child(NodeKind.LIST_ITEM):
                example.translation = clean_node(
                    wxr, sense, translation_item.children
                )

    if example.text == "":
        return
    sense.examples.append(example)
# Common base of all Italian extractor models.
class ItalianBaseModel(BaseModel):
    # Unknown fields are rejected, types are checked strictly, and both
    # attribute assignments and default values go through validation.
    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        validate_assignment=True,
        validate_default=True,
    )


# One usage example belonging to a sense.
class Example(ItalianBaseModel):
    # example sentence
    text: str = ""
    # translation of the example sentence
    translation: str = ""
    # NOTE(review): presumably the source/reference of the example; nothing
    # in the visible extractor code fills it yet - confirm
    ref: str = ""


# One meaning of a word entry.
class Sense(ItalianBaseModel):
    glosses: list[str] = []
    tags: list[str] = []
    # labels taken verbatim from the page, e.g. the text of a "Term" template
    raw_tags: list[str] = []
    categories: list[str] = []
    examples: list[Example] = []


# One extracted entry: a word in one language under one part-of-speech section.
class WordEntry(ItalianBaseModel):
    # NOTE(review): pydantic should merge this with the inherited config, so
    # the strictness settings of ItalianBaseModel still apply - confirm
    model_config = ConfigDict(title="Italian Wiktionary")
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(description="Wiktionary language code", min_length=1)
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)
    # section title the part of speech was derived from
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Process one section heading and then recurse into its subsections.

    When the cleaned heading text is a key of ``POS_DATA`` the section is
    handed to ``extract_pos_section``, which appends a new entry to
    *page_data*. Child sections are visited regardless of whether the
    heading was recognized.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Extract all word entries from one Italian Wiktionary page.

    Every level-2 section is treated as a language section: its heading
    gives the language name and the first template in the heading gives the
    language code (the template name with the surrounding dashes removed).

    Args:
        wxr: Extraction context holding the parser and the configuration.
        page_title: Title of the page, used as the word of every entry.
        page_text: Wikitext of the page.

    Returns:
        One dict per extracted entry, dumped with default values excluded.
        Entries without any sense get a single sense tagged "no-gloss".
    """
    # page layout
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_cats = {}
        # WordEntry.lang and WordEntry.lang_code are declared with
        # min_length=1, so an empty string would raise a validation error
        # and abort the whole page; fall back to "unknown" instead.
        lang_name = clean_node(wxr, lang_cats, level2_node.largs) or "unknown"
        lang_code = "unknown"
        first_template = next(
            iter(level2_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is not None:
            lang_code = first_template.template_name.strip("-") or "unknown"
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill its senses.

    The entry starts as a deep copy of *base_data*, then receives the
    section title plus the "pos" value and optional "tags" registered for
    that title in ``POS_DATA``. Gloss lists are the direct child lists whose
    marker both starts and ends with "#".
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # The cleaned text is discarded: the call is made only so that data
    # carried by the links (e.g. category links) is recorded on the entry.
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Build a ``Sense`` from one gloss list item.

    "Term" templates become raw tags (surrounding parentheses and
    whitespace stripped), nested lists whose marker ends with "*" are read
    as examples, other nested lists are dropped, and every remaining child
    contributes to the gloss text. The sense is stored on *word_entry* only
    when the gloss text is not empty.
    """
    sense = Sense()
    gloss_nodes = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name == "Term":
                raw_tag = clean_node(wxr, sense, child).strip("() \n")
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
            else:
                gloss_nodes.append(child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
        else:
            gloss_nodes.append(child)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text == "":
        return
    sense.glosses.append(gloss_text)
    word_entry.senses.append(sense)
# Maps the text of a part-of-speech section heading to the "pos" value and
# the optional extra "tags" stored on the extracted entry.
# https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
# https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
# https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
POS_DATA = {
    "Acronimo / Abbreviazione": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Articolo": {"pos": "article"},
    "Avverbio": {"pos": "adv"},
    "Classificatore": {"pos": "classifier"},
    "Congiunzione": {"pos": "conj"},
    "Espressione": {"pos": "phrase"},
    "Hanzi": {"pos": "character", "tags": ["hanzi"]},
    "Interiezione": {"pos": "intj"},
    "Pronuncia kanji": {"pos": "character", "tags": ["kanji"]},
    "Locuzione aggettivale": {"pos": "phrase", "tags": ["adjective"]},
    "Locuzione avverbiale": {"pos": "adv_phrase"},
    "Locuzione congiuntiva": {"pos": "phrase", "tags": ["conjunctive"]},
    "Locuzione interiettiva": {"pos": "phrase", "tags": ["interjection"]},
    "Locuzione nominale": {"pos": "phrase", "tags": ["noun"]},
    "Locuzione nominale, forma flessa": {
        "pos": "phrase",
        "tags": ["noun", "form-of"],
    },
    "Locuzione prepositiva": {"pos": "prep_phrase"},
    "Locuzione verbale": {"pos": "phrase", "tags": ["verb"]},
    "Nome proprio, forma flessa": {"pos": "name", "tags": ["form-of"]},
    "Nome proprio": {"pos": "name"},
    "Particella": {"pos": "particle"},
    "Posposizione": {"pos": "postp"},
    "Preposizione": {"pos": "prep"},
    "Pronome dimostrativo": {"pos": "pron", "tags": ["demonstrative"]},
    "Pronome indefinito": {"pos": "pron", "tags": ["indefinite"]},
    "Pronome interrogativo": {"pos": "pron", "tags": ["interrogative"]},
    "Pronome possessivo": {"pos": "pron", "tags": ["possessive"]},
    "Pronome relativo": {"pos": "pron", "tags": ["relative"]},
    "Pronome riflessivo": {"pos": "pron", "tags": ["reflexive"]},
    "Pronome": {"pos": "pron"},
    "Pronome, forma flessa": {"pos": "pron", "tags": ["form-of"]},
    "Sostantivo": {"pos": "noun"},
    "Sostantivo, forma flessa": {"pos": "noun", "tags": ["form-of"]},
    "Verbo": {"pos": "verb"},
    "Voce verbale": {"pos": "verb", "tags": ["form-of"]},
    "Lettera": {"pos": "character", "tags": ["letter"]},
    "Prefisso": {"pos": "prefix", "tags": ["morpheme"]},
    "Aggettivo": {"pos": "adj"},
    "Aggettivo dimostrativo": {"pos": "adj", "tags": ["demonstrative"]},
    "Aggettivo nominale": {"pos": "adj_noun"},
    "Aggettivo numerale": {"pos": "adj", "tags": ["numeral"]},
    "Aggettivo possessivo": {"pos": "adj", "tags": ["possessive"]},
    "Cifra": {"pos": "num"},
    "Prefissoide": {"pos": "prefix", "tags": ["morpheme"]},
    # Counterpart of "Prefissoide" for Template:-suffissoide-, which is
    # listed among the POS title templates.
    # NOTE(review): heading text assumed to be "Suffissoide" - confirm
    # against the template's expansion.
    "Suffissoide": {"pos": "suffix", "tags": ["morpheme"]},
    "Segno di interpunzione": {"pos": "punct", "tags": ["punctuation"]},
    "Suffisso": {"pos": "suffix", "tags": ["morpheme"]},
    "Aggettivo, forma flessa": {"pos": "adj", "tags": ["form-of"]},
    "Aggettivo numerale, forma flessa": {
        "pos": "adj",
        "tags": ["numeral", "form-of"],
    },
    "Abbreviazione in uso nelle chat": {
        "pos": "abbrev",
        "tags": ["abbreviation"],
    },
    "Codice / Simbolo": {"pos": "symbol"},
    "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
}
class TestItExample(TestCase):
    """Tests for example extraction in the Italian Wiktionary extractor."""

    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def tearDown(self) -> None:
        # Release the database connection opened by the Wtp created in
        # setUp so it does not leak from one test to the next.
        self.wxr.wtp.close_db_conn()

    def test_list_example(self):
        # An italic "#*" item is the example text and the nested "#*:" item
        # is its translation.
        self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
        data = parse_page(
            self.wxr,
            "dog",
            """== {{-br-}} ==
===Sostantivo===
# mutazione
#* ''Da '''dog''', e '''dog'''.''
#*: Il tuo cappello, il suo cappello.""",
        )
        self.assertEqual(
            data[0]["senses"],
            [
                {
                    "glosses": ["mutazione"],
                    "examples": [
                        {
                            "text": "Da dog, e dog.",
                            "translation": "Il tuo cappello, il suo cappello.",
                        }
                    ],
                }
            ],
        )
class TestItGloss(TestCase):
    """Tests for gloss extraction in the Italian Wiktionary extractor."""

    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def tearDown(self) -> None:
        # Release the database connection opened by the Wtp created in
        # setUp so it does not leak from one test to the next.
        self.wxr.wtp.close_db_conn()

    def test_gloss_list(self):
        # The "Term" template yields a raw tag and a sense category; the
        # category link under the heading lands on the entry itself.
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        self.wxr.wtp.add_page(
            "Template:Term",
            10,
            "([[mammalogia]])[[Categoria:Mammalogia-IT]]",
        )
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===[[Image:Open_book_01.svg|30px|]]''[[sostantivo|Sostantivo]]''===
[[Categoria:Sostantivi in italiano]]

# {{Term|mammalogia|it}} [[animale]]""",
        )
        self.assertEqual(
            data,
            [
                {
                    "categories": ["Sostantivi in italiano"],
                    "lang": "Italiano",
                    "lang_code": "it",
                    "word": "cane",
                    "pos": "noun",
                    "pos_title": "Sostantivo",
                    "senses": [
                        {
                            "glosses": ["animale"],
                            "raw_tags": ["mammalogia"],
                            "categories": ["Mammalogia-IT"],
                        }
                    ],
                }
            ],
        )