Skip to content

Commit

Permalink
[it] extract gloss lists
Browse files Browse the repository at this point in the history
"Appendix" and "Project" namespace pages are added because they are
used in section title templates "Template:Intestazione voce" and
"Template:Sezione voce"
  • Loading branch information
xxyzz committed Dec 11, 2024
1 parent e314ba5 commit b64f6dc
Show file tree
Hide file tree
Showing 7 changed files with 343 additions and 0 deletions.
9 changes: 9 additions & 0 deletions src/wiktextract/data/it/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"save_ns_names": [
"Main",
"Template",
"Module",
"Project",
"Appendix"
]
}
84 changes: 84 additions & 0 deletions src/wiktextract/extractor/it/analyze_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from wikitextprocessor import Page, Wtp

# Full page titles (namespace prefix "Template:" included) of the Italian
# Wiktionary templates that render a section heading.  analyze_template()
# below tests `page.title` against this set, so every entry must match the
# stored page title exactly.
# Language heading templates such as "Template:-it-" are intentionally NOT
# listed here.
SECTION_TITLE_TEMPLATES = {
    # Part-of-speech headings
    # https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
    "Template:-acron-",
    "Template:-art-",
    "Template:-avv-",
    "Template:-class-",
    "Template:-cong-",
    "Template:-espr-",
    "Template:-hanzi-",
    "Template:-inter-",
    "Template:-kanpr-",
    "Template:-loc agg-",
    "Template:-loc avv-",
    "Template:-loc cong-",
    "Template:-loc inter-",
    "Template:-loc nom-",
    "Template:-loc nom form-",
    "Template:-loc prep-",
    "Template:-loc verb-",
    "Template:-nome form-",
    "Template:-nome-",
    "Template:-sost form-",
    "Template:-part-",
    "Template:-posp-",
    "Template:-prep-",
    "Template:-pron dim-",
    "Template:-pron indef-",
    "Template:-pron interrog-",
    "Template:-pron poss-",
    "Template:-pron rel-",
    "Template:-pron rifl-",
    "Template:-pronome-",
    "Template:-pron form-",
    "Template:-sost-",
    "Template:-voce verb-",
    # Part-of-speech headings listed in the "updated templates" category
    # https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
    "Template:-agg-",
    "Template:-agg dim-",
    "Template:-agg nom-",
    "Template:-agg num-",
    "Template:-agg poss-",
    "Template:-cifr-",
    "Template:-lett-",
    "Template:-prefissoide-",
    "Template:-suffissoide-",
    "Template:-pref-",
    "Template:-interp-",
    "Template:-suff-",
    "Template:-verb-",
    # Part-of-speech headings for inflected adjective forms
    # https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
    "Template:-agg form-",
    "Template:-agg num form-",
    # Other (non part-of-speech) section headings
    # https://it.wiktionary.org/wiki/Categoria:Template_sezione
    "Template:-esempio-",
    "Template:-iperon-",
    "Template:-ipon-",
    "Template:-noconf-",
    "Template:-rel-",
    "Template:-sill-",
    "Template:-sin-",
    "Template:-uso-",
    "Template:-var-",
    "Template:-alter-",
    "Template:-chat-",
    "Template:-coni-",
    "Template:-decl-",
    "Template:-der-",
    "Template:-fal-",  # pos
    "Template:-ref-",
    "Template:-pron-",
    "Template:-prov-",
    "Template:-trascrizione-",  # pos
}


def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Classify a template page for the Italian Wiktionary edition.

    Returns a 2-tuple: an always-empty set of template names, and a flag
    that is True exactly when ``page.title`` is one of
    ``SECTION_TITLE_TEMPLATES``.
    NOTE(review): the flag is presumably the "pre-expand" marker consumed
    by wikitextprocessor - confirm against its analyze-template hook.
    """
    # Language title templates such as "-it-" are not in the set, so they
    # are never flagged here (don't pre-expand them).
    is_section_title = page.title in SECTION_TITLE_TEMPLATES
    no_templates: set[str] = set()
    return no_templates, is_section_title
30 changes: 30 additions & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from pydantic import BaseModel, ConfigDict, Field


# Common base for every model of the Italian extractor; it only carries the
# shared pydantic configuration.  Documented with comments rather than a
# docstring so the generated JSON schema is left untouched.
class ItalianBaseModel(BaseModel):
    model_config = ConfigDict(
        # reject fields that are not declared on the model
        extra="forbid",
        # no implicit type coercion of input values
        strict=True,
        # re-validate whenever an attribute is assigned after construction
        validate_assignment=True,
        # run validation on default values as well
        validate_default=True,
    )


# One sense (definition) of a word entry.
class Sense(ItalianBaseModel):
    # gloss text; extract_gloss_list_item() appends the cleaned list-item text
    glosses: list[str] = []
    # normalized tags, e.g. "no-gloss" for the placeholder sense that
    # parse_page() adds to entries without any sense
    tags: list[str] = []
    # NOTE(review): presumably tag strings that are not normalized yet;
    # nothing in this commit writes to this field
    raw_tags: list[str] = []
    # category names; the Sense object is handed to clean_node() together
    # with the gloss nodes
    categories: list[str] = []


# Top-level output record: one entry per part-of-speech section of a page.
class WordEntry(ItalianBaseModel):
    # title of the generated JSON schema
    model_config = ConfigDict(title="Italian Wiktionary")
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(description="Wiktionary language code", min_length=1)
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)
    # section heading text the `pos` value was looked up from
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    # NOTE(review): presumably tag strings that are not normalized yet;
    # nothing in this commit writes to this field
    raw_tags: list[str] = []
60 changes: 60 additions & 0 deletions src/wiktextract/extractor/it/page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import Any

from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Handle one heading node, then recurse into its sub-headings.

    The cleaned heading text is looked up in ``POS_DATA``; on a match the
    section is passed to ``extract_pos_section``, which appends a new entry
    to ``page_data``.  Child level nodes are visited whether or not the
    heading itself matched.
    """
    heading = clean_node(wxr, None, level_node.largs)
    is_pos_heading = heading in POS_DATA
    if is_pos_heading:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)

    # Depth-first walk over every nested heading of any level.
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Extract all word entries from one Italian Wiktionary page.

    Each level-2 heading is treated as a language section.  Its cleaned
    heading text becomes the language name and the name of the first
    template in the heading, stripped of "-", becomes the language code.
    Sections whose code is filtered out by
    ``wxr.config.capture_language_codes`` are skipped.

    Returns one dict per extracted entry, dumped with defaults excluded.
    Entries that ended up without any sense get a single placeholder sense
    tagged "no-gloss".
    """
    # page layout
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_cats: dict[str, Any] = {}
        # WordEntry.lang requires min_length=1: fall back to "unknown" so a
        # heading that cleans to an empty string does not abort the whole
        # page with a validation error.
        lang_name = clean_node(wxr, lang_cats, level2_node.largs) or "unknown"
        lang_code = "unknown"
        for lang_template in level2_node.find_content(NodeKind.TEMPLATE):
            # Only the first template in the heading is considered; the same
            # min_length=1 constraint applies to WordEntry.lang_code.
            lang_code = lang_template.template_name.strip("-") or "unknown"
            break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        # Template entry that every POS section of this language deep-copies.
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
43 changes: 43 additions & 0 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .section_titles import POS_DATA


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill its senses.

    The entry is a deep copy of ``base_data`` with ``pos_title``, ``pos``
    and any extra tags taken from ``POS_DATA[pos_title]``.  Direct link
    children of the section are run through ``clean_node`` with the entry
    as target, and every list whose marker consists of "#" at both ends is
    read as a gloss list.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # The cleaned text is discarded; the call is made only for what
    # clean_node() records on `entry`.
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Turn one gloss list item into a ``Sense`` on ``word_entry``.

    Nested list nodes among the item's children are left out; the rest is
    cleaned into the gloss string.  Nothing is added to the entry when the
    cleaned text is empty.
    """
    sense = Sense()
    gloss_nodes = [
        child
        for child in list_item.children
        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST
    ]
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text == "":
        return
    sense.glosses.append(gloss_text)
    word_entry.senses.append(sense)
62 changes: 62 additions & 0 deletions src/wiktextract/extractor/it/section_titles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Maps the rendered text of a part-of-speech section heading to the "pos"
# value (and optional extra "tags") stored on the extracted entry.
# The headings are produced by the templates in these categories:
# https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
# https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
# https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
POS_DATA = {
    "Acronimo / Abbreviazione": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Articolo": {"pos": "article"},
    "Avverbio": {"pos": "adv"},
    "Classificatore": {"pos": "classifier"},
    "Congiunzione": {"pos": "conj"},
    "Espressione": {"pos": "phrase"},
    "Hanzi": {"pos": "character", "tags": ["hanzi"]},
    "Interiezione": {"pos": "intj"},
    "Pronuncia kanji": {"pos": "character", "tags": ["kanji"]},
    "Locuzione aggettivale": {"pos": "phrase", "tags": ["adjective"]},
    "Locuzione avverbiale": {"pos": "adv_phrase"},
    "Locuzione congiuntiva": {"pos": "phrase", "tags": ["conjunctive"]},
    "Locuzione interiettiva": {"pos": "phrase", "tags": ["interjection"]},
    "Locuzione nominale": {"pos": "phrase", "tags": ["noun"]},
    "Locuzione nominale, forma flessa": {
        "pos": "phrase",
        "tags": ["noun", "form-of"],
    },
    "Locuzione prepositiva": {"pos": "prep_phrase"},
    "Locuzione verbale": {"pos": "phrase", "tags": ["verb"]},
    "Nome proprio, forma flessa": {"pos": "name", "tags": ["form-of"]},
    "Nome proprio": {"pos": "name"},
    "Particella": {"pos": "particle"},
    "Posposizione": {"pos": "postp"},
    "Preposizione": {"pos": "prep"},
    "Pronome dimostrativo": {"pos": "pron", "tags": ["demonstrative"]},
    "Pronome indefinito": {"pos": "pron", "tags": ["indefinite"]},
    "Pronome interrogativo": {"pos": "pron", "tags": ["interrogative"]},
    "Pronome possessivo": {"pos": "pron", "tags": ["possessive"]},
    "Pronome relativo": {"pos": "pron", "tags": ["relative"]},
    "Pronome riflessivo": {"pos": "pron", "tags": ["reflexive"]},
    "Pronome": {"pos": "pron"},
    "Pronome, forma flessa": {"pos": "pron", "tags": ["form-of"]},
    "Sostantivo": {"pos": "noun"},
    "Sostantivo, forma flessa": {"pos": "noun", "tags": ["form-of"]},
    "Verbo": {"pos": "verb"},
    "Voce verbale": {"pos": "verb", "tags": ["form-of"]},
    "Lettera": {"pos": "character", "tags": ["letter"]},
    "Prefisso": {"pos": "prefix", "tags": ["morpheme"]},
    "Aggettivo": {"pos": "adj"},
    "Aggettivo dimostrativo": {"pos": "adj", "tags": ["demonstrative"]},
    "Aggettivo nominale": {"pos": "adj_noun"},
    "Aggettivo numerale": {"pos": "adj", "tags": ["numeral"]},
    "Aggettivo possessivo": {"pos": "adj", "tags": ["possessive"]},
    "Cifra": {"pos": "num"},
    "Prefissoide": {"pos": "prefix", "tags": ["morpheme"]},
    "Segno di interpunzione": {"pos": "punct", "tags": ["punctuation"]},
    "Suffisso": {"pos": "suffix", "tags": ["morpheme"]},
    # Counterpart of "Prefissoide" for the "-suffissoide-" heading template.
    # NOTE(review): heading text assumed to follow the "Prefissoide"
    # pattern - confirm against Template:-suffissoide-.
    "Suffissoide": {"pos": "suffix", "tags": ["morpheme"]},
    "Aggettivo, forma flessa": {"pos": "adj", "tags": ["form-of"]},
    "Aggettivo numerale, forma flessa": {
        "pos": "adj",
        "tags": ["numeral", "form-of"],
    },
    "Abbreviazione in uso nelle chat": {
        "pos": "abbrev",
        "tags": ["abbreviation"],
    },
}
55 changes: 55 additions & 0 deletions tests/test_it_gloss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItGloss(TestCase):
    """Gloss extraction tests for the Italian Wiktionary extractor."""

    maxDiff = None

    def setUp(self) -> None:
        # Fresh context for each test; capture_language_codes=None turns
        # off the language filter applied in parse_page().
        wtp = Wtp(lang_code="it")
        config = WiktionaryConfig(
            dump_file_lang_code="it", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_gloss_list(self):
        # A one-item gloss list under a POS heading: the template output
        # ends up in the gloss text and its category link on the sense.
        wtp = self.wxr.wtp
        wtp.add_page("Template:-it-", 10, "Italiano")
        wtp.add_page(
            "Template:Term",
            10,
            "<small>(<i>[[mammalogia]]</i>)</small>[[Categoria:Mammalogia-IT]]",
        )
        wikitext = """== {{-it-}} ==
===[[Image:Open_book_01.svg|30px|]]''[[sostantivo|Sostantivo]]''===
[[Categoria:Sostantivi in italiano]]
# {{Term|mammalogia|it}} [[animale]]"""
        expected_sense = {
            "glosses": ["(mammalogia) animale"],
            "categories": ["Mammalogia-IT"],
        }
        expected = [
            {
                "categories": ["Sostantivi in italiano"],
                "lang": "Italiano",
                "lang_code": "it",
                "word": "cane",
                "pos": "noun",
                "pos_title": "Sostantivo",
                "senses": [expected_sense],
            }
        ]
        data = parse_page(self.wxr, "cane", wikitext)
        self.assertEqual(data, expected)

0 comments on commit b64f6dc

Please sign in to comment.