Skip to content

Commit

Permalink
Merge pull request #944 from xxyzz/it
Browse files Browse the repository at this point in the history
[it] extract gloss and example lists
  • Loading branch information
xxyzz authored Dec 11, 2024
2 parents e314ba5 + 7a65b8c commit ba9f46d
Show file tree
Hide file tree
Showing 9 changed files with 442 additions and 0 deletions.
9 changes: 9 additions & 0 deletions src/wiktextract/data/it/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"save_ns_names": [
"Main",
"Template",
"Module",
"Project",
"Appendix"
]
}
91 changes: 91 additions & 0 deletions src/wiktextract/extractor/it/analyze_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from wikitextprocessor import Page, Wtp

# Page titles (in the "Template:" namespace) of the templates that Italian
# Wiktionary uses as section headings.  `analyze_template` reports whether a
# page title is a member of this set.  Entries carrying a trailing "# pos"
# comment sit in a non-POS category on the wiki but are noted as
# part-of-speech headings.
SECTION_TITLE_TEMPLATES = {
    # POS titles
    # https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
    "Template:-acron-",
    "Template:-art-",
    "Template:-avv-",
    "Template:-class-",
    "Template:-cong-",
    "Template:-espr-",
    "Template:-hanzi-",
    "Template:-inter-",
    "Template:-kanpr-",
    "Template:-loc agg-",
    "Template:-loc avv-",
    "Template:-loc cong-",
    "Template:-loc inter-",
    "Template:-loc nom-",
    "Template:-loc nom form-",
    "Template:-loc prep-",
    "Template:-loc verb-",
    "Template:-nome form-",
    "Template:-nome-",
    "Template:-sost form-",
    "Template:-part-",
    "Template:-posp-",
    "Template:-prep-",
    "Template:-pron dim-",
    "Template:-pron indef-",
    "Template:-pron interrog-",
    "Template:-pron poss-",
    "Template:-pron rel-",
    "Template:-pron rifl-",
    "Template:-pronome-",
    "Template:-pron form-",
    "Template:-sost-",
    "Template:-voce verb-",
    # POS titles
    # https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
    "Template:-agg-",
    "Template:-agg dim-",
    "Template:-agg nom-",
    "Template:-agg num-",
    "Template:-agg poss-",
    "Template:-cifr-",
    "Template:-lett-",
    "Template:-prefissoide-",
    "Template:-suffissoide-",
    "Template:-pref-",
    "Template:-interp-",
    "Template:-suff-",
    "Template:-verb-",
    # POS titles (adjective forms)
    # https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
    "Template:-agg form-",
    "Template:-agg num form-",
    # other (non-POS) sections
    # https://it.wiktionary.org/wiki/Categoria:Template_sezione
    "Template:-esempio-",
    "Template:-iperon-",
    "Template:-ipon-",
    "Template:-noconf-",
    "Template:-rel-",
    "Template:-sill-",
    "Template:-sin-",
    "Template:-uso-",
    "Template:-var-",
    "Template:-alter-",
    "Template:-chat-",  # pos
    "Template:-coni-",
    "Template:-decl-",
    "Template:-der-",
    "Template:-fal-",  # pos
    "Template:-ref-",
    "Template:-pron-",
    "Template:-prov-",
    "Template:-trascrizione-",  # pos
    # word-entry section templates
    # https://it.wiktionary.org/wiki/Categoria:Template_vocabolo
    "Template:-etim-",
    "Template:-trad-",
    "Template:-ant-",
    "Template:-cod-",  # pos
    "Template:-carhi-",  # pos
    "Template:-quote-",
}


def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Tell the caller whether *page* is a section title template.

    Returns a pair: an always-empty set, and ``True`` exactly when
    ``page.title`` is listed in ``SECTION_TITLE_TEMPLATES``.  ``wtp`` is
    accepted but not used.

    NOTE(review): presumably the boolean is the "pre-expand this template"
    flag consumed by wikitextprocessor - confirm against its documentation.
    """
    # don't pre-expand language title templates, like "-it-"
    is_section_title = page.title in SECTION_TITLE_TEMPLATES
    return set(), is_section_title
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/it/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from wikitextprocessor import NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, Sense


def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Turn one example list item into an ``Example`` on *sense*.

    The cleaned text of an italic child becomes the example text, and the
    cleaned content of each item of a nested list becomes the translation
    (a later match overwrites an earlier one in both cases).  The example
    is appended to ``sense.examples`` only when its text is non-empty.
    """
    example = Example()
    for child in list_item.children:
        if not isinstance(child, WikiNode):
            # plain strings carry no example data here
            continue
        if child.kind == NodeKind.ITALIC:
            example.text = clean_node(wxr, sense, child)
        elif child.kind == NodeKind.LIST:
            for translation_item in child.find_child(NodeKind.LIST_ITEM):
                example.translation = clean_node(
                    wxr, sense, translation_item.children
                )

    if example.text:
        sense.examples.append(example)
37 changes: 37 additions & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pydantic import BaseModel, ConfigDict, Field


# Common base class for the Italian extractor's data models; it only carries
# the shared pydantic configuration.
class ItalianBaseModel(BaseModel):
    model_config = ConfigDict(
        # NOTE(review): option meanings below follow the pydantic v2
        # ConfigDict documentation - confirm for the pinned version.
        extra="forbid",  # undeclared fields are rejected
        strict=True,  # strict type validation
        validate_assignment=True,  # validate again on attribute assignment
        validate_default=True,  # validate default values too
    )


# One usage example attached to a sense.
class Example(ItalianBaseModel):
    # example sentence; filled from the italic node of an example list item
    text: str = ""
    # filled from the list nested under the example list item
    translation: str = ""
    # not assigned anywhere in the code shown here
    ref: str = ""


# One sense (definition) of a word entry.
class Sense(ItalianBaseModel):
    # cleaned gloss text taken from the gloss list item
    glosses: list[str] = []
    # parse_page uses ["no-gloss"] for the placeholder sense of an entry that
    # ended up with no senses
    tags: list[str] = []
    # cleaned output of "Term" templates, stripped of parentheses/whitespace
    raw_tags: list[str] = []
    # NOTE(review): presumably filled by clean_node when the sense is passed
    # as its second argument - confirm
    categories: list[str] = []
    # examples collected from "*" lists nested under the gloss
    examples: list[Example] = []


# Top-level record produced for each part-of-speech section of a page.
class WordEntry(ItalianBaseModel):
    # NOTE(review): presumably merged with the inherited model_config rather
    # than replacing it (pydantic v2 behaviour) - confirm
    model_config = ConfigDict(title="Italian Wiktionary")
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(description="Wiktionary language code", min_length=1)
    lang: str = Field(description="Localized language name", min_length=1)
    # parse_page seeds this with "unknown"; extract_pos_section overwrites it
    pos: str = Field(description="Part of speech type", min_length=1)
    # section title text that was matched against POS_DATA
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    # extended with the optional "tags" list of the matching POS_DATA entry
    tags: list[str] = []
    raw_tags: list[str] = []
60 changes: 60 additions & 0 deletions src/wiktextract/extractor/it/page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import Any

from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Process one section heading and then all of its sub-sections.

    When the cleaned heading text is a key of ``POS_DATA`` the section is
    handed to ``extract_pos_section``; other headings are not extracted.
    Child level nodes are visited recursively in both cases.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    is_pos_section = section_title in POS_DATA
    if is_pos_section:
        extract_pos_section(
            wxr, page_data, base_data, level_node, section_title
        )

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Italian Wiktionary page into a list of entry dicts.

    Each level-2 heading is a language section.  Its language code is the
    name of the first template in the heading with the surrounding "-"
    removed ("unknown" when the heading has no template).  Sections whose
    code is not in ``wxr.config.capture_language_codes`` are skipped unless
    that setting is ``None``.  Entries left without senses receive a single
    placeholder sense tagged "no-gloss".

    Returns the entries dumped with ``model_dump(exclude_defaults=True)``.
    """
    # page layout
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []
    for lang_node in root.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_name = clean_node(wxr, categories, lang_node.largs)
        first_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is None:
            lang_code = "unknown"
        else:
            lang_code = first_template.template_name.strip("-")

        allowed_codes = wxr.config.capture_language_codes
        if allowed_codes is not None and lang_code not in allowed_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_entry = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=categories.get("categories", []),
        )
        for child_level in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_entry, child_level)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]
56 changes: 56 additions & 0 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section to *page_data*.

    The entry is a deep copy of *base_data* with ``pos_title``, ``pos`` and
    any extra tags taken from ``POS_DATA[pos_title]``.  Link children of
    the section are passed through ``clean_node`` with the entry as target,
    and every item of a list whose marker starts and ends with "#" is
    handed to ``extract_gloss_list_item``.
    """
    word_entry = base_data.model_copy(deep=True)
    page_data.append(word_entry)
    word_entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    word_entry.pos = pos_info["pos"]
    word_entry.tags.extend(pos_info.get("tags", []))

    for link in level_node.find_child(NodeKind.LINK):
        # return value unused; called for its effect on word_entry
        clean_node(wxr, word_entry, link)

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for gloss_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, word_entry, gloss_item)


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Build a ``Sense`` from one gloss list item.

    "Term" templates contribute raw tags (their cleaned text with
    parentheses and surrounding whitespace removed); nested lists whose
    marker ends in "*" contribute examples, other nested lists are ignored;
    every remaining child is part of the gloss text.  The sense is added to
    ``word_entry.senses`` only when the cleaned gloss text is non-empty.
    """
    sense = Sense()
    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name == "Term":
                raw_tag = clean_node(wxr, sense, child).strip("() \n")
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
            else:
                gloss_parts.append(child)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
            continue

        gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        word_entry.senses.append(sense)
64 changes: 64 additions & 0 deletions src/wiktextract/extractor/it/section_titles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Maps the rendered text of a part-of-speech section title to the "pos" value
# (and optional extra "tags") stored on the word entry.  A section whose title
# is not a key of this table is not extracted as a POS section.
# https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
# https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
# https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
POS_DATA = {
    "Acronimo / Abbreviazione": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Articolo": {"pos": "article"},
    "Avverbio": {"pos": "adv"},
    "Classificatore": {"pos": "classifier"},
    "Congiunzione": {"pos": "conj"},
    "Espressione": {"pos": "phrase"},
    "Hanzi": {"pos": "character", "tags": ["hanzi"]},
    "Interiezione": {"pos": "intj"},
    "Pronuncia kanji": {"pos": "character", "tags": ["kanji"]},
    "Locuzione aggettivale": {"pos": "phrase", "tags": ["adjective"]},
    "Locuzione avverbiale": {"pos": "adv_phrase"},
    "Locuzione congiuntiva": {"pos": "phrase", "tags": ["conjunctive"]},
    "Locuzione interiettiva": {"pos": "phrase", "tags": ["interjection"]},
    "Locuzione nominale": {"pos": "phrase", "tags": ["noun"]},
    "Locuzione nominale, forma flessa": {
        "pos": "phrase",
        "tags": ["noun", "form-of"],
    },
    "Locuzione prepositiva": {"pos": "prep_phrase"},
    "Locuzione verbale": {"pos": "phrase", "tags": ["verb"]},
    "Nome proprio, forma flessa": {"pos": "name", "tags": ["form-of"]},
    "Nome proprio": {"pos": "name"},
    "Particella": {"pos": "particle"},
    "Posposizione": {"pos": "postp"},
    "Preposizione": {"pos": "prep"},
    "Pronome dimostrativo": {"pos": "pron", "tags": ["demonstrative"]},
    "Pronome indefinito": {"pos": "pron", "tags": ["indefinite"]},
    "Pronome interrogativo": {"pos": "pron", "tags": ["interrogative"]},
    "Pronome possessivo": {"pos": "pron", "tags": ["possessive"]},
    "Pronome relativo": {"pos": "pron", "tags": ["relative"]},
    "Pronome riflessivo": {"pos": "pron", "tags": ["reflexive"]},
    "Pronome": {"pos": "pron"},
    "Pronome, forma flessa": {"pos": "pron", "tags": ["form-of"]},
    "Sostantivo": {"pos": "noun"},
    "Sostantivo, forma flessa": {"pos": "noun", "tags": ["form-of"]},
    "Verbo": {"pos": "verb"},
    "Voce verbale": {"pos": "verb", "tags": ["form-of"]},
    "Lettera": {"pos": "character", "tags": ["letter"]},
    "Prefisso": {"pos": "prefix", "tags": ["morpheme"]},
    "Aggettivo": {"pos": "adj"},
    "Aggettivo dimostrativo": {"pos": "adj", "tags": ["demonstrative"]},
    "Aggettivo nominale": {"pos": "adj_noun"},
    "Aggettivo numerale": {"pos": "adj", "tags": ["numeral"]},
    "Aggettivo possessivo": {"pos": "adj", "tags": ["possessive"]},
    "Cifra": {"pos": "num"},
    "Prefissoide": {"pos": "prefix", "tags": ["morpheme"]},
    # "Template:-suffissoide-" is listed among the POS title templates, so it
    # needs an entry here as well, mirroring "Prefissoide" and "Suffisso".
    # NOTE(review): the rendered title text is assumed by analogy with
    # "Prefissoide" - confirm against the live template.
    "Suffissoide": {"pos": "suffix", "tags": ["morpheme"]},
    "Segno di interpunzione": {"pos": "punct", "tags": ["punctuation"]},
    "Suffisso": {"pos": "suffix", "tags": ["morpheme"]},
    "Aggettivo, forma flessa": {"pos": "adj", "tags": ["form-of"]},
    "Aggettivo numerale, forma flessa": {
        "pos": "adj",
        "tags": ["numeral", "form-of"],
    },
    "Abbreviazione in uso nelle chat": {
        "pos": "abbrev",
        "tags": ["abbreviation"],
    },
    "Codice / Simbolo": {"pos": "symbol"},
    "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
}
45 changes: 45 additions & 0 deletions tests/test_it_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItExample(TestCase):
    """Example extraction tests for the Italian Wiktionary extractor."""

    maxDiff = None

    def setUp(self) -> None:
        # capture_language_codes=None: no language section is filtered out
        config = WiktionaryConfig(
            dump_file_lang_code="it", capture_language_codes=None
        )
        self.wxr = WiktextractContext(Wtp(lang_code="it"), config)

    def test_list_example(self):
        """An italic "#*" item is the example; its "#*:" child translates it."""
        self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
        wikitext = """== {{-br-}} ==
===Sostantivo===
# mutazione
#* ''Da '''dog''', e '''dog'''.''
#*: Il tuo cappello, il suo cappello."""
        page_data = parse_page(self.wxr, "dog", wikitext)
        expected_example = {
            "text": "Da dog, e dog.",
            "translation": "Il tuo cappello, il suo cappello.",
        }
        expected_senses = [
            {"glosses": ["mutazione"], "examples": [expected_example]}
        ]
        self.assertEqual(page_data[0]["senses"], expected_senses)
Loading

0 comments on commit ba9f46d

Please sign in to comment.