-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #944 from xxyzz/it
[it] extract gloss and example lists
- Loading branch information
Showing
9 changed files
with
442 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{ | ||
"save_ns_names": [ | ||
"Main", | ||
"Template", | ||
"Module", | ||
"Project", | ||
"Appendix" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
from wikitextprocessor import Page, Wtp | ||
|
||
# Page titles of the it.wiktionary section-heading templates (part-of-speech
# titles and other section titles).  Each bare name below is wrapped as
# "Template:-<name>-" to form the full page title stored in the set.
SECTION_TITLE_TEMPLATES = {
    f"Template:-{name}-"
    for name in (
        # POS titles
        # https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
        "acron",
        "art",
        "avv",
        "class",
        "cong",
        "espr",
        "hanzi",
        "inter",
        "kanpr",
        "loc agg",
        "loc avv",
        "loc cong",
        "loc inter",
        "loc nom",
        "loc nom form",
        "loc prep",
        "loc verb",
        "nome form",
        "nome",
        "sost form",
        "part",
        "posp",
        "prep",
        "pron dim",
        "pron indef",
        "pron interrog",
        "pron poss",
        "pron rel",
        "pron rifl",
        "pronome",
        "pron form",
        "sost",
        "voce verb",
        # POS titles
        # https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
        "agg",
        "agg dim",
        "agg nom",
        "agg num",
        "agg poss",
        "cifr",
        "lett",
        "prefissoide",
        "suffissoide",
        "pref",
        "interp",
        "suff",
        "verb",
        # POS
        # https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
        "agg form",
        "agg num form",
        # other sections
        # https://it.wiktionary.org/wiki/Categoria:Template_sezione
        "esempio",
        "iperon",
        "ipon",
        "noconf",
        "rel",
        "sill",
        "sin",
        "uso",
        "var",
        "alter",
        "chat",  # pos
        "coni",
        "decl",
        "der",
        "fal",  # pos
        "ref",
        "pron",
        "prov",
        "trascrizione",  # pos
        # https://it.wiktionary.org/wiki/Categoria:Template_vocabolo
        "etim",
        "trad",
        "ant",
        "cod",  # pos
        "carhi",  # pos
        "quote",
    )
}
|
||
|
||
def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Tell whether a template page is one of the section-title templates.

    Returns a pair: an always-empty set, and ``True`` only when
    ``page.title`` is listed in ``SECTION_TITLE_TEMPLATES``.  ``wtp`` is
    accepted but not used.
    """
    # Language title templates such as "-it-" are not in the set, so the
    # flag stays False for them and they are not pre-expanded.
    # NOTE(review): the meaning of the tuple slots (called templates,
    # pre-expand flag) comes from the caller's contract - confirm there.
    is_section_title = page.title in SECTION_TITLE_TEMPLATES
    return set(), is_section_title
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from wikitextprocessor import NodeKind, WikiNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Example, Sense | ||
|
||
|
||
def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Build an ``Example`` from one list item and attach it to ``sense``.

    Italic child nodes supply the example text; the list items of a nested
    list supply the translation.  The example is appended to
    ``sense.examples`` only when its text is non-empty.
    """
    example = Example()
    for child in list_item.children:
        # Plain strings among the children are ignored.
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.ITALIC:
            example.text = clean_node(wxr, sense, child)
        elif child.kind == NodeKind.LIST:
            # Each nested item overwrites the previous one, so the last wins.
            for translation_item in child.find_child(NodeKind.LIST_ITEM):
                example.translation = clean_node(
                    wxr, sense, translation_item.children
                )

    if example.text == "":
        return
    sense.examples.append(example)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from pydantic import BaseModel, ConfigDict, Field | ||
|
||
|
||
# Common base for the models in this file.  The configuration forbids extra
# fields, enables strict mode, and validates both assignments and defaults.
class ItalianBaseModel(BaseModel):
    model_config = ConfigDict(
        validate_default=True,
        validate_assignment=True,
        strict=True,
        extra="forbid",
    )
|
||
|
||
# One usage example attached to a sense; every field defaults to "".
class Example(ItalianBaseModel):
    # Example sentence.
    text: str = ""
    # Translation of the example sentence.
    translation: str = ""
    # NOTE(review): presumably the source/citation of the example - confirm.
    ref: str = ""
|
||
|
||
# One sense of a word entry; every list field defaults to empty.
class Sense(ItalianBaseModel):
    # Gloss strings extracted for this sense.
    glosses: list[str] = []
    tags: list[str] = []
    # Labels taken from the page text as-is (e.g. "Term" template output).
    raw_tags: list[str] = []
    categories: list[str] = []
    # Usage examples collected from nested example lists.
    examples: list[Example] = []
|
||
|
||
# Top-level record produced for each part-of-speech section of a page.
class WordEntry(ItalianBaseModel):
    model_config = ConfigDict(title="Italian Wiktionary")
    # The four identifying fields are required and must be non-empty.
    word: str = Field(min_length=1, description="Word string")
    lang_code: str = Field(min_length=1, description="Wiktionary language code")
    lang: str = Field(min_length=1, description="Localized language name")
    pos: str = Field(min_length=1, description="Part of speech type")
    # Section heading text the `pos` value was derived from.
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from typing import Any | ||
|
||
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .pos import extract_pos_section | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Handle one heading node, then recurse into its sub-headings.

    When the cleaned heading text is a key of ``POS_DATA`` the section is
    handed to ``extract_pos_section``.  Every child level node is then
    parsed the same way, whether or not the heading matched.
    """
    heading = clean_node(wxr, None, level_node.largs)
    is_pos_section = heading in POS_DATA
    if is_pos_section:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
|
||
|
||
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-2 heading is treated as a language section.  Its language
    code is the name of the first template in the heading with the
    surrounding "-" stripped, or "unknown" when the heading has no
    template.  Sections whose code is filtered out by
    ``wxr.config.capture_language_codes`` are skipped.  Entries that end up
    without senses receive a single sense tagged "no-gloss".
    """
    # page layout
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []
    for language_node in root.find_child(NodeKind.LEVEL2):
        # clean_node receives this dict; "categories" is read back below.
        category_holder = {}
        language_name = clean_node(wxr, category_holder, language_node.largs)

        first_template = next(
            iter(language_node.find_content(NodeKind.TEMPLATE)), None
        )
        if first_template is None:
            language_code = "unknown"
        else:
            language_code = first_template.template_name.strip("-")

        allowed_codes = wxr.config.capture_language_codes
        if allowed_codes is not None and language_code not in allowed_codes:
            continue

        wxr.wtp.start_section(language_name)
        base_entry = WordEntry(
            word=wxr.wtp.title,
            lang_code=language_code,
            lang=language_name,
            pos="unknown",
            categories=category_holder.get("categories", []),
        )
        for section_node in language_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_entry, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .example import extract_example_list_item | ||
from .models import Sense, WordEntry | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill its senses.

    A deep copy of ``base_data`` is appended to ``page_data``; its ``pos``
    and tags come from ``POS_DATA[pos_title]``.  Every item of each list
    whose marker starts and ends with "#" is passed to
    ``extract_gloss_list_item``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # The cleaned text is discarded; the call is made only for its effect
    # on `entry`.
    # NOTE(review): presumably this collects category links - confirm.
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
|
||
|
||
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Turn one gloss list item into a ``Sense`` on ``word_entry``.

    "Term" templates become raw tags (parentheses, spaces and newlines
    stripped from both ends).  Nested lists whose marker ends with "*" are
    read as examples, and other nested lists are ignored.  All remaining
    children form the gloss text.  The sense is appended only when the
    gloss text is non-empty.
    """
    sense = Sense()
    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name == "Term":
                raw_tag = clean_node(wxr, sense, child).strip("() \n")
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
            else:
                gloss_parts.append(child)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
            continue

        gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text == "":
        return
    sense.glosses.append(gloss_text)
    word_entry.senses.append(sense)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso | ||
# https://it.wiktionary.org/wiki/Categoria:Template_aggiornati | ||
# https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi | ||
# Maps a section heading text to its part-of-speech data: a required "pos"
# value and an optional list of "tags" added to the word entry.
POS_DATA = {
    "Acronimo / Abbreviazione": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Articolo": {"pos": "article"},
    "Avverbio": {"pos": "adv"},
    "Classificatore": {"pos": "classifier"},
    "Congiunzione": {"pos": "conj"},
    "Espressione": {"pos": "phrase"},
    "Hanzi": {"pos": "character", "tags": ["hanzi"]},
    "Interiezione": {"pos": "intj"},
    "Pronuncia kanji": {"pos": "character", "tags": ["kanji"]},
    "Locuzione aggettivale": {"pos": "phrase", "tags": ["adjective"]},
    "Locuzione avverbiale": {"pos": "adv_phrase"},
    "Locuzione congiuntiva": {"pos": "phrase", "tags": ["conjunctive"]},
    "Locuzione interiettiva": {"pos": "phrase", "tags": ["interjection"]},
    "Locuzione nominale": {"pos": "phrase", "tags": ["noun"]},
    "Locuzione nominale, forma flessa": {
        "pos": "phrase",
        "tags": ["noun", "form-of"],
    },
    "Locuzione prepositiva": {"pos": "prep_phrase"},
    "Locuzione verbale": {"pos": "phrase", "tags": ["verb"]},
    "Nome proprio, forma flessa": {"pos": "name", "tags": ["form-of"]},
    "Nome proprio": {"pos": "name"},
    "Particella": {"pos": "particle"},
    "Posposizione": {"pos": "postp"},
    "Preposizione": {"pos": "prep"},
    "Pronome dimostrativo": {"pos": "pron", "tags": ["demonstrative"]},
    "Pronome indefinito": {"pos": "pron", "tags": ["indefinite"]},
    "Pronome interrogativo": {"pos": "pron", "tags": ["interrogative"]},
    "Pronome possessivo": {"pos": "pron", "tags": ["possessive"]},
    "Pronome relativo": {"pos": "pron", "tags": ["relative"]},
    "Pronome riflessivo": {"pos": "pron", "tags": ["reflexive"]},
    "Pronome": {"pos": "pron"},
    "Pronome, forma flessa": {"pos": "pron", "tags": ["form-of"]},
    "Sostantivo": {"pos": "noun"},
    "Sostantivo, forma flessa": {"pos": "noun", "tags": ["form-of"]},
    "Verbo": {"pos": "verb"},
    "Voce verbale": {"pos": "verb", "tags": ["form-of"]},
    "Lettera": {"pos": "character", "tags": ["letter"]},
    "Prefisso": {"pos": "prefix", "tags": ["morpheme"]},
    "Aggettivo": {"pos": "adj"},
    "Aggettivo dimostrativo": {"pos": "adj", "tags": ["demonstrative"]},
    "Aggettivo nominale": {"pos": "adj_noun"},
    "Aggettivo numerale": {"pos": "adj", "tags": ["numeral"]},
    "Aggettivo possessivo": {"pos": "adj", "tags": ["possessive"]},
    "Cifra": {"pos": "num"},
    "Prefissoide": {"pos": "prefix", "tags": ["morpheme"]},
    "Segno di interpunzione": {"pos": "punct", "tags": ["punctuation"]},
    "Suffisso": {"pos": "suffix", "tags": ["morpheme"]},
    # "Template:-suffissoide-" is registered as a POS title template but had
    # no entry here, so such sections were skipped.
    # NOTE(review): heading text assumed to be "Suffissoide", mirroring
    # "Prefissoide" - confirm against the template's output.
    "Suffissoide": {"pos": "suffix", "tags": ["morpheme"]},
    "Aggettivo, forma flessa": {"pos": "adj", "tags": ["form-of"]},
    "Aggettivo numerale, forma flessa": {
        "pos": "adj",
        "tags": ["numeral", "form-of"],
    },
    "Abbreviazione in uso nelle chat": {
        "pos": "abbrev",
        "tags": ["abbreviation"],
    },
    "Codice / Simbolo": {"pos": "symbol"},
    "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.it.page import parse_page | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestItExample(TestCase):
    # Checks example extraction in the Italian extractor.

    # Do not truncate diffs in assertion failure messages.
    maxDiff = None

    def setUp(self) -> None:
        wtp = Wtp(lang_code="it")
        config = WiktionaryConfig(
            dump_file_lang_code="it", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_list_example(self):
        # An italic "#*" item is the example text and the nested "#*:" item
        # is its translation.
        self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
        wikitext = """== {{-br-}} ==
===Sostantivo===
# mutazione
#* ''Da '''dog''', e '''dog'''.''
#*: Il tuo cappello, il suo cappello."""
        expected_senses = [
            {
                "glosses": ["mutazione"],
                "examples": [
                    {
                        "text": "Da dog, e dog.",
                        "translation": "Il tuo cappello, il suo cappello.",
                    }
                ],
            }
        ]
        page_data = parse_page(self.wxr, "dog", wikitext)
        self.assertEqual(page_data[0]["senses"], expected_senses)
Oops, something went wrong.