-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
"Appendix" and "Project" namespace pages are added because they are used in section title templates "Template:Intestazione voce" and "Template:Sezione voce"
- Loading branch information
Showing
7 changed files
with
343 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{ | ||
"save_ns_names": [ | ||
"Main", | ||
"Template", | ||
"Module", | ||
"Project", | ||
"Appendix" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from wikitextprocessor import Page, Wtp | ||
|
||
# Page titles of the it.wiktionary templates that are used as section titles.
# Entries are grouped by the wiki category they were collected from and kept
# in alphabetical order inside each group.
SECTION_TITLE_TEMPLATES = {
    # Part-of-speech titles
    # https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
    "Template:-acron-",
    "Template:-art-",
    "Template:-avv-",
    "Template:-class-",
    "Template:-cong-",
    "Template:-espr-",
    "Template:-hanzi-",
    "Template:-inter-",
    "Template:-kanpr-",
    "Template:-loc agg-",
    "Template:-loc avv-",
    "Template:-loc cong-",
    "Template:-loc inter-",
    "Template:-loc nom-",
    "Template:-loc nom form-",
    "Template:-loc prep-",
    "Template:-loc verb-",
    "Template:-nome-",
    "Template:-nome form-",
    "Template:-part-",
    "Template:-posp-",
    "Template:-prep-",
    "Template:-pron dim-",
    "Template:-pron form-",
    "Template:-pron indef-",
    "Template:-pron interrog-",
    "Template:-pron poss-",
    "Template:-pron rel-",
    "Template:-pron rifl-",
    "Template:-pronome-",
    "Template:-sost-",
    "Template:-sost form-",
    "Template:-voce verb-",
    # Part-of-speech titles
    # https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
    "Template:-agg-",
    "Template:-agg dim-",
    "Template:-agg nom-",
    "Template:-agg num-",
    "Template:-agg poss-",
    "Template:-cifr-",
    "Template:-interp-",
    "Template:-lett-",
    "Template:-pref-",
    "Template:-prefissoide-",
    "Template:-suff-",
    "Template:-suffissoide-",
    "Template:-verb-",
    # Part-of-speech titles (adjective forms)
    # https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
    "Template:-agg form-",
    "Template:-agg num form-",
    # Other (non part-of-speech) sections
    # https://it.wiktionary.org/wiki/Categoria:Template_sezione
    "Template:-alter-",
    "Template:-chat-",
    "Template:-coni-",
    "Template:-decl-",
    "Template:-der-",
    "Template:-esempio-",
    "Template:-fal-",  # pos
    "Template:-iperon-",
    "Template:-ipon-",
    "Template:-noconf-",
    "Template:-pron-",
    "Template:-prov-",
    "Template:-ref-",
    "Template:-rel-",
    "Template:-sill-",
    "Template:-sin-",
    "Template:-trascrizione-",  # pos
    "Template:-uso-",
    "Template:-var-",
}
|
||
|
||
def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Report whether a template page is one of the section title templates.

    Returns a tuple whose first element is always an empty set and whose
    second element is True only when ``page.title`` is listed in
    SECTION_TITLE_TEMPLATES.  Language title templates such as "-it-" are
    not in that set, so they are not flagged for pre-expansion.

    ``wtp`` is accepted but not used by this implementation.
    """
    is_section_title = page.title in SECTION_TITLE_TEMPLATES
    return set(), is_section_title
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from pydantic import BaseModel, ConfigDict, Field | ||
|
||
|
||
class ItalianBaseModel(BaseModel):
    # Common base for the Italian extractor's data models.
    # A "#" comment is used instead of a docstring on purpose: pydantic can
    # surface class docstrings as the JSON schema description.
    model_config = ConfigDict(
        # unknown field names are an error instead of being ignored/stored
        extra="forbid",
        # strict mode: values are not coerced to the annotated type
        strict=True,
        # validate again whenever an attribute is assigned
        validate_assignment=True,
        # default values go through validation as well
        validate_default=True,
    )
|
||
|
||
class Sense(ItalianBaseModel):
    # One sense of a word entry.  Every field defaults to an empty list.

    # gloss text extracted from a "#" list item
    glosses: list[str] = []
    # tags, e.g. "no-gloss" is set by the page parser for empty entries
    tags: list[str] = []
    raw_tags: list[str] = []
    # category names collected while cleaning the gloss nodes
    categories: list[str] = []
|
||
|
||
class WordEntry(ItalianBaseModel):
    # One extracted entry: a word in one language under one POS section.

    # Only the schema title is set here; pydantic merges this with the
    # configuration inherited from ItalianBaseModel.
    model_config = ConfigDict(title="Italian Wiktionary")

    # Required, non-empty fields.
    word: str = Field(
        description="Word string",
        min_length=1,
    )
    lang_code: str = Field(
        description="Wiktionary language code",
        min_length=1,
    )
    lang: str = Field(
        description="Localized language name",
        min_length=1,
    )
    pos: str = Field(
        description="Part of speech type",
        min_length=1,
    )

    # Optional fields.
    # section title the POS was looked up from
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from typing import Any | ||
|
||
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .pos import extract_pos_section | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Handle one section heading and then all of its nested sections.

    The cleaned heading text is looked up in POS_DATA; on a match the
    section is passed to `extract_pos_section`, which adds a new entry to
    ``page_data``.  Child level nodes are visited recursively whether or
    not the heading matched.
    """
    heading = clean_node(wxr, None, level_node.largs)
    if heading in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)

    # Descend into subsections of any level.
    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, subsection)
|
||
|
||
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Extract word entries from the wikitext of one page.

    Page layout reference:
    https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile

    Each level-2 section is treated as one language.  Its language code is
    the name of the first template found in the heading with the "-"
    characters stripped from both ends, or "unknown" when the heading has
    no template.  Languages not listed in
    ``wxr.config.capture_language_codes`` are skipped unless that setting
    is None.

    Returns one dict per extracted entry, dumped with default values
    excluded.  Entries that ended up without senses get a single sense
    tagged "no-gloss".
    """
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_cats = {}
        lang_name = clean_node(wxr, lang_cats, lang_node.largs)

        # Only the first template of the heading is looked at, e.g. "-it-".
        title_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        if title_template is None:
            lang_code = "unknown"
        else:
            lang_code = title_template.template_name.strip("-")

        allowed_codes = wxr.config.capture_language_codes
        if allowed_codes is not None and lang_code not in allowed_codes:
            continue

        wxr.wtp.start_section(lang_name)
        # Template entry that every POS section of this language copies.
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, section_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from wikitextprocessor import LevelNode, NodeKind, WikiNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section to ``page_data``.

    The entry is a deep copy of ``base_data`` with ``pos_title``, ``pos``
    and any extra tags taken from ``POS_DATA[pos_title]``.  Direct link
    children of the section are run through `clean_node` with the entry as
    the data target, and the items of every direct child list whose marker
    starts and ends with "#" are extracted as glosses.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # The returned text is discarded; the call is made for what it stores
    # in the entry (e.g. category links placed directly in the section).
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link_node)

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
|
||
|
||
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Turn one gloss list item into a sense of ``word_entry``.

    Nested list nodes are left out of the gloss text.  The remaining
    children are cleaned with a fresh Sense as the data target; the sense
    is added to ``word_entry.senses`` only when the cleaned text is not
    empty.
    """
    sense = Sense()
    gloss_nodes = [
        child
        for child in list_item.children
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST)
    ]
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text == "":
        return
    sense.glosses.append(gloss_text)
    word_entry.senses.append(sense)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Part-of-speech data keyed by section title text.
#
# Each value has a required "pos" string and may have a "tags" list with
# extra tags for entries of that section.
#
# Sources of the titles:
# https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
# https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
# https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
POS_DATA = {
    "Acronimo / Abbreviazione": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Articolo": {"pos": "article"},
    "Avverbio": {"pos": "adv"},
    "Classificatore": {"pos": "classifier"},
    "Congiunzione": {"pos": "conj"},
    "Espressione": {"pos": "phrase"},
    "Hanzi": {"pos": "character", "tags": ["hanzi"]},
    "Interiezione": {"pos": "intj"},
    "Pronuncia kanji": {"pos": "character", "tags": ["kanji"]},
    "Locuzione aggettivale": {"pos": "phrase", "tags": ["adjective"]},
    "Locuzione avverbiale": {"pos": "adv_phrase"},
    "Locuzione congiuntiva": {"pos": "phrase", "tags": ["conjunctive"]},
    "Locuzione interiettiva": {"pos": "phrase", "tags": ["interjection"]},
    "Locuzione nominale": {"pos": "phrase", "tags": ["noun"]},
    "Locuzione nominale, forma flessa": {
        "pos": "phrase",
        "tags": ["noun", "form-of"],
    },
    "Locuzione prepositiva": {"pos": "prep_phrase"},
    "Locuzione verbale": {"pos": "phrase", "tags": ["verb"]},
    "Nome proprio, forma flessa": {"pos": "name", "tags": ["form-of"]},
    "Nome proprio": {"pos": "name"},
    "Particella": {"pos": "particle"},
    "Posposizione": {"pos": "postp"},
    "Preposizione": {"pos": "prep"},
    "Pronome dimostrativo": {"pos": "pron", "tags": ["demonstrative"]},
    "Pronome indefinito": {"pos": "pron", "tags": ["indefinite"]},
    "Pronome interrogativo": {"pos": "pron", "tags": ["interrogative"]},
    "Pronome possessivo": {"pos": "pron", "tags": ["possessive"]},
    "Pronome relativo": {"pos": "pron", "tags": ["relative"]},
    "Pronome riflessivo": {"pos": "pron", "tags": ["reflexive"]},
    "Pronome": {"pos": "pron"},
    "Pronome, forma flessa": {"pos": "pron", "tags": ["form-of"]},
    "Sostantivo": {"pos": "noun"},
    "Sostantivo, forma flessa": {"pos": "noun", "tags": ["form-of"]},
    "Verbo": {"pos": "verb"},
    "Voce verbale": {"pos": "verb", "tags": ["form-of"]},
    "Lettera": {"pos": "character", "tags": ["letter"]},
    "Prefisso": {"pos": "prefix", "tags": ["morpheme"]},
    "Aggettivo": {"pos": "adj"},
    "Aggettivo dimostrativo": {"pos": "adj", "tags": ["demonstrative"]},
    "Aggettivo nominale": {"pos": "adj_noun"},
    "Aggettivo numerale": {"pos": "adj", "tags": ["numeral"]},
    "Aggettivo possessivo": {"pos": "adj", "tags": ["possessive"]},
    "Cifra": {"pos": "num"},
    "Prefissoide": {"pos": "prefix", "tags": ["morpheme"]},
    "Segno di interpunzione": {"pos": "punct", "tags": ["punctuation"]},
    "Suffisso": {"pos": "suffix", "tags": ["morpheme"]},
    "Aggettivo, forma flessa": {"pos": "adj", "tags": ["form-of"]},
    "Aggettivo numerale, forma flessa": {
        "pos": "adj",
        "tags": ["numeral", "form-of"],
    },
    "Abbreviazione in uso nelle chat": {
        "pos": "abbrev",
        "tags": ["abbreviation"],
    },
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.it.page import parse_page | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestItGloss(TestCase):
    """Gloss extraction tests for the Italian Wiktionary extractor."""

    # Never truncate the diff printed when an assertion fails.
    maxDiff = None

    def setUp(self) -> None:
        wtp = Wtp(lang_code="it")
        config = WiktionaryConfig(
            dump_file_lang_code="it", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_gloss_list(self):
        """A "#" list item becomes one sense with its gloss and category."""
        # Templates used by the page below (namespace id 10).
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        self.wxr.wtp.add_page(
            "Template:Term",
            10,
            "<small>(<i>[[mammalogia]]</i>)</small>[[Categoria:Mammalogia-IT]]",
        )
        wikitext = """== {{-it-}} ==
===[[Image:Open_book_01.svg|30px|]]''[[sostantivo|Sostantivo]]''===
[[Categoria:Sostantivi in italiano]]
# {{Term|mammalogia|it}} [[animale]]"""
        expected = [
            {
                "categories": ["Sostantivi in italiano"],
                "lang": "Italiano",
                "lang_code": "it",
                "word": "cane",
                "pos": "noun",
                "pos_title": "Sostantivo",
                "senses": [
                    {
                        "glosses": ["(mammalogia) animale"],
                        "categories": ["Mammalogia-IT"],
                    }
                ],
            }
        ]

        data = parse_page(self.wxr, "cane", wikitext)

        self.assertEqual(data, expected)