Merge pull request #947 from xxyzz/it
[it] improve sound section code and extract linkage sections
xxyzz authored Dec 16, 2024
2 parents 8a39820 + b92d96e commit 98779e3
Showing 9 changed files with 341 additions and 35 deletions.
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/etymology.py
@@ -8,6 +8,7 @@
 def extract_etymology_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
+    # https://it.wiktionary.org/wiki/Aiuto:Etimologia
     etymology_texts = []
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item in list_node.find_child(NodeKind.LIST_ITEM):
50 changes: 50 additions & 0 deletions src/wiktextract/extractor/it/linkage.py
@@ -0,0 +1,50 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry


def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    linkages = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            linkages.extend(extract_linkage_list_item(wxr, list_item))

    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            getattr(data, linkage_type).extend(linkages)


def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    raw_tags = []
    linkages = []
    for node in list_item.children:
        if isinstance(node, WikiNode):
            match node.kind:
                case NodeKind.LINK:
                    node_str = clean_node(wxr, None, node)
                    if node_str != "":
                        linkages.append(
                            Linkage(word=node_str, raw_tags=raw_tags)
                        )
                        raw_tags.clear()
                case NodeKind.TEMPLATE | NodeKind.ITALIC:
                    node_str = clean_node(wxr, None, node)
                    if node_str.startswith("(") and node_str.endswith(")"):
                        raw_tags.append(node_str.strip("()"))
        elif isinstance(node, str):
            for word_str in node.split(","):
                word_str = word_str.strip()
                if word_str != "":
                    linkages.append(Linkage(word=word_str, raw_tags=raw_tags))
                    raw_tags.clear()

    return linkages
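
To make the behaviour of the new linkage code easier to follow, here is a minimal, self-contained sketch of the same tag-accumulation pattern working on pre-cleaned strings instead of parsed wikitext nodes; the helper name and the sample input are invented for illustration and are not part of this change.

# Hypothetical, standalone illustration of the pattern in
# extract_linkage_list_item: parenthesised text is held as a raw tag and
# attached to the next extracted word, plain text is split on commas.
def split_linkage_line(parts: list[str]) -> list[dict]:
    raw_tags: list[str] = []
    linkages: list[dict] = []
    for part in parts:
        part = part.strip()
        if part.startswith("(") and part.endswith(")"):
            raw_tags.append(part.strip("()"))
            continue
        for word in part.split(","):
            word = word.strip()
            if word != "":
                linkages.append({"word": word, "raw_tags": list(raw_tags)})
                raw_tags = []
    return linkages


print(split_linkage_line(["(senso figurato)", "(di freddo)", "forte, intenso"]))
# [{'word': 'forte', 'raw_tags': ['senso figurato', 'di freddo']},
#  {'word': 'intenso', 'raw_tags': []}]
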
21 changes: 20 additions & 1 deletion src/wiktextract/extractor/it/models.py
@@ -60,6 +60,18 @@ class Sound(ItalianBaseModel):
     flac_url: str = ""
     tags: list[str] = []
     raw_tags: list[str] = []
+    sense: str = ""
+
+
+class Hyphenation(ItalianBaseModel):
+    hyphenation: str = ""
+    sense: str = ""
+
+
+class Linkage(ItalianBaseModel):
+    word: str
+    tags: list[str] = []
+    raw_tags: list[str] = []
 
 
 class WordEntry(ItalianBaseModel):
@@ -77,5 +89,12 @@ class WordEntry(ItalianBaseModel):
     forms: list[Form] = []
     etymology_texts: list[str] = []
     etymology_examples: list[Example] = []
-    hyphenation: str = ""
+    hyphenations: list[Hyphenation] = []
     sounds: list[Sound] = []
+    synonyms: list[Linkage] = []
+    antonyms: list[Linkage] = []
+    derived: list[Linkage] = []
+    related: list[Linkage] = []
+    hyponyms: list[Linkage] = []
+    hypernyms: list[Linkage] = []
+    proverbs: list[Linkage] = []
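
For orientation, an entry serialized from the extended WordEntry model could now carry data along these lines. The list fields below come from the models in this change; the word and lang_code keys and all concrete values are assumed for illustration.

# Hypothetical output fragment; values are made up.
example_entry = {
    "word": "cane",
    "lang_code": "it",
    "hyphenations": [{"hyphenation": "cà | ne"}],
    "sounds": [{"ipa": "/ˈka.ne/"}],
    "synonyms": [{"word": "animale"}],
    "derived": [{"word": "canile"}],
}
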
7 changes: 6 additions & 1 deletion src/wiktextract/extractor/it/page.py
@@ -5,9 +5,10 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .etymology import extract_citation_section, extract_etymology_section
+from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
-from .section_titles import POS_DATA
+from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .sound import extract_hyphenation_section, extract_pronunciation_section
 from .translation import extract_translation_section
 
@@ -31,6 +32,10 @@ def parse_section(
         extract_hyphenation_section(wxr, page_data, level_node)
     elif title_text == "Pronuncia":
         extract_pronunciation_section(wxr, page_data, level_node)
+    elif title_text in LINKAGE_SECTIONS:
+        extract_linkage_section(
+            wxr, page_data, level_node, LINKAGE_SECTIONS[title_text]
+        )
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
12 changes: 5 additions & 7 deletions src/wiktextract/extractor/it/pos.py
@@ -50,13 +50,11 @@ def extract_gloss_list_item(
     sense = Sense()
     for node in list_item.children:
         if isinstance(node, TemplateNode):
-            match node.template_name:
-                case "Term":
-                    raw_tag = clean_node(wxr, sense, node).strip("() \n")
-                    if raw_tag != "":
-                        sense.raw_tags.append(raw_tag)
-                case _:
-                    gloss_nodes.append(node)
+            t_str = clean_node(wxr, sense, node)
+            if t_str.startswith("(") and t_str.endswith(")"):
+                sense.raw_tags.append(t_str.strip("()"))
+            else:
+                gloss_nodes.append(t_str)
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             if node.sarg.endswith("*"):
                 for example_list_item in node.find_child(NodeKind.LIST_ITEM):
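
The pos.py change above drops the Term-specific template match in favour of a generic check on the rendered text: any template whose cleaned output is fully parenthesised becomes a raw tag of the sense, and anything else joins the gloss. A tiny standalone sketch of that rule; the sample strings are invented for illustration.

# Hypothetical rendered template outputs and how the new check classifies them.
def classify(t_str: str) -> str:
    if t_str.startswith("(") and t_str.endswith(")"):
        return "raw_tag: " + t_str.strip("()")
    return "gloss: " + t_str


print(classify("(botanica)"))  # raw_tag: botanica
print(classify("mammifero"))   # gloss: mammifero
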
15 changes: 15 additions & 0 deletions src/wiktextract/extractor/it/section_titles.py
@@ -62,3 +62,18 @@
     "Codice / Simbolo": {"pos": "symbol"},
     "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
 }
+
+
+LINKAGE_SECTIONS = {
+    "Sinonimi": "synonyms",
+    "Contrari": "antonyms",
+    "Derivati": "derived",
+    "Termini correlati": "related",
+    "Varianti": "related",
+    "Alterati": "related",
+    "Iponimi": "hyponyms",
+    "Iperonimi": "hypernyms",
+    "Da non confondere con": "related",
+    "Proverbi e modi di dire": "proverbs",
+    "Parole derivate": "derived",
+}
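
The values in LINKAGE_SECTIONS are WordEntry list-field names, which is what lets parse_section pass the looked-up name to extract_linkage_section and have it resolved with getattr. A minimal sketch of that dispatch using a plain stand-in object instead of the real pydantic model; the Entry class and the appended dict are invented for illustration.

LINKAGE_SECTIONS = {"Sinonimi": "synonyms", "Contrari": "antonyms"}


class Entry:
    def __init__(self) -> None:
        self.synonyms: list[dict] = []
        self.antonyms: list[dict] = []


entry = Entry()
title_text = "Sinonimi"
if title_text in LINKAGE_SECTIONS:
    getattr(entry, LINKAGE_SECTIONS[title_text]).append({"word": "animale"})
print(entry.synonyms)  # [{'word': 'animale'}]
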
124 changes: 100 additions & 24 deletions src/wiktextract/extractor/it/sound.py
@@ -1,47 +1,123 @@
-from wikitextprocessor import LevelNode, NodeKind
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from ..share import set_sound_file_url_fields
-from .models import Sound, WordEntry
+from .models import Hyphenation, Sound, WordEntry
 
 
 def extract_hyphenation_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
-    hyphenation = ""
+    # https://it.wiktionary.org/wiki/Aiuto:Sillabazione
+    hyphenations = []
     for list_node in level_node.find_child(NodeKind.LIST):
-        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
-            hyphenation = clean_node(wxr, None, list_item.children)
+        match list_node.sarg:
+            case ";":
+                for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                    h_str = clean_node(wxr, None, list_item.children)
+                    if h_str != "":
+                        hyphenations.append(Hyphenation(hyphenation=h_str))
+                    break
+            case "*":
+                for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                    h_data = Hyphenation()
+                    for node in list_item.find_child(
+                        NodeKind.ITALIC | NodeKind.BOLD
+                    ):
+                        match node.kind:
+                            case NodeKind.ITALIC:
+                                h_data.sense = clean_node(
+                                    wxr, None, node
+                                ).strip("()")
+                            case NodeKind.BOLD:
+                                h_data.hyphenation = clean_node(wxr, None, node)
+                    if h_data.hyphenation != "":
+                        hyphenations.append(h_data)
+
+    # no list
+    for node in level_node.find_child(NodeKind.BOLD):
+        h_str = clean_node(wxr, None, node)
+        if h_str != "":
+            hyphenations.append(Hyphenation(hyphenation=h_str))
 
     for data in page_data:
         if data.lang_code == page_data[-1].lang_code:
-            data.hyphenation = hyphenation
+            data.hyphenations.extend(hyphenations)
 
 
 def extract_pronunciation_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
+    # https://it.wiktionary.org/wiki/Aiuto:Pronuncia
     sounds = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            extract_sound_list_item(wxr, list_item, sounds)
+
+    # no list
     for t_node in level_node.find_child(NodeKind.TEMPLATE):
-        match t_node.template_name.lower():
-            case "ipa":
-                ipa = clean_node(
-                    wxr, None, t_node.template_parameters.get(1, "")
-                )
-                if ipa != "":
-                    sounds.append(Sound(ipa=ipa))
-            case "audio":
-                sound_file = clean_node(
-                    wxr, None, t_node.template_parameters.get(1, "")
-                )
-                if sound_file != "":
-                    if len(sounds) > 0:
-                        set_sound_file_url_fields(wxr, sound_file, sounds[-1])
-                    else:
-                        sound = Sound()
-                        set_sound_file_url_fields(wxr, sound_file, sound)
-                        sounds.append(sound)
+        extract_sound_template(wxr, t_node, sounds, "", [])
 
     for data in page_data:
         if data.lang_code == page_data[-1].lang_code:
             data.sounds.extend(sounds)
+
+
+def extract_sound_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, sounds: list[Sound]
+) -> None:
+    sense = ""
+    raw_tags = []
+    for node in list_item.find_child(NodeKind.ITALIC | NodeKind.TEMPLATE):
+        match node.kind:
+            case NodeKind.ITALIC:
+                sense = clean_node(wxr, None, node).strip("()")
+            case NodeKind.TEMPLATE:
+                if node.template_name.lower() == "glossa":
+                    raw_tags.append(clean_node(wxr, None, node).strip("()"))
+                else:
+                    extract_sound_template(wxr, node, sounds, sense, raw_tags)
+
+
+def extract_sound_template(
+    wxr: WiktextractContext,
+    t_node: TemplateNode,
+    sounds: list[Sound],
+    sense: str,
+    raw_tags: list[str],
+) -> None:
+    match t_node.template_name:
+        case "IPA" | "SAMPA":
+            # https://it.wiktionary.org/wiki/Template:IPA
+            # https://it.wiktionary.org/wiki/Template:SAMPA
+            for arg_name in range(1, 5):
+                if arg_name not in t_node.template_parameters:
+                    break
+                ipa = clean_node(
+                    wxr, None, t_node.template_parameters.get(arg_name, "")
+                )
+                if ipa != "":
+                    sound = Sound(ipa=ipa, sense=sense, raw_tags=raw_tags)
+                    if t_node.template_name.lower() == "sampa":
+                        sound.tags.append("SAMPA")
+                    sounds.append(sound)
+        case "Audio" | "audio":
+            # https://it.wiktionary.org/wiki/Template:Audio
+            sound_file = clean_node(
+                wxr, None, t_node.template_parameters.get(1, "")
+            )
+            raw_tag = clean_node(
+                wxr, None, t_node.template_parameters.get(2, "")
+            )
+            if sound_file != "":
+                if len(sounds) > 0:
+                    set_sound_file_url_fields(wxr, sound_file, sounds[-1])
+                    if raw_tag != "":
+                        sounds[-1].raw_tags.append(raw_tag)
+                else:
+                    sound = Sound(sense=sense, raw_tags=raw_tags)
+                    set_sound_file_url_fields(wxr, sound_file, sound)
+                    if raw_tag != "":
+                        sound.raw_tags.append(raw_tag)
+                    sounds.append(sound)
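
As a rough illustration of what the reworked pronunciation code now targets, a list-based Pronuncia line and the Sound data it should yield might look as follows; the wikitext and values are hypothetical and not taken from the test suite.

# Hypothetical Pronuncia list item and the resulting Sound data; invented
# values. set_sound_file_url_fields would additionally fill the audio URL
# fields of the same Sound from the "It-cane.ogg" file name.
wikitext_line = "* {{glossa|sostantivo}} {{IPA|/ˈka.ne/}} {{Audio|It-cane.ogg}}"
expected_sounds = [
    {"ipa": "/ˈka.ne/", "raw_tags": ["sostantivo"]},
]
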
44 changes: 44 additions & 0 deletions tests/test_it_linkage.py
@@ -0,0 +1,44 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItLinkage(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def test_synonyms(self):
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        self.wxr.wtp.add_page(
            "Template:Fig", 10, "<small>(''senso figurato'')</small>"
        )
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===Sostantivo===
# [[animale]]
===Sinonimi===
* [[animale]], amico dell’uomo
* {{Fig}} ''(di freddo)'' [[forte]], [[intenso]]""",
        )
        self.assertEqual(
            data[0]["synonyms"],
            [
                {"word": "animale"},
                {"word": "amico dell’uomo"},
                {"word": "forte", "raw_tags": ["senso figurato", "di freddo"]},
                {"word": "intenso"},
            ],
        )
(The diff for the ninth changed file did not load and is not shown above.)
