Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[it] improve sound section code and extract linkage sections #947

Merged
merged 6 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
def extract_etymology_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
# https://it.wiktionary.org/wiki/Aiuto:Etimologia
etymology_texts = []
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
Expand Down
50 changes: 50 additions & 0 deletions src/wiktextract/extractor/it/linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry


def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Gather the linkages of one section and attach them to word entries.

    Every list item of every list returned by
    ``level_node.find_child(NodeKind.LIST)`` is parsed with
    ``extract_linkage_list_item``. The combined result is appended to the
    attribute named ``linkage_type`` on each entry of ``page_data`` whose
    ``lang_code`` equals the ``lang_code`` of the last entry.

    Args:
        wxr: Extraction context, forwarded to the list-item parser.
        page_data: Entries collected so far; matching entries are mutated.
        level_node: The section node whose lists are read.
        linkage_type: Name of the list attribute to extend on each entry.
    """
    collected = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            collected += extract_linkage_list_item(wxr, item)

    if not page_data:
        # Nothing to attach the linkages to.
        return

    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code != current_lang:
            continue
        getattr(entry, linkage_type).extend(collected)


def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Turn one list item of a linkage section into ``Linkage`` objects.

    The children of ``list_item`` are scanned in order:

    * a link node yields a word (its cleaned text, when not empty);
    * a template or italic node whose cleaned text is wrapped in
      parentheses is remembered as a pending raw tag;
    * a plain string is split on commas and every non-blank piece
      yields a word.

    Pending raw tags are handed to the next word that is produced and
    then reset.

    Returns:
        The linkages found, in document order.
    """
    pending_tags = []
    results = []

    for child in list_item.children:
        if isinstance(child, str):
            for piece in child.split(","):
                word = piece.strip()
                if word == "":
                    continue
                # NOTE(review): the same list object is passed and then
                # cleared right away; this presumably relies on the model
                # copying the list on construction -- confirm.
                results.append(Linkage(word=word, raw_tags=pending_tags))
                pending_tags.clear()
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if word != "":
                results.append(Linkage(word=word, raw_tags=pending_tags))
                pending_tags.clear()
        elif child.kind in (NodeKind.TEMPLATE, NodeKind.ITALIC):
            label = clean_node(wxr, None, child)
            if label.startswith("(") and label.endswith(")"):
                pending_tags.append(label.strip("()"))

    return results
21 changes: 20 additions & 1 deletion src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ class Sound(ItalianBaseModel):
flac_url: str = ""
tags: list[str] = []
raw_tags: list[str] = []
sense: str = ""


class Hyphenation(ItalianBaseModel):
# One hyphenation of a word, optionally tied to a single sense.
# Cleaned text of the hyphenated form; defaults to the empty string.
hyphenation: str = ""
# Sense label the hyphenation applies to (parentheses stripped by the
# extractor); stays empty when the section gives none.
sense: str = ""


class Linkage(ItalianBaseModel):
# A word linked to an entry, as produced by the linkage-section extractor.
# The linked word or phrase; required, there is no default.
word: str
# NOTE(review): no visible code fills ``tags``; presumably normalised tags
# derived from ``raw_tags`` elsewhere -- confirm.
tags: list[str] = []
# Parenthesised labels that preceded the word in its list item, stored
# without the surrounding parentheses.
raw_tags: list[str] = []


class WordEntry(ItalianBaseModel):
Expand All @@ -77,5 +89,12 @@ class WordEntry(ItalianBaseModel):
forms: list[Form] = []
etymology_texts: list[str] = []
etymology_examples: list[Example] = []
hyphenation: str = ""
hyphenations: list[Hyphenation] = []
sounds: list[Sound] = []
synonyms: list[Linkage] = []
antonyms: list[Linkage] = []
derived: list[Linkage] = []
related: list[Linkage] = []
hyponyms: list[Linkage] = []
hypernyms: list[Linkage] = []
proverbs: list[Linkage] = []
7 changes: 6 additions & 1 deletion src/wiktextract/extractor/it/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_citation_section, extract_etymology_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import extract_hyphenation_section, extract_pronunciation_section
from .translation import extract_translation_section

Expand All @@ -31,6 +32,10 @@ def parse_section(
extract_hyphenation_section(wxr, page_data, level_node)
elif title_text == "Pronuncia":
extract_pronunciation_section(wxr, page_data, level_node)
elif title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr, page_data, level_node, LINKAGE_SECTIONS[title_text]
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
12 changes: 5 additions & 7 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,11 @@ def extract_gloss_list_item(
sense = Sense()
for node in list_item.children:
if isinstance(node, TemplateNode):
match node.template_name:
case "Term":
raw_tag = clean_node(wxr, sense, node).strip("() \n")
if raw_tag != "":
sense.raw_tags.append(raw_tag)
case _:
gloss_nodes.append(node)
t_str = clean_node(wxr, sense, node)
if t_str.startswith("(") and t_str.endswith(")"):
sense.raw_tags.append(t_str.strip("()"))
else:
gloss_nodes.append(t_str)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
Expand Down
15 changes: 15 additions & 0 deletions src/wiktextract/extractor/it/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,18 @@
"Codice / Simbolo": {"pos": "symbol"},
"Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
}


# Maps an Italian section title to the name of the ``WordEntry`` list
# attribute that receives the linkages extracted from that section.
# Several titles deliberately share one attribute.
LINKAGE_SECTIONS = {
# "Synonyms"
"Sinonimi": "synonyms",
# "Opposites"
"Contrari": "antonyms",
# "Derived (terms)"
"Derivati": "derived",
# "Related terms"
"Termini correlati": "related",
# "Variants"
"Varianti": "related",
# "Altered forms"
"Alterati": "related",
# "Hyponyms"
"Iponimi": "hyponyms",
# "Hypernyms"
"Iperonimi": "hypernyms",
# "Not to be confused with"
"Da non confondere con": "related",
# "Proverbs and idioms"
"Proverbi e modi di dire": "proverbs",
# "Derived words"
"Parole derivate": "derived",
}
124 changes: 100 additions & 24 deletions src/wiktextract/extractor/it/sound.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,123 @@
from wikitextprocessor import LevelNode, NodeKind
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry
from .models import Hyphenation, Sound, WordEntry


def extract_hyphenation_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
# Collect hyphenation data from one section node and store it on every
# entry of ``page_data`` whose ``lang_code`` equals the last entry's.
#
# NOTE(review): this span was captured from a rendered diff whose +/-
# markers and indentation were lost, so replaced lines (``hyphenation = ""``,
# the bare ``hyphenation = clean_node(...)`` assignment and
# ``data.hyphenation = hyphenation``) appear to sit next to their
# replacements -- confirm against the repository file before relying on it.
hyphenation = ""
# https://it.wiktionary.org/wiki/Aiuto:Sillabazione
hyphenations = []
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
hyphenation = clean_node(wxr, None, list_item.children)
# The list marker (``sarg``) selects how the items are read.
match list_node.sarg:
case ";":
# ";" list: the cleaned text of the whole item is the hyphenation.
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
h_str = clean_node(wxr, None, list_item.children)
if h_str != "":
hyphenations.append(Hyphenation(hyphenation=h_str))
# NOTE(review): nesting of this ``break`` is not recoverable here;
# presumably it stops after the first non-empty item -- confirm.
break
case "*":
# "*" list: italic text gives the sense (parentheses stripped) and
# bold text gives the hyphenation itself.
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
h_data = Hyphenation()
for node in list_item.find_child(
NodeKind.ITALIC | NodeKind.BOLD
):
match node.kind:
case NodeKind.ITALIC:
h_data.sense = clean_node(
wxr, None, node
).strip("()")
case NodeKind.BOLD:
h_data.hyphenation = clean_node(wxr, None, node)
# Items that yielded no hyphenation text are dropped.
if h_data.hyphenation != "":
hyphenations.append(h_data)

# no list: bold nodes found by ``level_node.find_child`` are read as
# hyphenations without a sense.
for node in level_node.find_child(NodeKind.BOLD):
h_str = clean_node(wxr, None, node)
if h_str != "":
hyphenations.append(Hyphenation(hyphenation=h_str))

# Attach the result to the entries that share the last entry's language code.
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.hyphenation = hyphenation
data.hyphenations.extend(hyphenations)


def extract_pronunciation_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
# Collect ``Sound`` objects from one pronunciation section and append them
# to every entry of ``page_data`` whose ``lang_code`` equals the last
# entry's.
#
# NOTE(review): this span was captured from a rendered diff whose +/-
# markers and indentation were lost. The inline ``match`` on the lowered
# template name and the single ``extract_sound_template`` call below look
# like the old and new handling of the same templates shown side by side
# -- confirm against the repository file which of them is live.
# https://it.wiktionary.org/wiki/Aiuto:Pronuncia
sounds = []
# List form: every list item is handed to ``extract_sound_list_item``,
# which appends to ``sounds`` in place.
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
extract_sound_list_item(wxr, list_item, sounds)

# no list: templates found by ``level_node.find_child`` are handled directly.
for t_node in level_node.find_child(NodeKind.TEMPLATE):
match t_node.template_name.lower():
case "ipa":
# First positional argument is taken as the IPA text.
ipa = clean_node(
wxr, None, t_node.template_parameters.get(1, "")
)
if ipa != "":
sounds.append(Sound(ipa=ipa))
case "audio":
# First positional argument is taken as the sound file name.
sound_file = clean_node(
wxr, None, t_node.template_parameters.get(1, "")
)
if sound_file != "":
# An audio file is attached to the most recent sound when one
# exists; otherwise it gets a ``Sound`` of its own.
if len(sounds) > 0:
set_sound_file_url_fields(wxr, sound_file, sounds[-1])
else:
sound = Sound()
set_sound_file_url_fields(wxr, sound_file, sound)
sounds.append(sound)
# Helper call with an empty sense and no raw tags.
extract_sound_template(wxr, t_node, sounds, "", [])

# Attach the result to the entries that share the last entry's language code.
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.sounds.extend(sounds)


def extract_sound_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sounds: list[Sound]
) -> None:
    """Read the pronunciation data of a single list item into ``sounds``.

    Italic and template children of ``list_item`` are visited in order:

    * italic text (parentheses stripped) becomes the sense used for the
      templates that follow it;
    * a template whose lowered name is ``glossa`` contributes its cleaned
      text (parentheses stripped) as a raw tag;
    * every other template is passed to ``extract_sound_template`` together
      with the sense and raw tags gathered so far.

    Nothing is returned; ``sounds`` is extended in place by the helper.
    """
    current_sense = ""
    gloss_tags = []
    wanted_kinds = NodeKind.ITALIC | NodeKind.TEMPLATE

    for child in list_item.find_child(wanted_kinds):
        if child.kind == NodeKind.ITALIC:
            current_sense = clean_node(wxr, None, child).strip("()")
            continue
        if child.kind != NodeKind.TEMPLATE:
            continue
        if child.template_name.lower() != "glossa":
            extract_sound_template(
                wxr, child, sounds, current_sense, gloss_tags
            )
            continue
        gloss_text = clean_node(wxr, None, child)
        gloss_tags.append(gloss_text.strip("()"))


def extract_sound_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sounds: list[Sound],
    sense: str,
    raw_tags: list[str],
) -> None:
    """Convert one pronunciation template into entries of ``sounds``.

    ``IPA`` / ``SAMPA``: positional arguments 1 to 4 are read in order,
    stopping at the first missing one; each non-empty cleaned value is
    appended as a new ``Sound`` (tagged ``SAMPA`` for the SAMPA template).

    ``Audio`` / ``audio``: argument 1 is the sound file and argument 2 an
    optional raw tag. The file is attached to the last element of ``sounds``
    when there is one, otherwise to a freshly created ``Sound`` that is then
    appended.

    Templates with any other name are ignored. Nothing is returned.
    """
    name = t_node.template_name
    params = t_node.template_parameters

    if name in ("IPA", "SAMPA"):
        # https://it.wiktionary.org/wiki/Template:IPA
        # https://it.wiktionary.org/wiki/Template:SAMPA
        for position in range(1, 5):
            if position not in params:
                break
            transcription = clean_node(wxr, None, params.get(position, ""))
            if transcription == "":
                continue
            entry = Sound(ipa=transcription, sense=sense, raw_tags=raw_tags)
            if name.lower() == "sampa":
                entry.tags.append("SAMPA")
            sounds.append(entry)
    elif name in ("Audio", "audio"):
        # https://it.wiktionary.org/wiki/Template:Audio
        file_name = clean_node(wxr, None, params.get(1, ""))
        label = clean_node(wxr, None, params.get(2, ""))
        if file_name == "":
            return

        reuse_last = len(sounds) > 0
        if reuse_last:
            target = sounds[-1]
        else:
            target = Sound(sense=sense, raw_tags=raw_tags)
        set_sound_file_url_fields(wxr, file_name, target)
        if label != "":
            target.raw_tags.append(label)
        if not reuse_last:
            # A new sound joins the list only once it is fully populated.
            sounds.append(target)
44 changes: 44 additions & 0 deletions tests/test_it_linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItLinkage(TestCase):
    """Tests for linkage-section extraction in the Italian extractor."""

    maxDiff = None

    def setUp(self) -> None:
        """Create a fresh extraction context configured for Italian."""
        wtp = Wtp(lang_code="it")
        config = WiktionaryConfig(
            dump_file_lang_code="it", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_synonyms(self):
        """Links, plain text and parenthesised labels in a synonym list."""
        pages = self.wxr.wtp
        pages.add_page("Template:-it-", 10, "Italiano")
        pages.add_page(
            "Template:Fig", 10, "<small>(''senso figurato'')</small>"
        )
        wikitext = """== {{-it-}} ==
===Sostantivo===
# [[animale]]
===Sinonimi===
* [[animale]], amico dell’uomo
* {{Fig}} ''(di freddo)'' [[forte]], [[intenso]]"""
        # Both labels in front of "forte" end up on it; "intenso" gets none.
        expected_synonyms = [
            {"word": "animale"},
            {"word": "amico dell’uomo"},
            {"word": "forte", "raw_tags": ["senso figurato", "di freddo"]},
            {"word": "intenso"},
        ]

        entries = parse_page(self.wxr, "cane", wikitext)

        self.assertEqual(entries[0]["synonyms"], expected_synonyms)
Loading
Loading