diff --git a/src/wiktextract/extractor/it/etymology.py b/src/wiktextract/extractor/it/etymology.py
index 8092e7af..b108f35c 100644
--- a/src/wiktextract/extractor/it/etymology.py
+++ b/src/wiktextract/extractor/it/etymology.py
@@ -8,6 +8,7 @@
 def extract_etymology_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
+    # https://it.wiktionary.org/wiki/Aiuto:Etimologia
     etymology_texts = []
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item in list_node.find_child(NodeKind.LIST_ITEM):
diff --git a/src/wiktextract/extractor/it/linkage.py b/src/wiktextract/extractor/it/linkage.py
new file mode 100644
index 00000000..75abd5d5
--- /dev/null
+++ b/src/wiktextract/extractor/it/linkage.py
@@ -0,0 +1,50 @@
+from wikitextprocessor import LevelNode, NodeKind, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Linkage, WordEntry
+
+
+def extract_linkage_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+    linkage_type: str,
+) -> None:
+    linkages = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            linkages.extend(extract_linkage_list_item(wxr, list_item))
+
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            getattr(data, linkage_type).extend(linkages)
+
+
+def extract_linkage_list_item(
+    wxr: WiktextractContext, list_item: WikiNode
+) -> list[Linkage]:
+    raw_tags = []
+    linkages = []
+    for node in list_item.children:
+        if isinstance(node, WikiNode):
+            match node.kind:
+                case NodeKind.LINK:
+                    node_str = clean_node(wxr, None, node)
+                    if node_str != "":
+                        linkages.append(
+                            Linkage(word=node_str, raw_tags=raw_tags)
+                        )
+                        raw_tags.clear()
+                case NodeKind.TEMPLATE | NodeKind.ITALIC:
+                    node_str = clean_node(wxr, None, node)
+                    if node_str.startswith("(") and node_str.endswith(")"):
+                        raw_tags.append(node_str.strip("()"))
+        elif isinstance(node, str):
+            for word_str in node.split(","):
+                word_str = word_str.strip()
+                if word_str != "":
+                    linkages.append(Linkage(word=word_str, raw_tags=raw_tags))
+                    raw_tags.clear()
+
+    return linkages
diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py
index 1e42a0f5..7ba272a7 100644
--- a/src/wiktextract/extractor/it/models.py
+++ b/src/wiktextract/extractor/it/models.py
@@ -60,6 +60,18 @@ class Sound(ItalianBaseModel):
     flac_url: str = ""
     tags: list[str] = []
     raw_tags: list[str] = []
+    sense: str = ""
+
+
+class Hyphenation(ItalianBaseModel):
+    hyphenation: str = ""
+    sense: str = ""
+
+
+class Linkage(ItalianBaseModel):
+    word: str
+    tags: list[str] = []
+    raw_tags: list[str] = []
 
 
 class WordEntry(ItalianBaseModel):
@@ -77,5 +89,12 @@ class WordEntry(ItalianBaseModel):
     forms: list[Form] = []
     etymology_texts: list[str] = []
     etymology_examples: list[Example] = []
-    hyphenation: str = ""
+    hyphenations: list[Hyphenation] = []
     sounds: list[Sound] = []
+    synonyms: list[Linkage] = []
+    antonyms: list[Linkage] = []
+    derived: list[Linkage] = []
+    related: list[Linkage] = []
+    hyponyms: list[Linkage] = []
+    hypernyms: list[Linkage] = []
+    proverbs: list[Linkage] = []
diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py
index 7817a40b..68f51ce2 100644
--- a/src/wiktextract/extractor/it/page.py
+++ b/src/wiktextract/extractor/it/page.py
@@ -5,9 +5,10 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .etymology import extract_citation_section, extract_etymology_section
+from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
-from .section_titles import POS_DATA
+from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .sound import extract_hyphenation_section, extract_pronunciation_section
 from .translation import extract_translation_section
 
@@ -31,6 +32,10 @@ def parse_section(
         extract_hyphenation_section(wxr, page_data, level_node)
     elif title_text == "Pronuncia":
         extract_pronunciation_section(wxr, page_data, level_node)
+    elif title_text in LINKAGE_SECTIONS:
+        extract_linkage_section(
+            wxr, page_data, level_node, LINKAGE_SECTIONS[title_text]
+        )
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py
index f50a3605..f863ff48 100644
--- a/src/wiktextract/extractor/it/pos.py
+++ b/src/wiktextract/extractor/it/pos.py
@@ -50,13 +50,11 @@ def extract_gloss_list_item(
     sense = Sense()
     for node in list_item.children:
         if isinstance(node, TemplateNode):
-            match node.template_name:
-                case "Term":
-                    raw_tag = clean_node(wxr, sense, node).strip("() \n")
-                    if raw_tag != "":
-                        sense.raw_tags.append(raw_tag)
-                case _:
-                    gloss_nodes.append(node)
+            t_str = clean_node(wxr, sense, node)
+            if t_str.startswith("(") and t_str.endswith(")"):
+                sense.raw_tags.append(t_str.strip("()"))
+            else:
+                gloss_nodes.append(t_str)
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             if node.sarg.endswith("*"):
                 for example_list_item in node.find_child(NodeKind.LIST_ITEM):
diff --git a/src/wiktextract/extractor/it/section_titles.py b/src/wiktextract/extractor/it/section_titles.py
index 1fc81b45..b5360d75 100644
--- a/src/wiktextract/extractor/it/section_titles.py
+++ b/src/wiktextract/extractor/it/section_titles.py
@@ -62,3 +62,18 @@
     "Codice / Simbolo": {"pos": "symbol"},
     "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
 }
+
+
+LINKAGE_SECTIONS = {
+    "Sinonimi": "synonyms",
+    "Contrari": "antonyms",
+    "Derivati": "derived",
+    "Termini correlati": "related",
+    "Varianti": "related",
+    "Alterati": "related",
+    "Iponimi": "hyponyms",
+    "Iperonimi": "hypernyms",
+    "Da non confondere con": "related",
+    "Proverbi e modi di dire": "proverbs",
+    "Parole derivate": "derived",
+}
diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py
index 08b9074d..a507ad01 100644
--- a/src/wiktextract/extractor/it/sound.py
+++ b/src/wiktextract/extractor/it/sound.py
@@ -1,47 +1,125 @@
-from wikitextprocessor import LevelNode, NodeKind
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from ..share import set_sound_file_url_fields
-from .models import Sound, WordEntry
+from .models import Hyphenation, Sound, WordEntry
 
 
 def extract_hyphenation_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
-    hyphenation = ""
+    # https://it.wiktionary.org/wiki/Aiuto:Sillabazione
+    hyphenations = []
     for list_node in level_node.find_child(NodeKind.LIST):
-        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
-            hyphenation = clean_node(wxr, None, list_item.children)
+        match list_node.sarg:
+            case ";":
+                for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                    h_str = clean_node(wxr, None, list_item.children)
+                    if h_str != "":
+                        hyphenations.append(Hyphenation(hyphenation=h_str))
+                        break
+            case "*":
+                for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                    h_data = Hyphenation()
+                    for node in list_item.find_child(
+                        NodeKind.ITALIC | NodeKind.BOLD
+                    ):
+                        match node.kind:
+                            case NodeKind.ITALIC:
+                                h_data.sense = clean_node(
+                                    wxr, None, node
+                                ).strip("()")
+                            case NodeKind.BOLD:
+                                h_data.hyphenation = clean_node(wxr, None, node)
+                    if h_data.hyphenation != "":
+                        hyphenations.append(h_data)
+
+    # no list
+    for node in level_node.find_child(NodeKind.BOLD):
+        h_str = clean_node(wxr, None, node)
+        if h_str != "":
+            hyphenations.append(Hyphenation(hyphenation=h_str))
+
     for data in page_data:
         if data.lang_code == page_data[-1].lang_code:
-            data.hyphenation = hyphenation
+            data.hyphenations.extend(hyphenations)
 
 
 def extract_pronunciation_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
+    # https://it.wiktionary.org/wiki/Aiuto:Pronuncia
     sounds = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            extract_sound_list_item(wxr, list_item, sounds)
+
+    # no list
     for t_node in level_node.find_child(NodeKind.TEMPLATE):
-        match t_node.template_name.lower():
-            case "ipa":
-                ipa = clean_node(
-                    wxr, None, t_node.template_parameters.get(1, "")
-                )
-                if ipa != "":
-                    sounds.append(Sound(ipa=ipa))
-            case "audio":
-                sound_file = clean_node(
-                    wxr, None, t_node.template_parameters.get(1, "")
-                )
-                if sound_file != "":
-                    if len(sounds) > 0:
-                        set_sound_file_url_fields(wxr, sound_file, sounds[-1])
-                    else:
-                        sound = Sound()
-                        set_sound_file_url_fields(wxr, sound_file, sound)
-                        sounds.append(sound)
+        extract_sound_template(wxr, t_node, sounds, "", [])
 
     for data in page_data:
         if data.lang_code == page_data[-1].lang_code:
             data.sounds.extend(sounds)
+
+
+def extract_sound_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, sounds: list[Sound]
+) -> None:
+    sense = ""
+    raw_tags = []
+    for node in list_item.find_child(NodeKind.ITALIC | NodeKind.TEMPLATE):
+        match node.kind:
+            case NodeKind.ITALIC:
+                sense = clean_node(wxr, None, node).strip("()")
+            case NodeKind.TEMPLATE:
+                if node.template_name.lower() == "glossa":
+                    raw_tags.append(clean_node(wxr, None, node).strip("()"))
+                else:
+                    extract_sound_template(wxr, node, sounds, sense, raw_tags)
+
+
+def extract_sound_template(
+    wxr: WiktextractContext,
+    t_node: TemplateNode,
+    sounds: list[Sound],
+    sense: str,
+    raw_tags: list[str],
+) -> None:
+    # compare lower-cased names, like the code this replaces, so that
+    # "ipa"/"IPA" and "audio"/"Audio" spellings are all recognised
+    match t_node.template_name.lower():
+        case "ipa" | "sampa":
+            # https://it.wiktionary.org/wiki/Template:IPA
+            # https://it.wiktionary.org/wiki/Template:SAMPA
+            for arg_name in range(1, 5):
+                if arg_name not in t_node.template_parameters:
+                    break
+                ipa = clean_node(
+                    wxr, None, t_node.template_parameters.get(arg_name, "")
+                )
+                if ipa != "":
+                    sound = Sound(ipa=ipa, sense=sense, raw_tags=raw_tags)
+                    if t_node.template_name.lower() == "sampa":
+                        sound.tags.append("SAMPA")
+                    sounds.append(sound)
+        case "audio":
+            # https://it.wiktionary.org/wiki/Template:Audio
+            sound_file = clean_node(
+                wxr, None, t_node.template_parameters.get(1, "")
+            )
+            raw_tag = clean_node(
+                wxr, None, t_node.template_parameters.get(2, "")
+            )
+            if sound_file != "":
+                if len(sounds) > 0:
+                    set_sound_file_url_fields(wxr, sound_file, sounds[-1])
+                    if raw_tag != "":
+                        sounds[-1].raw_tags.append(raw_tag)
+                else:
+                    sound = Sound(sense=sense, raw_tags=raw_tags)
+                    set_sound_file_url_fields(wxr, sound_file, sound)
+                    if raw_tag != "":
+                        sound.raw_tags.append(raw_tag)
+                    sounds.append(sound)
diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py
new file mode 100644
index 00000000..3aabea24
--- /dev/null
+++ b/tests/test_it_linkage.py
@@ -0,0 +1,44 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.it.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestItLinkage(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="it"),
+            WiktionaryConfig(
+                dump_file_lang_code="it", capture_language_codes=None
+            ),
+        )
+
+    def test_synonyms(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        self.wxr.wtp.add_page(
+            "Template:Fig", 10, "(''senso figurato'')"
+        )
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo===
+# [[animale]]
+===Sinonimi===
+* [[animale]], amico dell’uomo
+* {{Fig}} ''(di freddo)'' [[forte]], [[intenso]]""",
+        )
+        self.assertEqual(
+            data[0]["synonyms"],
+            [
+                {"word": "animale"},
+                {"word": "amico dell’uomo"},
+                {"word": "forte", "raw_tags": ["senso figurato", "di freddo"]},
+                {"word": "intenso"},
+            ],
+        )
diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py
index 62c695b2..30ba4a95 100644
--- a/tests/test_it_sound.py
+++ b/tests/test_it_sound.py
@@ -18,7 +18,7 @@ def setUp(self) -> None:
             ),
         )
 
-    def test_hyphenation(self):
+    def test_hyphenation_single_list(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         data = parse_page(
             self.wxr,
@@ -29,7 +29,7 @@ def test_hyphenation(self):
 ===Sillabazione===
 ; cà | ne""",
         )
-        self.assertEqual(data[0]["hyphenation"], "cà | ne")
+        self.assertEqual(data[0]["hyphenations"], [{"hyphenation": "cà | ne"}])
 
     def test_ipa_audio_templates(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
@@ -46,3 +46,101 @@ def test_ipa_audio_templates(self):
         sound = data[0]["sounds"][0]
         self.assertEqual(sound["ipa"], "/ˈkaːne/")
         self.assertEqual(sound["audio"], "it-cane.ogg")
+
+    def test_hyphenation_lists(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "pesca",
+            """== {{-it-}} ==
+===Sostantivo===
+# [[frutto]] del [[pesco]]
+===Sillabazione===
+* ''(il frutto e significati correlati)'' '''pè | sca'''
+* ''(l'atto del pescare e significati correlati)'' '''pé | sca'''""",
+        )
+        self.assertEqual(
+            data[0]["hyphenations"],
+            [
+                {
+                    "hyphenation": "pè | sca",
+                    "sense": "il frutto e significati correlati",
+                },
+                {
+                    "hyphenation": "pé | sca",
+                    "sense": "l'atto del pescare e significati correlati",
+                },
+            ],
+        )
+
+    def test_hyphenation_no_list(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cespita",
+            """== {{-it-}} ==
+===Sostantivo===
+# [[variante]] di [[ceppita]]
+===Sillabazione===
+'''cè | spi | ta''' o '''cé | spi | ta'''""",
+        )
+        self.assertEqual(
+            data[0]["hyphenations"],
+            [
+                {"hyphenation": "cè | spi | ta"},
+                {"hyphenation": "cé | spi | ta"},
+            ],
+        )
+
+    def test_sampa(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "Italia",
+            """== {{-it-}} ==
+===Nome proprio===
+# [[stato]]
+===Pronuncia===
+{{IPA|/iˈtalja/|/iˈtaː.li̯a/}}, {{SAMPA|/i"talja/}}
+{{Audio|It-Italia.ogg}}""",
+        )
+        self.assertEqual(
+            data[0]["sounds"][:2],
+            [{"ipa": "/iˈtalja/"}, {"ipa": "/iˈtaː.li̯a/"}],
+        )
+        self.assertEqual(data[0]["sounds"][2]["ipa"], '/i"talja/')
+        self.assertEqual(data[0]["sounds"][2]["tags"], ["SAMPA"])
+        self.assertEqual(data[0]["sounds"][2]["audio"], "It-Italia.ogg")
+
+    def test_sound_list(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "pesca",
+            """== {{-it-}} ==
+===Nome proprio===
+# [[frutto]]
+===Pronuncia===
+* ''(il frutto e significati correlati)'' {{IPA|/ˈpɛska/}} {{Audio|It-pesca_(frutto).ogg}}""",
+        )
+        self.assertEqual(
+            data[0]["sounds"][0]["sense"], "il frutto e significati correlati"
+        )
+        self.assertEqual(data[0]["sounds"][0]["ipa"], "/ˈpɛska/")
+        self.assertEqual(data[0]["sounds"][0]["audio"], "It-pesca_(frutto).ogg")
+
+    def test_glossa_tag(self):
+        self.wxr.wtp.add_page("Template:-en-", 10, "Inglese")
+        self.wxr.wtp.add_page("Template:glossa", 10, "({{{1}}})")
+        data = parse_page(
+            self.wxr,
+            "large",
+            """== {{-en-}} ==
+===Aggettivo===
+# [[largo]]
+===Pronuncia===
+*{{glossa|UK}} {{IPA|/lɑːd͡ʒ/}}""",
+        )
+        self.assertEqual(
+            data[0]["sounds"], [{"raw_tags": ["UK"], "ipa": "/lɑːd͡ʒ/"}]
+        )