From a78495556a37b744ff3b3594347fc5b8706fb305 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 16 Dec 2024 12:00:45 +0800 Subject: [PATCH 1/6] [it] save hyphenation lists data to `hyphenations` field --- src/wiktextract/extractor/it/models.py | 7 +++++- src/wiktextract/extractor/it/sound.py | 32 ++++++++++++++++++++++---- tests/test_it_sound.py | 30 ++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 1e42a0f5..83e9cdd1 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -62,6 +62,11 @@ class Sound(ItalianBaseModel): raw_tags: list[str] = [] +class Hyphenation(ItalianBaseModel): + hyphenation: str = "" + sense: str = "" + + class WordEntry(ItalianBaseModel): model_config = ConfigDict(title="Italian Wiktionary") word: str = Field(description="Word string", min_length=1) @@ -77,5 +82,5 @@ class WordEntry(ItalianBaseModel): forms: list[Form] = [] etymology_texts: list[str] = [] etymology_examples: list[Example] = [] - hyphenation: str = "" + hyphenations: list[Hyphenation] = [] sounds: list[Sound] = [] diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py index 08b9074d..cd40cfd9 100644 --- a/src/wiktextract/extractor/it/sound.py +++ b/src/wiktextract/extractor/it/sound.py @@ -3,19 +3,41 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from ..share import set_sound_file_url_fields -from .models import Sound, WordEntry +from .models import Hyphenation, Sound, WordEntry def extract_hyphenation_section( wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode ) -> None: - hyphenation = "" + # https://it.wiktionary.org/wiki/Aiuto:Sillabazione + hyphenations = [] for list_node in level_node.find_child(NodeKind.LIST): - for list_item in list_node.find_child(NodeKind.LIST_ITEM): - hyphenation = clean_node(wxr, None, list_item.children) + match list_node.sarg: + case ";": + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + h_str = clean_node(wxr, None, list_item.children) + if h_str != "": + hyphenations.append(Hyphenation(hyphenation=h_str)) + break + case "*": + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + h_data = Hyphenation() + for node in list_item.find_child( + NodeKind.ITALIC | NodeKind.BOLD + ): + match node.kind: + case NodeKind.ITALIC: + h_data.sense = clean_node( + wxr, None, node + ).strip("()") + case NodeKind.BOLD: + h_data.hyphenation = clean_node(wxr, None, node) + if h_data.hyphenation != "": + hyphenations.append(h_data) + for data in page_data: if data.lang_code == page_data[-1].lang_code: - data.hyphenation = hyphenation + data.hyphenations.extend(hyphenations) def extract_pronunciation_section( diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py index 62c695b2..bef39981 100644 --- a/tests/test_it_sound.py +++ b/tests/test_it_sound.py @@ -18,7 +18,7 @@ def setUp(self) -> None: ), ) - def test_hyphenation(self): + def test_hyphenation_single_list(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") data = parse_page( self.wxr, @@ -29,7 +29,7 @@ def test_hyphenation(self): ===Sillabazione=== ; cà | ne""", ) - self.assertEqual(data[0]["hyphenation"], "cà | ne") + self.assertEqual(data[0]["hyphenations"], [{"hyphenation": "cà | ne"}]) def test_ipa_audio_templates(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") @@ -46,3 +46,29 @@ def test_ipa_audio_templates(self): sound = data[0]["sounds"][0] self.assertEqual(sound["ipa"], "/ˈkaːne/") self.assertEqual(sound["audio"], "it-cane.ogg") + + def test_hyphenation_lists(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "pesca", + """== {{-it-}} == +===Sostantivo=== +# [[frutto]] del [[pesco]] +===Sillabazione=== +* ''(il frutto e significati correlati)'' '''pè | sca''' +* ''(l'atto del pescare e significati correlati)'' '''pé | sca'''""", + ) + self.assertEqual( + data[0]["hyphenations"], + [ + { + "hyphenation": "pè | sca", + "sense": "il frutto e significati correlati", + }, + { + "hyphenation": "pé | sca", + "sense": "l'atto del pescare e significati correlati", + }, + ], + ) From 042f583ae249588248d9d81a989dc51480f761f8 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 16 Dec 2024 12:06:45 +0800 Subject: [PATCH 2/6] [it] extract direct bold child nodes in hyphenation section --- src/wiktextract/extractor/it/sound.py | 6 ++++++ tests/test_it_sound.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py index cd40cfd9..6bc147bc 100644 --- a/src/wiktextract/extractor/it/sound.py +++ b/src/wiktextract/extractor/it/sound.py @@ -35,6 +35,12 @@ def extract_hyphenation_section( if h_data.hyphenation != "": hyphenations.append(h_data) + # no list + for node in level_node.find_child(NodeKind.BOLD): + h_str = clean_node(wxr, None, node) + if h_str != "": + hyphenations.append(Hyphenation(hyphenation=h_str)) + for data in page_data: if data.lang_code == page_data[-1].lang_code: data.hyphenations.extend(hyphenations) diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py index bef39981..4dbe4b5b 100644 --- a/tests/test_it_sound.py +++ b/tests/test_it_sound.py @@ -72,3 +72,22 @@ def test_hyphenation_lists(self): }, ], ) + + def test_hyphenation_no_list(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cespita", + """== {{-it-}} == +===Sostantivo=== +# [[variante]] di [[ceppita]] +===Sillabazione=== +'''cè | spi | ta''' o '''cé | spi | ta'''""", + ) + self.assertEqual( + data[0]["hyphenations"], + [ + {"hyphenation": "cè | spi | ta"}, + {"hyphenation": "cé | spi | ta"}, + ], + ) From db51476c9baffdb7415ea583b5ca75479398f1fd Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 16 Dec 2024 14:13:18 +0800 Subject: [PATCH 3/6] [it] extract the rest arguments of template "IPA" --- src/wiktextract/extractor/it/sound.py | 20 ++++++++++++++------ tests/test_it_sound.py | 20 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py index 6bc147bc..84a6f811 100644 --- a/src/wiktextract/extractor/it/sound.py +++ b/src/wiktextract/extractor/it/sound.py @@ -52,12 +52,20 @@ def extract_pronunciation_section( sounds = [] for t_node in level_node.find_child(NodeKind.TEMPLATE): match t_node.template_name.lower(): - case "ipa": - ipa = clean_node( - wxr, None, t_node.template_parameters.get(1, "") - ) - if ipa != "": - sounds.append(Sound(ipa=ipa)) + case "ipa" | "sampa": + # https://it.wiktionary.org/wiki/Template:IPA + # https://it.wiktionary.org/wiki/Template:SAMPA + for arg_name in range(1, 5): + if arg_name not in t_node.template_parameters: + break + ipa = clean_node( + wxr, None, t_node.template_parameters.get(arg_name, "") + ) + if ipa != "": + sound = Sound(ipa=ipa) + if t_node.template_name.lower() == "sampa": + sound.tags.append("SAMPA") + sounds.append(sound) case "audio": sound_file = clean_node( wxr, None, t_node.template_parameters.get(1, "") diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py index 4dbe4b5b..091c9489 100644 --- a/tests/test_it_sound.py +++ b/tests/test_it_sound.py @@ -91,3 +91,23 @@ def test_hyphenation_no_list(self): {"hyphenation": "cé | spi | ta"}, ], ) + + def test_sampa(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "Italia", + """== {{-it-}} == +===Nome proprio=== +# [[stato]] +===Pronuncia=== +{{IPA|/iˈtalja/|/iˈtaː.li̯a/}}, {{SAMPA|/i"talja/}} +{{Audio|It-Italia.ogg}}""", + ) + self.assertEqual( + data[0]["sounds"][:2], + [{"ipa": "/iˈtalja/"}, {"ipa": "/iˈtaː.li̯a/"}], + ) + self.assertEqual(data[0]["sounds"][2]["ipa"], '/i"talja/') + self.assertEqual(data[0]["sounds"][2]["tags"], ["SAMPA"]) + self.assertEqual(data[0]["sounds"][2]["audio"], "It-Italia.ogg") From 52fb94c47a18958ebe0b58158ce89b12f8656dd2 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 16 Dec 2024 15:02:32 +0800 Subject: [PATCH 4/6] [it] extract lists in pronunciation section --- src/wiktextract/extractor/it/models.py | 1 + src/wiktextract/extractor/it/sound.py | 89 ++++++++++++++++++-------- tests/test_it_sound.py | 17 +++++ 3 files changed, 80 insertions(+), 27 deletions(-) diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 83e9cdd1..701f53b2 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -60,6 +60,7 @@ class Sound(ItalianBaseModel): flac_url: str = "" tags: list[str] = [] raw_tags: list[str] = [] + sense: str = "" class Hyphenation(ItalianBaseModel): diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py index 84a6f811..daed473e 100644 --- a/src/wiktextract/extractor/it/sound.py +++ b/src/wiktextract/extractor/it/sound.py @@ -1,4 +1,4 @@ -from wikitextprocessor import LevelNode, NodeKind +from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext @@ -49,35 +49,70 @@ def extract_hyphenation_section( def extract_pronunciation_section( wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode ) -> None: + # https://it.wiktionary.org/wiki/Aiuto:Pronuncia sounds = [] + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + extract_sound_list_item(wxr, list_item, sounds) + + # no list for t_node in level_node.find_child(NodeKind.TEMPLATE): - match t_node.template_name.lower(): - case "ipa" | "sampa": - # https://it.wiktionary.org/wiki/Template:IPA - # https://it.wiktionary.org/wiki/Template:SAMPA - for arg_name in range(1, 5): - if arg_name not in t_node.template_parameters: - break - ipa = clean_node( - wxr, None, t_node.template_parameters.get(arg_name, "") - ) - if ipa != "": - sound = Sound(ipa=ipa) - if t_node.template_name.lower() == "sampa": - sound.tags.append("SAMPA") - sounds.append(sound) - case "audio": - sound_file = clean_node( - wxr, None, t_node.template_parameters.get(1, "") - ) - if sound_file != "": - if len(sounds) > 0: - set_sound_file_url_fields(wxr, sound_file, sounds[-1]) - else: - sound = Sound() - set_sound_file_url_fields(wxr, sound_file, sound) - sounds.append(sound) + extract_sound_template(wxr, t_node, sounds, "") for data in page_data: if data.lang_code == page_data[-1].lang_code: data.sounds.extend(sounds) + + +def extract_sound_list_item( + wxr: WiktextractContext, list_item: WikiNode, sounds: list[Sound] +) -> None: + sense = "" + for node in list_item.find_child(NodeKind.ITALIC | NodeKind.TEMPLATE): + match node.kind: + case NodeKind.ITALIC: + sense = clean_node(wxr, None, node).strip("()") + case NodeKind.TEMPLATE: + extract_sound_template(wxr, node, sounds, sense) + + +def extract_sound_template( + wxr: WiktextractContext, + t_node: TemplateNode, + sounds: list[Sound], + sense: str, +) -> None: + match t_node.template_name: + case "IPA" | "SAMPA": + # https://it.wiktionary.org/wiki/Template:IPA + # https://it.wiktionary.org/wiki/Template:SAMPA + for arg_name in range(1, 5): + if arg_name not in t_node.template_parameters: + break + ipa = clean_node( + wxr, None, t_node.template_parameters.get(arg_name, "") + ) + if ipa != "": + sound = Sound(ipa=ipa, sense=sense) + if t_node.template_name.lower() == "sampa": + sound.tags.append("SAMPA") + sounds.append(sound) + case "Audio" | "audio": + # https://it.wiktionary.org/wiki/Template:Audio + sound_file = clean_node( + wxr, None, t_node.template_parameters.get(1, "") + ) + raw_tag = clean_node( + wxr, None, t_node.template_parameters.get(2, "") + ) + if sound_file != "": + if len(sounds) > 0: + set_sound_file_url_fields(wxr, sound_file, sounds[-1]) + if raw_tag != "": + sounds[-1].raw_tags.append(raw_tag) + else: + sound = Sound(sense=sense) + set_sound_file_url_fields(wxr, sound_file, sound) + if raw_tag != "": + sound.raw_tags.append(raw_tag) + sounds.append(sound) diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py index 091c9489..72ca53bc 100644 --- a/tests/test_it_sound.py +++ b/tests/test_it_sound.py @@ -111,3 +111,20 @@ def test_sampa(self): self.assertEqual(data[0]["sounds"][2]["ipa"], '/i"talja/') self.assertEqual(data[0]["sounds"][2]["tags"], ["SAMPA"]) self.assertEqual(data[0]["sounds"][2]["audio"], "It-Italia.ogg") + + def test_sound_list(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "pesca", + """== {{-it-}} == +===Nome proprio=== +# [[frutto]] +===Pronuncia=== +* ''(il frutto e significati correlati)'' {{IPA|/ˈpɛska/}} {{Audio|It-pesca_(frutto).ogg}}""", + ) + self.assertEqual( + data[0]["sounds"][0]["sense"], "il frutto e significati correlati" + ) + self.assertEqual(data[0]["sounds"][0]["ipa"], "/ˈpɛska/") + self.assertEqual(data[0]["sounds"][0]["audio"], "It-pesca_(frutto).ogg") From f84cb54f57f5deb8233194364029db34485a00a5 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 16 Dec 2024 15:38:21 +0800 Subject: [PATCH 5/6] [it] extract "Glossa" tag template in sound lists --- src/wiktextract/extractor/it/pos.py | 12 +++++------- src/wiktextract/extractor/it/sound.py | 13 +++++++++---- tests/test_it_sound.py | 16 ++++++++++++++++ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index f50a3605..f863ff48 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -50,13 +50,11 @@ def extract_gloss_list_item( sense = Sense() for node in list_item.children: if isinstance(node, TemplateNode): - match node.template_name: - case "Term": - raw_tag = clean_node(wxr, sense, node).strip("() \n") - if raw_tag != "": - sense.raw_tags.append(raw_tag) - case _: - gloss_nodes.append(node) + t_str = clean_node(wxr, sense, node) + if t_str.startswith("(") and t_str.endswith(")"): + sense.raw_tags.append(t_str.strip("()")) + else: + gloss_nodes.append(t_str) elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: if node.sarg.endswith("*"): for example_list_item in node.find_child(NodeKind.LIST_ITEM): diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py index daed473e..a507ad01 100644 --- a/src/wiktextract/extractor/it/sound.py +++ b/src/wiktextract/extractor/it/sound.py @@ -57,7 +57,7 @@ def extract_pronunciation_section( # no list for t_node in level_node.find_child(NodeKind.TEMPLATE): - extract_sound_template(wxr, t_node, sounds, "") + extract_sound_template(wxr, t_node, sounds, "", []) for data in page_data: if data.lang_code == page_data[-1].lang_code: @@ -68,12 +68,16 @@ def extract_sound_list_item( wxr: WiktextractContext, list_item: WikiNode, sounds: list[Sound] ) -> None: sense = "" + raw_tags = [] for node in list_item.find_child(NodeKind.ITALIC | NodeKind.TEMPLATE): match node.kind: case NodeKind.ITALIC: sense = clean_node(wxr, None, node).strip("()") case NodeKind.TEMPLATE: - extract_sound_template(wxr, node, sounds, sense) + if node.template_name.lower() == "glossa": + raw_tags.append(clean_node(wxr, None, node).strip("()")) + else: + extract_sound_template(wxr, node, sounds, sense, raw_tags) def extract_sound_template( @@ -81,6 +85,7 @@ def extract_sound_template( t_node: TemplateNode, sounds: list[Sound], sense: str, + raw_tags: list[str], ) -> None: match t_node.template_name: case "IPA" | "SAMPA": @@ -93,7 +98,7 @@ def extract_sound_template( wxr, None, t_node.template_parameters.get(arg_name, "") ) if ipa != "": - sound = Sound(ipa=ipa, sense=sense) + sound = Sound(ipa=ipa, sense=sense, raw_tags=raw_tags) if t_node.template_name.lower() == "sampa": sound.tags.append("SAMPA") sounds.append(sound) @@ -111,7 +116,7 @@ def extract_sound_template( if raw_tag != "": sounds[-1].raw_tags.append(raw_tag) else: - sound = Sound(sense=sense) + sound = Sound(sense=sense, raw_tags=raw_tags) set_sound_file_url_fields(wxr, sound_file, sound) if raw_tag != "": sound.raw_tags.append(raw_tag) diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py index 72ca53bc..30ba4a95 100644 --- a/tests/test_it_sound.py +++ b/tests/test_it_sound.py @@ -128,3 +128,19 @@ def test_sound_list(self): ) self.assertEqual(data[0]["sounds"][0]["ipa"], "/ˈpɛska/") self.assertEqual(data[0]["sounds"][0]["audio"], "It-pesca_(frutto).ogg") + + def test_glossa_tag(self): + self.wxr.wtp.add_page("Template:-en-", 10, "Inglese") + self.wxr.wtp.add_page("Template:glossa", 10, "({{{1}}})") + data = parse_page( + self.wxr, + "large", + """== {{-en-}} == +===Aggettivo=== +# [[largo]] +===Pronuncia=== +*{{glossa|UK}} {{IPA|/lɑːd͡ʒ/}}""", + ) + self.assertEqual( + data[0]["sounds"], [{"raw_tags": ["UK"], "ipa": "/lɑːd͡ʒ/"}] + ) From b92d96e5a9575f3d4cb43e0023c05da0767d02b6 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 16 Dec 2024 17:06:12 +0800 Subject: [PATCH 6/6] [it] extract linkage sections --- src/wiktextract/extractor/it/etymology.py | 1 + src/wiktextract/extractor/it/linkage.py | 50 +++++++++++++++++++ src/wiktextract/extractor/it/models.py | 13 +++++ src/wiktextract/extractor/it/page.py | 7 ++- .../extractor/it/section_titles.py | 15 ++++++ tests/test_it_linkage.py | 44 ++++++++++++++++ 6 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 src/wiktextract/extractor/it/linkage.py create mode 100644 tests/test_it_linkage.py diff --git a/src/wiktextract/extractor/it/etymology.py b/src/wiktextract/extractor/it/etymology.py index 8092e7af..b108f35c 100644 --- a/src/wiktextract/extractor/it/etymology.py +++ b/src/wiktextract/extractor/it/etymology.py @@ -8,6 +8,7 @@ def extract_etymology_section( wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode ) -> None: + # https://it.wiktionary.org/wiki/Aiuto:Etimologia etymology_texts = [] for list_node in level_node.find_child(NodeKind.LIST): for list_item in list_node.find_child(NodeKind.LIST_ITEM): diff --git a/src/wiktextract/extractor/it/linkage.py b/src/wiktextract/extractor/it/linkage.py new file mode 100644 index 00000000..75abd5d5 --- /dev/null +++ b/src/wiktextract/extractor/it/linkage.py @@ -0,0 +1,50 @@ +from wikitextprocessor import LevelNode, NodeKind, WikiNode + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Linkage, WordEntry + + +def extract_linkage_section( + wxr: WiktextractContext, + page_data: list[WordEntry], + level_node: LevelNode, + linkage_type: str, +) -> None: + linkages = [] + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + linkages.extend(extract_linkage_list_item(wxr, list_item)) + + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + getattr(data, linkage_type).extend(linkages) + + +def extract_linkage_list_item( + wxr: WiktextractContext, list_item: WikiNode +) -> list[Linkage]: + raw_tags = [] + linkages = [] + for node in list_item.children: + if isinstance(node, WikiNode): + match node.kind: + case NodeKind.LINK: + node_str = clean_node(wxr, None, node) + if node_str != "": + linkages.append( + Linkage(word=node_str, raw_tags=raw_tags) + ) + raw_tags.clear() + case NodeKind.TEMPLATE | NodeKind.ITALIC: + node_str = clean_node(wxr, None, node) + if node_str.startswith("(") and node_str.endswith(")"): + raw_tags.append(node_str.strip("()")) + elif isinstance(node, str): + for word_str in node.split(","): + word_str = word_str.strip() + if word_str != "": + linkages.append(Linkage(word=word_str, raw_tags=raw_tags)) + raw_tags.clear() + + return linkages diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 701f53b2..7ba272a7 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -68,6 +68,12 @@ class Hyphenation(ItalianBaseModel): sense: str = "" +class Linkage(ItalianBaseModel): + word: str + tags: list[str] = [] + raw_tags: list[str] = [] + + class WordEntry(ItalianBaseModel): model_config = ConfigDict(title="Italian Wiktionary") word: str = Field(description="Word string", min_length=1) @@ -85,3 +91,10 @@ class WordEntry(ItalianBaseModel): etymology_examples: list[Example] = [] hyphenations: list[Hyphenation] = [] sounds: list[Sound] = [] + synonyms: list[Linkage] = [] + antonyms: list[Linkage] = [] + derived: list[Linkage] = [] + related: list[Linkage] = [] + hyponyms: list[Linkage] = [] + hypernyms: list[Linkage] = [] + proverbs: list[Linkage] = [] diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py index 7817a40b..68f51ce2 100644 --- a/src/wiktextract/extractor/it/page.py +++ b/src/wiktextract/extractor/it/page.py @@ -5,9 +5,10 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .etymology import extract_citation_section, extract_etymology_section +from .linkage import extract_linkage_section from .models import Sense, WordEntry from .pos import extract_pos_section -from .section_titles import POS_DATA +from .section_titles import LINKAGE_SECTIONS, POS_DATA from .sound import extract_hyphenation_section, extract_pronunciation_section from .translation import extract_translation_section @@ -31,6 +32,10 @@ def parse_section( extract_hyphenation_section(wxr, page_data, level_node) elif title_text == "Pronuncia": extract_pronunciation_section(wxr, page_data, level_node) + elif title_text in LINKAGE_SECTIONS: + extract_linkage_section( + wxr, page_data, level_node, LINKAGE_SECTIONS[title_text] + ) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level) diff --git a/src/wiktextract/extractor/it/section_titles.py b/src/wiktextract/extractor/it/section_titles.py index 1fc81b45..b5360d75 100644 --- a/src/wiktextract/extractor/it/section_titles.py +++ b/src/wiktextract/extractor/it/section_titles.py @@ -62,3 +62,18 @@ "Codice / Simbolo": {"pos": "symbol"}, "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]}, } + + +LINKAGE_SECTIONS = { + "Sinonimi": "synonyms", + "Contrari": "antonyms", + "Derivati": "derived", + "Termini correlati": "related", + "Varianti": "related", + "Alterati": "related", + "Iponimi": "hyponyms", + "Iperonimi": "hypernyms", + "Da non confondere con": "related", + "Proverbi e modi di dire": "proverbs", + "Parole derivate": "derived", +} diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py new file mode 100644 index 00000000..3aabea24 --- /dev/null +++ b/tests/test_it_linkage.py @@ -0,0 +1,44 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.it.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestItLinkage(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="it"), + WiktionaryConfig( + dump_file_lang_code="it", capture_language_codes=None + ), + ) + + def test_synonyms(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + self.wxr.wtp.add_page( + "Template:Fig", 10, "(''senso figurato'')" + ) + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +# [[animale]] +===Sinonimi=== +* [[animale]], amico dell’uomo +* {{Fig}} ''(di freddo)'' [[forte]], [[intenso]]""", + ) + self.assertEqual( + data[0]["synonyms"], + [ + {"word": "animale"}, + {"word": "amico dell’uomo"}, + {"word": "forte", "raw_tags": ["senso figurato", "di freddo"]}, + {"word": "intenso"}, + ], + )