diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py index 42fcc3aa4..56c2149a0 100644 --- a/tests/test_fr_etymology.py +++ b/tests/test_fr_etymology.py @@ -4,7 +4,10 @@ from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.fr.page import extract_etymology +from wiktextract.extractor.fr.etymology import ( + extract_etymology, + insert_etymology_data, +) from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -23,9 +26,150 @@ def tearDown(self) -> None: def test_ebauche_etym(self): # https://fr.wiktionary.org/wiki/Hörsaal + # missing etymology template "ébauche-étym" should be ignored self.wxr.wtp.start_page("") root = self.wxr.wtp.parse(": {{ébauche-étym|de}}") - base_data = defaultdict(list, {"lang_code": "de"}) - page_data = [base_data] - extract_etymology(self.wxr, page_data, base_data, root.children) - self.assertEqual(page_data, [{"lang_code": "de"}]) + etymology_data = extract_etymology(self.wxr, root.children) + self.assertIsNone(etymology_data) + + def test_list_etymologies(self): + # https://fr.wiktionary.org/wiki/lenn + self.wxr.wtp.start_page("lenn") + root = self.wxr.wtp.parse( + """* [[#br-nom-1|Nom commun 1 :]] +: Du vieux breton lin (« lac, étang ; liquide, humeur »). +: Du moyen breton lenn. +* [[#br-nom-2|Nom commun 2 :]] +:Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »).""" + ) + etymology_data = extract_etymology(self.wxr, root.children) + self.assertEqual( + etymology_data, + { + "Nom commun 1": [ + "Du vieux breton lin (« lac, étang ; liquide, humeur »).", + "Du moyen breton lenn.", + ], + "Nom commun 2": [ + "Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)." 
+ ], + }, + ) + page_data = [ + defaultdict( + list, + {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 1"}, + ), + defaultdict( + list, + {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 2"}, + ), + ] + insert_etymology_data("fr", page_data, etymology_data) + self.assertEqual( + page_data, + [ + { + "lang_code": "fr", + "pos": "noun", + "pos_title": "Nom commun 1", + "etymology_texts": [ + "Du vieux breton lin (« lac, étang ; liquide, humeur »).", + "Du moyen breton lenn.", + ], + }, + { + "lang_code": "fr", + "pos": "noun", + "pos_title": "Nom commun 2", + "etymology_texts": [ + "Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)." + ], + }, + ], + ) + + def test_indent_etymology_with_pos_template(self): + # https://fr.wiktionary.org/wiki/dame + self.wxr.wtp.start_page("damn") + self.wxr.wtp.add_page("Modèle:lien-ancre-étym", 10, "({{{2}}} {{{3}}})") + root = self.wxr.wtp.parse( + """: {{lien-ancre-étym|fr|Nom commun|1}} Du latin domina (« maîtresse de maison »). +: {{lien-ancre-étym|fr|Nom commun|2}} Du moyen néerlandais dam (« digue »). +: {{lien-ancre-étym|fr|Interjection|1}} Abréviation de « [[Notre-Dame]] ! » ou de « dame Dieu ! » (« [[Seigneur Dieu]] ! »). +""" + ) + etymology_data = extract_etymology(self.wxr, root.children) + self.assertEqual( + etymology_data, + { + "Nom commun 1": ["Du latin domina (« maîtresse de maison »)."], + "Nom commun 2": ["Du moyen néerlandais dam (« digue »)."], + "Interjection 1": [ + "Abréviation de « Notre-Dame ! » ou de « dame Dieu ! » (« Seigneur Dieu ! »)." 
+ ], + }, + ) + page_data = [ + defaultdict( + list, + {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 1"}, + ), + defaultdict( + list, + {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 2"}, + ), + defaultdict( + list, + {"lang_code": "fr", "pos": "intj", "pos_title": "Interjection"}, + ), + ] + insert_etymology_data("fr", page_data, etymology_data) + self.assertEqual( + page_data, + [ + { + "lang_code": "fr", + "pos": "noun", + "pos_title": "Nom commun 1", + "etymology_texts": [ + "Du latin domina (« maîtresse de maison »)." + ], + }, + { + "lang_code": "fr", + "pos": "noun", + "pos_title": "Nom commun 2", + "etymology_texts": [ + "Du moyen néerlandais dam (« digue »)." + ], + }, + { + "lang_code": "fr", + "pos": "intj", + "pos_title": "Interjection", + "etymology_texts": [ + "Abréviation de « Notre-Dame ! » ou de « dame Dieu ! » (« Seigneur Dieu ! »)." + ], + }, + ], + ) + + def test_indent_etymology_with_italic_pos(self): + # https://fr.wiktionary.org/wiki/hélas + self.wxr.wtp.start_page("hélas") + root = self.wxr.wtp.parse( + """: (''[[#Interjection|Interjection]]'') XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux ». +: (''[[#fr-nom|Nom]]'') Par [[substantivation]] de l’interjection. +""" + ) + etymology_data = extract_etymology(self.wxr, root.children) + self.assertEqual( + etymology_data, + { + "Interjection": [ + "XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux »." 
+ ], + "Nom commun": ["Par substantivation de l’interjection."], + }, + ) diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index 89b0e5660..43744e5b6 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -122,13 +122,19 @@ def test_zh_exemple_template(self): ) page_data = [defaultdict(list)] process_pos_block( - self.wxr, page_data, defaultdict(list), root.children[0], "nom" + self.wxr, + page_data, + defaultdict(list), + root.children[0], + "nom", + "Nom commun", ) self.assertEqual( page_data, [ { "pos": "noun", + "pos_title": "Nom commun", "senses": [ { "glosses": ["Cheval."], diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py index d773ca776..c481027b9 100644 --- a/tests/test_fr_page.py +++ b/tests/test_fr_page.py @@ -18,12 +18,6 @@ def setUp(self): conf1 = WiktionaryConfig( dump_file_lang_code="fr", capture_language_codes=None, - capture_translations=True, - capture_pronunciation=True, - capture_linkages=True, - capture_compounds=True, - capture_redirects=True, - capture_examples=True, ) self.wxr = WiktextractContext(Wtp(lang_code="fr"), conf1) @@ -52,6 +46,7 @@ def test_fr_parse_page(self): "lang": "Français", "lang_code": "fr", "pos": "noun", + "pos_title": "Nom commun", "word": "exemple", } ], diff --git a/wiktextract/datautils.py b/wiktextract/datautils.py index e1f646605..313f82fc6 100644 --- a/wiktextract/datautils.py +++ b/wiktextract/datautils.py @@ -259,6 +259,7 @@ def append_base_data( # append new dictionary if the last dictionary has sense data and # also has the same key page_data.append(copy.deepcopy(base_data)) + page_data[-1][field] = value elif isinstance(page_data[-1].get(field), list): page_data[-1][field] += value else: diff --git a/wiktextract/extractor/fr/etymology.py b/wiktextract/extractor/fr/etymology.py new file mode 100644 index 000000000..1760a4996 --- /dev/null +++ b/wiktextract/extractor/fr/etymology.py @@ -0,0 +1,118 @@ +from collections import defaultdict +from typing import Dict, List, Optional, 
# Maps a POS title (e.g. "Nom commun 1") to the etymology texts listed for it.
# The key is None for texts that are not attached to any POS title.
EtymologyData = Dict[Optional[str], List[str]]


def extract_etymology(
    wxr: "WiktextractContext",
    nodes: "List[Union[WikiNode, str]]",
) -> Optional[EtymologyData]:
    """
    Collect etymology texts from the child nodes of an etymology section.

    Only the nodes that come before the first nested level (subtitle) node
    are inspected. A `*` list sets the POS title used for the `:` lists that
    follow it; each item of a `:` list becomes one etymology text, filed
    under the POS title found at the start of the item if there is one,
    otherwise under the current `*` title (None if no `*` list was seen).

    Return None if the section contains the "ébauche-étym" template (missing
    etymology placeholder), otherwise a plain dict of POS title to texts,
    which is empty if no `:` list item was found.
    """
    etymology_dict: EtymologyData = defaultdict(list)
    # only look at nodes after the etymology subtitle and before the next
    # level node
    level_node_index = len(nodes)
    for index, node in enumerate(nodes):
        if isinstance(node, WikiNode) and node.kind in LEVEL_KINDS:
            level_node_index = index
            break

    pos_title: Optional[str] = None
    for etymology_node in nodes[:level_node_index]:
        if not (
            isinstance(etymology_node, WikiNode)
            and etymology_node.kind == NodeKind.LIST
        ):
            continue

        if etymology_node.sarg == "*":
            # heading list such as "* Nom commun 1 :", keep only the title
            pos_title = clean_node(wxr, None, etymology_node)
            pos_title = pos_title.removeprefix("* ").removesuffix(" :")
        elif etymology_node.sarg == ":":
            # ignore missing etymology template "ébauche-étym"
            for template_node in etymology_node.find_child_recursively(
                NodeKind.TEMPLATE
            ):
                if template_node.template_name == "ébauche-étym":
                    return None

            for etymology_item in etymology_node.find_child(
                NodeKind.LIST_ITEM
            ):
                etymology_data = find_pos_in_etymology_list(
                    wxr, etymology_item
                )
                if etymology_data is not None:
                    new_pos_title, new_etymology_text = etymology_data
                    etymology_dict[new_pos_title].append(new_etymology_text)
                else:
                    etymology_text = clean_node(
                        wxr, None, etymology_item.children
                    )
                    etymology_dict[pos_title].append(etymology_text)

    # hand back a plain dict so that key lookups by callers can't silently
    # create empty entries
    return dict(etymology_dict)


def find_pos_in_etymology_list(
    wxr: "WiktextractContext", list_item_node: "WikiNode"
) -> Optional[Tuple[str, str]]:
    """
    Return tuple of POS title and etymology text if the passed list item node
    starts with a POS template or an italic POS node, otherwise return None.

    Two layouts are recognized:
    - the first child is a "lien-ancre-étym" template; its cleaned text with
      the surrounding parentheses stripped is the POS title
    - the first three children are a string ending with "(", an italic node
      and a string starting with ")"; the cleaned italic text is the POS
      title, with "Nom" expanded to "Nom commun"
    """
    child_nodes = list(list_item_node.filter_empty_str_child())
    if not child_nodes:
        return None

    first_node = child_nodes[0]
    if (
        isinstance(first_node, TemplateNode)
        and first_node.template_name == "lien-ancre-étym"
    ):
        return (
            clean_node(wxr, None, first_node).strip("()"),
            clean_node(wxr, None, child_nodes[1:]),
        )

    if (
        # the italic layout needs three nodes, checking the length first
        # avoids an IndexError when the italic node is the last child
        len(child_nodes) > 2
        and isinstance(first_node, str)
        and first_node.endswith("(")
        and isinstance(child_nodes[1], WikiNode)
        and child_nodes[1].kind == NodeKind.ITALIC
        and isinstance(child_nodes[2], str)
        and child_nodes[2].startswith(")")
    ):
        # italic pos
        pos_title = clean_node(wxr, None, child_nodes[1])
        if pos_title == "Nom":
            pos_title = "Nom commun"
        return pos_title, clean_node(
            wxr, None, child_nodes[2:]
        ).removeprefix(") ")

    return None


def insert_etymology_data(
    lang_code: str, page_data: List[Dict], etymology_data: EtymologyData
) -> None:
    """
    Store extracted etymology texts in the matching `page_data` dictionaries.

    Only dictionaries whose "lang_code" equals `lang_code` are considered.
    Texts filed under the None key are added to all of them; other texts are
    added to the dictionaries whose "pos_title" equals the key, or equals the
    key without a trailing " 1". Texts that match nothing are dropped. The
    texts are written to the "etymology_texts" key, replacing any previous
    value, and every dictionary receives its own copy of the list.
    """
    # group by pos title, several dictionaries may share the same title
    senses_by_title: Dict[Optional[str], List[Dict]] = defaultdict(list)
    for sense_data in page_data:
        if sense_data.get("lang_code") == lang_code:
            senses_by_title[sense_data.get("pos_title")].append(sense_data)

    for pos_title, etymology_texts in etymology_data.items():
        if pos_title is None:  # add to all sense dictionaries
            targets = [
                sense_data
                for same_title_senses in senses_by_title.values()
                for sense_data in same_title_senses
            ]
        elif pos_title in senses_by_title:
            targets = senses_by_title[pos_title]
        else:
            # an index number is added in the etymology section but not added
            # in POS title
            targets = senses_by_title.get(pos_title.removesuffix(" 1"), [])

        for sense_data in targets:
            # copy, so a later in-place change of one dictionary's list does
            # not leak into the others or into `etymology_data`
            sense_data["etymology_texts"] = list(etymology_texts)
-34,77 +36,61 @@ def parse_section( wxr: WiktextractContext, page_data: List[Dict], base_data: Dict, - level_node: Union[WikiNode, List[Union[WikiNode, str]]], -) -> None: + level_node: WikiNode, +) -> Optional[List[EtymologyData]]: # Page structure: https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages - if isinstance(level_node, list): - for x in level_node: - parse_section(wxr, page_data, base_data, x) - return - if not isinstance(level_node, WikiNode): - return - if level_node.kind in LEVEL_KINDS: - for level_node_template in level_node.find_content(NodeKind.TEMPLATE): - if level_node_template.template_name == "S": - # French Wiktionary uses a `S` template for all subtitles, we - # could find the subtitle type by only checking the template - # parameter. - # https://fr.wiktionary.org/wiki/Modèle:S - # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections - section_type = level_node_template.template_parameters.get(1) - subtitle = clean_node(wxr, page_data[-1], level_node.largs) - wxr.wtp.start_subsection(subtitle) - if ( - section_type - in wxr.config.OTHER_SUBTITLES["ignored_sections"] - ): - pass - # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots - elif section_type in wxr.config.POS_SUBTITLES: - process_pos_block( - wxr, page_data, base_data, level_node, section_type - ) - elif ( - wxr.config.capture_etymologies - and section_type in wxr.config.OTHER_SUBTITLES["etymology"] - ): - extract_etymology( - wxr, page_data, base_data, level_node.children - ) - elif ( - wxr.config.capture_pronunciation - and section_type - in wxr.config.OTHER_SUBTITLES["pronunciation"] - ): - extract_pronunciation(wxr, page_data, level_node) - elif ( - wxr.config.capture_linkages - and section_type in wxr.config.LINKAGE_SUBTITLES - ): - extract_linkage( - wxr, - page_data, - level_node, - wxr.config.LINKAGE_SUBTITLES.get(section_type), - ) - elif ( - wxr.config.capture_translations - and section_type - in 
wxr.config.OTHER_SUBTITLES["translations"] - ): - extract_translation(wxr, page_data, level_node) - elif ( - wxr.config.capture_inflections - and section_type - in wxr.config.OTHER_SUBTITLES["inflection_sections"] - ): - pass - else: - pass - # wxr.wtp.debug( - # f"Unhandled section type: {subtitle}", - # sortid="extractor/fr/page/parse_section/192", - # ) + for level_node_template in level_node.find_content(NodeKind.TEMPLATE): + if level_node_template.template_name == "S": + # French Wiktionary uses a `S` template for all subtitles, we could + # find the subtitle type by only checking the template parameter. + # https://fr.wiktionary.org/wiki/Modèle:S + # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections + section_type = level_node_template.template_parameters.get(1) + subtitle = clean_node(wxr, page_data[-1], level_node.largs) + wxr.wtp.start_subsection(subtitle) + if section_type in wxr.config.OTHER_SUBTITLES["ignored_sections"]: + pass + # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots + elif section_type in wxr.config.POS_SUBTITLES: + process_pos_block( + wxr, + page_data, + base_data, + level_node, + section_type, + subtitle, + ) + elif ( + wxr.config.capture_etymologies + and section_type in wxr.config.OTHER_SUBTITLES["etymology"] + ): + return extract_etymology(wxr, level_node.children) + elif ( + wxr.config.capture_pronunciation + and section_type in wxr.config.OTHER_SUBTITLES["pronunciation"] + ): + extract_pronunciation(wxr, page_data, level_node) + elif ( + wxr.config.capture_linkages + and section_type in wxr.config.LINKAGE_SUBTITLES + ): + extract_linkage( + wxr, + page_data, + level_node, + wxr.config.LINKAGE_SUBTITLES.get(section_type), + ) + elif ( + wxr.config.capture_translations + and section_type in wxr.config.OTHER_SUBTITLES["translations"] + ): + extract_translation(wxr, page_data, level_node) + elif ( + wxr.config.capture_inflections + and section_type + in 
wxr.config.OTHER_SUBTITLES["inflection_sections"] + ): + pass def process_pos_block( @@ -113,10 +99,11 @@ def process_pos_block( base_data: Dict, pos_title_node: TemplateNode, pos_argument: str, + pos_title: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"] - base_data["pos"] = pos_type append_base_data(page_data, "pos", pos_type, base_data) + page_data[-1]["pos_title"] = pos_title child_nodes = list(pos_title_node.filter_empty_str_child()) form_line_start = 0 # Ligne de forme gloss_start = len(child_nodes) @@ -143,44 +130,11 @@ def process_pos_block( extract_gloss(wxr, page_data, child) elif child.kind in LEVEL_KINDS: parse_section(wxr, page_data, base_data, child) - else: - parse_section(wxr, page_data, base_data, child) form_line_nodes = child_nodes[form_line_start:gloss_start] extract_form_line(wxr, page_data, form_line_nodes) -def extract_etymology( - wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - nodes: List[Union[WikiNode, str]], -) -> None: - level_node_index = len(nodes) - for index, node in enumerate(nodes): - if isinstance(node, WikiNode) and node.kind in LEVEL_KINDS: - level_node_index = index - break - # ignore missing etymology template "ébauche-étym" - for etymology_node in nodes[:level_node_index]: - if isinstance(etymology_node, WikiNode): - if ( - etymology_node.kind == NodeKind.TEMPLATE - and etymology_node.template_name == "ébauche-étym" - ): - return - for node in etymology_node.find_child_recursively( - NodeKind.TEMPLATE - ): - if node.template_name == "ébauche-étym": - return - etymology = clean_node(wxr, page_data[-1], nodes[:level_node_index]) - base_data["etymology_text"] = etymology - append_base_data(page_data, "etymology_text", etymology, base_data) - if level_node_index < len(nodes): - parse_section(wxr, page_data, base_data, nodes[level_node_index:]) - - def parse_page( wxr: WiktextractContext, page_title: str, page_text: str ) -> List[Dict[str, str]]: @@ -240,6 +194,15 @@ def parse_page( ) 
base_data.update(categories_and_links) page_data.append(copy.deepcopy(base_data)) - parse_section(wxr, page_data, base_data, node.children) + etymology_data: Optional[EtymologyData] = None + for level_three_node in node.find_child(NodeKind.LEVEL3): + new_etymology_data = parse_section( + wxr, page_data, base_data, level_three_node + ) + if new_etymology_data is not None: + etymology_data = new_etymology_data + + if etymology_data is not None: + insert_etymology_data(lang_code, page_data, etymology_data) return page_data