From c8cff9c29f4659aec4dd5dc427a42ebabfe50161 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 27 Sep 2023 15:22:08 +0800 Subject: [PATCH 1/3] Extract "ja-mot" form line template --- wiktextract/extractor/fr/form_line.py | 32 ++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/wiktextract/extractor/fr/form_line.py b/wiktextract/extractor/fr/form_line.py index 80644cb8..eb20c30b 100644 --- a/wiktextract/extractor/fr/form_line.py +++ b/wiktextract/extractor/fr/form_line.py @@ -35,6 +35,8 @@ def extract_form_line( process_equiv_pour_template(wxr, node, page_data) elif node.template_name.startswith("zh-mot"): process_zh_mot_template(wxr, node, page_data) + elif node.template_name == "ja-mot": + process_ja_mot_template(wxr, node, page_data) else: tag = clean_node(wxr, page_data[-1], node) if ( @@ -75,7 +77,7 @@ def process_zh_mot_template( node: TemplateNode, page_data: List[Dict], ) -> None: - # zh-mot, zh-mot-s, zh-mot-t + # Chinese form line template: zh-mot, zh-mot-s, zh-mot-t # https://fr.wiktionary.org/wiki/Modèle:zh-mot node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(node), @@ -83,14 +85,38 @@ def process_zh_mot_template( additional_expand={node.template_name}, ) for template_node in node.find_child(NodeKind.TEMPLATE): - if template_node.template_name == "lang": + if template_node.template_name.lower() == "lang": page_data[-1]["sounds"].append( { "zh-pron": clean_node(wxr, None, template_node), "tags": ["Pinyin"], } ) - elif template_node.template_name == "pron": + elif template_node.template_name in ("pron", "prononciation"): page_data[-1]["sounds"].append( {"ipa": clean_node(wxr, None, template_node)} ) + +def process_ja_mot_template( + wxr: WiktextractContext, + template_node: TemplateNode, + page_data: List[Dict], +) -> None: + # Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(template_node), + expand_all=True + ) + existing_forms = 
{existing_form.get("form") for existing_form in page_data[-1].get("forms", [])} + for index, node in expanded_node.find_html("span", with_index=True): + # the first span tag is the word, the second is Hepburn romanization + if index == 1: + form_text = clean_node(wxr, None, node) + if form_text not in existing_forms: + # avoid adding duplicated form data extracted from + # inflection table before the form line + page_data[-1]["forms"].append({ + "form": roman_form, + "tags": ["romanization"] + }) + break From 53bf50258ba082a941278f466fa1572aee3dd8f2 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 27 Sep 2023 16:40:32 +0800 Subject: [PATCH 2/3] =?UTF-8?q?Extract=20inflection=20table=20expanded=20f?= =?UTF-8?q?rom=20template=20"lt-d=C3=A9cl-as"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This template creates some empty table cells, which breaks the code that counts the number of table columns. --- tests/test_fr_inflection.py | 35 +++++++++++++++---- wiktextract/extractor/fr/form_line.py | 16 +++++---- wiktextract/extractor/fr/inflection.py | 47 +++++++++++++++----------- wiktextract/extractor/fr/page.py | 2 +- 4 files changed, 66 insertions(+), 34 deletions(-) diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py index ed90a852..d68b328d 100644 --- a/tests/test_fr_inflection.py +++ b/tests/test_fr_inflection.py @@ -40,7 +40,7 @@ def test_fr_reg(self, mock_node_to_wikitext): page_data = [defaultdict(list, {"word": "productrice"})] node = TemplateNode(0) self.wxr.wtp.start_page("productrice") - extract_inflection(self.wxr, page_data, node, "fr-rég") + extract_inflection(self.wxr, page_data, node) self.assertEqual( page_data[-1].get("forms"), [{"form": "productrices", "tags": ["Pluriel"]}], @@ -68,7 +68,7 @@ def test_fr_accord_al(self, mock_node_to_wikitext): page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})] node = TemplateNode(0) self.wxr.wtp.start_page("animal") - extract_inflection(self.wxr, 
page_data, node, "fr-accord-al") + extract_inflection(self.wxr, page_data, node) self.assertEqual( page_data[-1].get("forms"), [ @@ -104,7 +104,7 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext): page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})] node = TemplateNode(0) self.wxr.wtp.start_page("ration") - extract_inflection(self.wxr, page_data, node, "en-nom-rég") + extract_inflection(self.wxr, page_data, node) self.assertEqual( page_data[-1].get("forms"), [ @@ -131,7 +131,7 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext): page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})] node = TemplateNode(0) self.wxr.wtp.start_page("ration") - extract_inflection(self.wxr, page_data, node, "en-conj-rég") + extract_inflection(self.wxr, page_data, node) self.assertEqual( page_data[-1].get("forms"), [ @@ -158,7 +158,7 @@ def test_invalid_ipa(self, mock_node_to_wikitext): page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})] node = TemplateNode(0) self.wxr.wtp.start_page("animal") - extract_inflection(self.wxr, page_data, node, "ast-accord-mf") + extract_inflection(self.wxr, page_data, node) self.assertEqual( page_data[-1].get("forms"), [{"tags": ["Pluriel"], "form": "animales"}], @@ -180,8 +180,31 @@ def test_no_column_headers(self, mock_node_to_wikitext): page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})] node = TemplateNode(0) self.wxr.wtp.start_page("一万") - extract_inflection(self.wxr, page_data, node, "zh-formes") + extract_inflection(self.wxr, page_data, node) self.assertEqual( page_data[-1].get("forms"), [{"tags": ["Traditionnel"], "form": "一萬"}], ) + + @patch( + "wikitextprocessor.Wtp.node_to_wikitext", + return_value="""{| class="flextable" +!Cas +! Singulier +! Pluriel +|- +! 
Nominatif +|| [[abadas#lt|abadas]] +|| '''abadai''' +|}""", + ) + def test_lt_décl_as(self, mock_node_to_wikitext): + # empty table cells should be ignored + page_data = [defaultdict(list, {"lang_code": "lt", "word": "abadai"})] + node = TemplateNode(0) + self.wxr.wtp.start_page("abadai") + extract_inflection(self.wxr, page_data, node) + self.assertEqual( + page_data[-1].get("forms"), + [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}], + ) diff --git a/wiktextract/extractor/fr/form_line.py b/wiktextract/extractor/fr/form_line.py index eb20c30b..21278c35 100644 --- a/wiktextract/extractor/fr/form_line.py +++ b/wiktextract/extractor/fr/form_line.py @@ -97,6 +97,7 @@ def process_zh_mot_template( {"ipa": clean_node(wxr, None, template_node)} ) + def process_ja_mot_template( wxr: WiktextractContext, template_node: TemplateNode, @@ -104,10 +105,12 @@ def process_ja_mot_template( ) -> None: # Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot expanded_node = wxr.wtp.parse( - wxr.wtp.node_to_wikitext(template_node), - expand_all=True + wxr.wtp.node_to_wikitext(template_node), expand_all=True ) - existing_forms = {existing_form.get("form") for existing_form in page_data[-1].get("forms", [])} + existing_forms = { + existing_form.get("form") + for existing_form in page_data[-1].get("forms", []) + } for index, node in expanded_node.find_html("span", with_index=True): # the first span tag is the word, the second is Hepburn romanization if index == 1: @@ -115,8 +118,7 @@ def process_ja_mot_template( if form_text not in existing_forms: # avoid adding duplicated form data extracted from # inflection table before the form line - page_data[-1]["forms"].append({ - "form": roman_form, - "tags": ["romanization"] - }) + page_data[-1]["forms"].append( + {"form": form_text, "tags": ["romanization"]} + ) break diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py index a978118b..8709b39e 100644 --- 
a/wiktextract/extractor/fr/inflection.py +++ b/wiktextract/extractor/fr/inflection.py @@ -2,6 +2,7 @@ from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -12,12 +13,11 @@ def extract_inflection( wxr: WiktextractContext, page_data: List[Dict], - node: WikiNode, - template_name: str, + template_node: TemplateNode, ) -> None: # inflection templates # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français - process_inflection_table(wxr, page_data, node) + process_inflection_table(wxr, page_data, template_node) IGNORE_TABLE_HEADERS = { @@ -43,15 +43,27 @@ def process_inflection_table( return table_node = table_nodes[0] column_headers = [] + first_row_has_data_cell = False for row_num, table_row in enumerate( table_node.find_child(NodeKind.TABLE_ROW) ): - table_row_nodes = list(table_row.filter_empty_str_child()) - first_row_has_data_cell = False + # filter empty table cells + table_row_nodes = [ + row_node_child + for row_node_child in table_row.children + if isinstance(row_node_child, WikiNode) + and ( + row_node_child.kind == NodeKind.TABLE_HEADER_CELL + or ( + row_node_child.kind == NodeKind.TABLE_CELL + and len(row_node_child.children) > 0 + ) + ) + and row_node_child.attrs.get("style") != "display:none" + ] if row_num == 0: - first_row_has_data_cell = not any( - isinstance(cell, WikiNode) - and cell.kind == NodeKind.TABLE_CELL + first_row_has_data_cell = any( + isinstance(cell, WikiNode) and cell.kind == NodeKind.TABLE_CELL for cell in table_row_nodes ) @@ -59,24 +71,19 @@ def process_inflection_table( # data row has one more column then header: "fr-accord-al" template column_headers.insert(0, "") - row_header = "" + row_headers = [] for column_num, table_cell in enumerate(table_row_nodes): form_data = defaultdict(list) if isinstance(table_cell, WikiNode): if table_cell.kind == 
NodeKind.TABLE_HEADER_CELL: table_header_text = clean_node(wxr, None, table_cell) - if row_num == 0 and first_row_has_data_cell: + if row_num == 0 and not first_row_has_data_cell: # if cells of the first row are not all header cells # then the header cells are row headers but not column # headers column_headers.append(table_header_text) - elif ( - column_num == 0 - and table_header_text not in IGNORE_TABLE_HEADERS - ): - row_header = table_header_text elif table_header_text not in IGNORE_TABLE_HEADERS: - form_data["tags"].append(table_header_text) + row_headers.append(table_header_text) elif table_cell.kind == NodeKind.TABLE_CELL: table_cell_lines = clean_node(wxr, None, table_cell) for table_cell_line in table_cell_lines.splitlines(): @@ -94,7 +101,7 @@ def process_inflection_table( ): form_data["tags"].append(column_headers[column_num]) - if len(row_header) > 0: - form_data["tags"].append(row_header) - if "form" in form_data: - page_data[-1]["forms"].append(form_data) + if len(row_headers) > 0: + form_data["tags"].extend(row_headers) + if "form" in form_data: + page_data[-1]["forms"].append(form_data) diff --git a/wiktextract/extractor/fr/page.py b/wiktextract/extractor/fr/page.py index a11558c1..d968e3f2 100644 --- a/wiktextract/extractor/fr/page.py +++ b/wiktextract/extractor/fr/page.py @@ -124,7 +124,7 @@ def process_pos_block( # skip form line templates continue elif template_name.startswith(f"{lang_code}-"): - extract_inflection(wxr, page_data, child, template_name) + extract_inflection(wxr, page_data, child) elif child.kind == NodeKind.BOLD: form_line_start = index + 1 elif child.kind == NodeKind.LIST: From 489751567791db342212b4699d1d6452c31af4f6 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 27 Sep 2023 18:08:38 +0800 Subject: [PATCH 3/3] Extract inflection table template "fr-accord-s" this template creates a table cell with "invisible" class which should be ignored --- tests/test_fr_inflection.py | 48 +++++++++++++++++++++++++- 
wiktextract/extractor/fr/inflection.py | 4 ++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py index d68b328d..bbb9dadb 100644 --- a/tests/test_fr_inflection.py +++ b/tests/test_fr_inflection.py @@ -64,7 +64,6 @@ def test_fr_reg(self, mock_node_to_wikitext): ) def test_fr_accord_al(self, mock_node_to_wikitext): # https://fr.wiktionary.org/wiki/animal#Adjectif - self.maxDiff = None page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})] node = TemplateNode(0) self.wxr.wtp.start_page("animal") @@ -101,6 +100,7 @@ def test_fr_accord_al(self, mock_node_to_wikitext): ) def test_multiple_lines_ipa(self, mock_node_to_wikitext): # https://fr.wiktionary.org/wiki/ration#Nom_commun_2 + # template "en-nom-rég" page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})] node = TemplateNode(0) self.wxr.wtp.start_page("ration") @@ -128,6 +128,7 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext): ) def test_single_line_multiple_ipa(self, mock_node_to_wikitext): # https://fr.wiktionary.org/wiki/ration#Verbe + # template "en-conj-rég" page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})] node = TemplateNode(0) self.wxr.wtp.start_page("ration") @@ -155,6 +156,7 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext): ) def test_invalid_ipa(self, mock_node_to_wikitext): # https://fr.wiktionary.org/wiki/animal#Nom_commun_3 + # template "ast-accord-mf" page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})] node = TemplateNode(0) self.wxr.wtp.start_page("animal") @@ -177,6 +179,7 @@ def test_invalid_ipa(self, mock_node_to_wikitext): ) def test_no_column_headers(self, mock_node_to_wikitext): # https://fr.wiktionary.org/wiki/一万#Nom_commun + # template "zh-formes" page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})] node = TemplateNode(0) self.wxr.wtp.start_page("一万") @@ -208,3 +211,46 @@ def test_lt_décl_as(self, 
mock_node_to_wikitext): page_data[-1].get("forms"), [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}], ) + + @patch( + "wikitextprocessor.Wtp.node_to_wikitext", + return_value="""{|class="flextable flextable-fr-mfsp" + +|- +| class="invisible" | +! scope="col" | Singulier +! scope="col" | Pluriel +|- class="flextable-fr-m" +! scope="row" | Masculin +|colspan="2"| [[aastais]]
[[Annexe:Prononciation/français|\\a.a.stɛ\\]] + +|- class="flextable-fr-f" +! scope="row" | Féminin +| [[aastaise]]
[[Annexe:Prononciation/français|\\a.a.stɛz\\]] +| [[aastaises]]
[[Annexe:Prononciation/français|\\a.a.stɛz\\]] +|}""", + ) + def test_fr_accord_s(self, mock_node_to_wikitext): + # https://fr.wiktionary.org/wiki/aastais + page_data = [defaultdict(list, {"lang_code": "fr", "word": "aastais"})] + node = TemplateNode(0) + self.wxr.wtp.start_page("aastais") + extract_inflection(self.wxr, page_data, node) + self.assertEqual( + page_data[-1].get("forms"), + [ + { + "tags": ["Singulier", "Féminin"], + "form": "aastaise", + "ipa": "\\a.a.stɛz\\", + }, + { + "tags": ["Pluriel", "Féminin"], + "form": "aastaises", + "ipa": "\\a.a.stɛz\\", + }, + ], + ) diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py index 8709b39e..8e180949 100644 --- a/wiktextract/extractor/fr/inflection.py +++ b/wiktextract/extractor/fr/inflection.py @@ -63,7 +63,9 @@ def process_inflection_table( ] if row_num == 0: first_row_has_data_cell = any( - isinstance(cell, WikiNode) and cell.kind == NodeKind.TABLE_CELL + isinstance(cell, WikiNode) + and cell.kind == NodeKind.TABLE_CELL + and "invisible" not in cell.attrs.get("class", "") for cell in table_row_nodes )