From 4f390a9db870081869ff34dae04111dd9d568ff3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Dec 2023 21:35:23 +0000 Subject: [PATCH 01/13] Bump actions/deploy-pages from 3 to 4 Bumps [actions/deploy-pages](https://github.com/actions/deploy-pages) from 3 to 4. - [Release notes](https://github.com/actions/deploy-pages/releases) - [Commits](https://github.com/actions/deploy-pages/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/deploy-pages dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 58c80adb9..73e9188e5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -52,4 +52,4 @@ jobs: runs-on: ubuntu-latest steps: - id: deployment - uses: actions/deploy-pages@v3 + uses: actions/deploy-pages@v4 From 67c3df5fe68740a9a71182bc6ffb3ff6f68992c5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Dec 2023 21:35:26 +0000 Subject: [PATCH 02/13] Bump actions/upload-pages-artifact from 2 to 3 Bumps [actions/upload-pages-artifact](https://github.com/actions/upload-pages-artifact) from 2 to 3. - [Release notes](https://github.com/actions/upload-pages-artifact/releases) - [Commits](https://github.com/actions/upload-pages-artifact/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/upload-pages-artifact dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 58c80adb9..c6399b253 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,7 +37,7 @@ jobs: make coverage_report make github_pages REPO=${{ github.repository }} SHA=${{ github.sha }} if: github.ref_name == 'master' && matrix.python-version == '3.12' - - uses: actions/upload-pages-artifact@v2 + - uses: actions/upload-pages-artifact@v3 if: github.ref_name == 'master' && matrix.python-version == '3.12' deploy: From b0d9346e2b43671e303429788dc15c5734a30e12 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 26 Dec 2023 11:48:05 +0800 Subject: [PATCH 03/13] Change find gloss tags strategy of French extractor Previous code assume tag nodes are before gloss text and discards texts before the last tag node. But gloss text could be between tag nodes. --- src/wiktextract/extractor/fr/gloss.py | 20 +++++++------------- tests/test_fr_gloss.py | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py index 8b69d5b97..cbaacce81 100644 --- a/src/wiktextract/extractor/fr/gloss.py +++ b/src/wiktextract/extractor/fr/gloss.py @@ -21,9 +21,9 @@ def extract_gloss( ) ) gloss_data = Sense() - gloss_start = 0 # process modifier, theme tempaltes before gloss text # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens + tag_indexes = set() for index, gloss_node in enumerate(gloss_nodes): if isinstance(gloss_node, TemplateNode): categories_data = defaultdict(list) @@ -31,7 +31,6 @@ def extract_gloss( if expanded_text.startswith("(") and expanded_text.endswith( ")" ): - gloss_start = index + 1 tag = expanded_text.strip("() \n") if len(tag) > 0: gloss_data.tags.append(tag) @@ -39,29 +38,24 @@ def extract_gloss( 
gloss_data.categories.extend( categories_data["categories"] ) - - gloss_only_nodes = [] - tag_indexes = set() - for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start): + tag_indexes.add(index) # if an italic node is between parentheses then it's a tag, also # don't add the parenthese strings to `gloss_only_nodes` - if ( - isinstance(node, WikiNode) - and node.kind == NodeKind.ITALIC - and index > gloss_start + elif ( + isinstance(gloss_node, WikiNode) + and gloss_node.kind == NodeKind.ITALIC and isinstance(gloss_nodes[index - 1], str) and gloss_nodes[index - 1].strip() == "(" and index + 1 < len(gloss_nodes) and isinstance(gloss_nodes[index + 1], str) and gloss_nodes[index + 1].strip() == ")" ): - gloss_data.tags.append(clean_node(wxr, None, node)) + gloss_data.tags.append(clean_node(wxr, None, gloss_node)) tag_indexes |= {index - 1, index, index + 1} - continue gloss_only_nodes = [ node - for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start) + for index, node in enumerate(gloss_nodes) if index not in tag_indexes ] gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes) diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index 573f20f0c..bf27f5d6d 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -280,3 +280,29 @@ def test_nest_gloss(self): }, ], ) + + def test_sandwich_tag(self): + # https://fr.wiktionary.org/wiki/autrice#Nom_commun_4 + self.wxr.wtp.start_page("autrice") + self.wxr.wtp.add_page("Modèle:lexique", 10, "''(Littérature)''") + self.wxr.wtp.add_page("Modèle:rare", 10, "''(Rare)''") + self.wxr.wtp.add_page("Modèle:lien", 10, "Autrice") + self.wxr.wtp.add_page("Modèle:absolument", 10, "''(Absolument)''") + root = self.wxr.wtp.parse( + "# {{lexique|littérature|nl}} {{rare|nl}} {{lien|autrice|fr|dif=Autrice}}, femme qui a créé une œuvre littéraire. {{absolument}} [[écrivaine|Écrivaine]]." 
+ ) + page_data = [ + WordEntry(word="autrice", lang_code="nl", lang_name="Néerlandais") + ] + extract_gloss(self.wxr, page_data, root.children[0]) + self.assertEqual( + [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses], + [ + { + "glosses": [ + "Autrice, femme qui a créé une œuvre littéraire. Écrivaine." + ], + "tags": ["Littérature", "Rare", "Absolument"] + } + ], + ) From cc9796b73e30b6ef8205a555415cb8446b8fcfb0 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 26 Dec 2023 13:03:35 +0800 Subject: [PATCH 04/13] Don't expand "trad*" templates that missing required parameter French Wiktionary's translation template requires two unnamed parameters for language code and translation term. But some page like "crise" only have one unnamed arg. Fix exception caused by `None` passed to `clean_node()` --- src/wiktextract/extractor/fr/translation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index 484b8fa07..086cfa111 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ b/src/wiktextract/extractor/fr/translation.py @@ -85,6 +85,8 @@ def process_translation_templates( ) elif template_node.template_name.startswith("trad"): # Translation term: https://fr.wiktionary.org/wiki/Modèle:trad + if 2 not in template_node.template_parameters: # required parameter + return translation_term = clean_node( wxr, None, From 12e4d8cb5be5b6805ed5c87c81b462c87ae65a7a Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 26 Dec 2023 16:11:08 +0800 Subject: [PATCH 05/13] Fix a bug that appends all senses to the same WordEntry object The code should append a new WordEntry object if the `pos` field is assigned in the last object of the `page_data` list. 
--- src/wiktextract/extractor/fr/page.py | 2 +- tests/test_fr_page.py | 81 ++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index 27704a00d..f1d06d641 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -114,7 +114,7 @@ def process_pos_block( pos_title: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"] - if len(page_data) == 0 or "pos" not in page_data[-1].model_fields_set: + if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set: page_data.append(base_data.model_copy(deep=True)) page_data[-1].pos = pos_type page_data[-1].pos_title = pos_title diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py index d94e6d11a..1c2a9b631 100644 --- a/tests/test_fr_page.py +++ b/tests/test_fr_page.py @@ -23,16 +23,44 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def test_fr_parse_page(self): - self.wxr.wtp.add_page("Modèle:langue", 10, "Français") - self.wxr.wtp.add_page("Modèle:S", 10, "Nom commun") + # https://fr.wiktionary.org/wiki/anthracite + self.wxr.wtp.add_page( + "Modèle:langue", + 10, + "{{#switch: {{{1}}} | fr = Français | en = Anglais }}", + ) + self.wxr.wtp.add_page( + "Modèle:S", + 10, + """{{#switch: {{{1}}} +| étymologie = Étymologie +| nom = Nom commun +| adjectif = Adjectif +}}""", + ) + self.wxr.wtp.add_page("Modèle:roches", 10, "''(Pétrographie)''") + self.wxr.wtp.add_page("Modèle:indénombrable", 10, "''(Indénombrable)''") + page_data = parse_page( self.wxr, - "exemple", - """ -== {{langue|fr}} == + "anthracite", + """== {{langue|fr}} == +=== {{S|étymologie}} === +: (1549) Du latin anthracites. + === {{S|nom|fr}} === -'''exemple''' -""", +# {{roches|fr}} [[variété|Variété]] de [[charbon de terre]], à [[reflet]] [[métallique]] et à [[combustion]] [[lent]]e. 
+ +=== {{S|adjectif|fr}} === +# De couleur anthracite, gris très foncé, du nom de la variété de charbon du même nom. + +== {{langue|en}} == + +=== {{S|étymologie}} === +: Du latin anthracites. + +=== {{S|nom|en}} === +# {{indénombrable|en}} [[anthracite#fr|Anthracite]].""", ) self.assertEqual( page_data, @@ -42,7 +70,42 @@ def test_fr_parse_page(self): "lang_code": "fr", "pos": "noun", "pos_title": "Nom commun", - "word": "exemple", - } + "word": "anthracite", + "senses": [ + { + "glosses": [ + "Variété de charbon de terre, à reflet métallique et à combustion lente." + ], + "tags": ["Pétrographie"], + } + ], + "etymology_texts": ["(1549) Du latin anthracites."], + }, + { + "lang_name": "Français", + "lang_code": "fr", + "pos": "adj", + "pos_title": "Adjectif", + "word": "anthracite", + "senses": [ + { + "glosses": [ + "De couleur anthracite, gris très foncé, du nom de la variété de charbon du même nom." + ] + } + ], + "etymology_texts": ["(1549) Du latin anthracites."], + }, + { + "lang_name": "Anglais", + "lang_code": "en", + "pos": "noun", + "pos_title": "Nom commun", + "word": "anthracite", + "senses": [ + {"glosses": ["Anthracite."], "tags": ["Indénombrable"]} + ], + "etymology_texts": ["Du latin anthracites."], + }, ], ) From 34712de767e9d5516f54080e677925adec9d4c43 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 27 Dec 2023 15:24:10 +0800 Subject: [PATCH 06/13] =?UTF-8?q?Extract=20links=20in=20"D=C3=A9riv=C3=A9s?= =?UTF-8?q?=20dans=20d=E2=80=99autres=20langues"=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Links also could be used in lists of this block. 
--- src/wiktextract/extractor/fr/linkage.py | 14 ++++++++------ tests/test_fr_linkage.py | 18 ++++++++++++++++-- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/wiktextract/extractor/fr/linkage.py b/src/wiktextract/extractor/fr/linkage.py index 81fdd981a..2494da816 100644 --- a/src/wiktextract/extractor/fr/linkage.py +++ b/src/wiktextract/extractor/fr/linkage.py @@ -33,12 +33,14 @@ def process_derives_autres_list( for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM): lang_code = "" lang_name = "" - for template_node in list_item.find_child(NodeKind.TEMPLATE): - if template_node.template_name == "L": - lang_code = template_node.template_parameters.get(1) - lang_name = clean_node(wxr, None, template_node) - elif template_node.template_name == "lien": - word = clean_node(wxr, None, template_node) + for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK): + if isinstance(node, TemplateNode) and node.template_name == "L": + lang_code = node.template_parameters.get(1) + lang_name = clean_node(wxr, None, node) + elif node.kind == NodeKind.LINK or ( + isinstance(node, TemplateNode) and node.template_name == "lien" + ): + word = clean_node(wxr, None, node) page_data[-1].derived.append( Linkage(lang_code=lang_code, lang_name=lang_name, word=word) ) diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py index da2aadb2e..bb2f57c3e 100644 --- a/tests/test_fr_linkage.py +++ b/tests/test_fr_linkage.py @@ -143,14 +143,23 @@ def test_sense(self): def test_derives_autres_langues_section(self): # https://fr.wiktionary.org/wiki/eau#Dérivés_dans_d’autres_langues + # https://fr.wiktionary.org/wiki/caligineux#Dérivés_dans_d’autres_langues self.wxr.wtp.add_page("Modèle:lien", 10, body="{{{1}}}") - self.wxr.wtp.add_page("Modèle:L", 10, body="Karipúna") + self.wxr.wtp.add_page( + "Modèle:L", + 10, + body="""{{#switch: {{{1}}} +| kmv = Karipúna +| en = Anglais +}}""", + ) page_data = [ WordEntry(word="test", lang_code="fr", 
lang_name="Français") ] self.wxr.wtp.start_page("eau") root = self.wxr.wtp.parse( - "* {{L|kmv}} : {{lien|dlo|kmv}}, {{lien|djilo|kmv}}" + """* {{L|kmv}} : {{lien|dlo|kmv}}, {{lien|djilo|kmv}} +* {{L|en}} : [[caliginous#en|caliginous]]""" ) extract_linkage(self.wxr, page_data, root, "dérivés autres langues") self.assertEqual( @@ -169,6 +178,11 @@ def test_derives_autres_langues_section(self): "lang_code": "kmv", "lang_name": "Karipúna", }, + { + "word": "caliginous", + "lang_code": "en", + "lang_name": "Anglais", + }, ], ) From c39c4eda32ed063cccca67526392d6f8dc07fb9b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 27 Dec 2023 16:22:00 +0800 Subject: [PATCH 07/13] Extract "Paronymes" section under the "Prononciation" section Other sections under "Prononciation" will also be processed now. --- src/wiktextract/extractor/fr/pronunciation.py | 19 ++++++++---- tests/test_fr_pronunciation.py | 31 +++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/wiktextract/extractor/fr/pronunciation.py b/src/wiktextract/extractor/fr/pronunciation.py index 61e934d28..39679636b 100644 --- a/src/wiktextract/extractor/fr/pronunciation.py +++ b/src/wiktextract/extractor/fr/pronunciation.py @@ -1,5 +1,5 @@ from wikitextprocessor import NodeKind, WikiNode -from wikitextprocessor.parser import TemplateNode +from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -15,11 +15,18 @@ def extract_pronunciation( ) -> None: sound_data = [] lang_code = base_data.lang_code - for list_node in level_node.find_child(NodeKind.LIST): - for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): - sound_data.extend( - process_pron_list_item(wxr, list_item_node, Sound(), lang_code) - ) + for node in level_node.find_child(NodeKind.LIST | LEVEL_KIND_FLAGS): + if node.kind == NodeKind.LIST: + for 
list_item_node in node.find_child(NodeKind.LIST_ITEM): + sound_data.extend( + process_pron_list_item( + wxr, list_item_node, Sound(), lang_code + ) + ) + else: + from .page import parse_section + + parse_section(wxr, page_data, base_data, node) if len(sound_data) == 0: return diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py index ff9063748..9b668507b 100644 --- a/tests/test_fr_pronunciation.py +++ b/tests/test_fr_pronunciation.py @@ -120,3 +120,34 @@ def test_no_ipa(self): "mp3_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/3/3f/LL-Q9027_(swe)-Moonhouse-mars.wav/LL-Q9027_(swe)-Moonhouse-mars.wav.mp3", }, ) + + def test_paronymes_subsection(self): + # https://fr.wiktionary.org/wiki/wagonnet + page_data = [] + self.wxr.wtp.add_page("Modèle:pron", 10, body="\\{{{1|}}}\\") + self.wxr.wtp.start_page("wagonnet") + root = self.wxr.wtp.parse( + """=== {{S|prononciation}} === +* {{pron|va.ɡɔ.nɛ|fr}} + +==== {{S|paronymes}} ==== +* [[wagonnée]] +* [[wagonnier]] +""" + ) + extract_pronunciation( + self.wxr, + page_data, + root.children[0], + WordEntry(word="wagonnet", lang_code="fr", lang_name="Français"), + ) + self.assertEqual( + page_data[0].model_dump(exclude_defaults=True), + { + "word": "wagonnet", + "lang_code": "fr", + "lang_name": "Français", + "paronyms": [{"word": "wagonnée"}, {"word": "wagonnier"}], + "sounds": [{"ipa": "\\va.ɡɔ.nɛ\\"}], + }, + ) From a22be0efa2f0299661f37c31cdde990ebfbc5edb Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 28 Dec 2023 10:04:47 +0800 Subject: [PATCH 08/13] Add "sense_index" field to the `Translation` class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Translation list start template `trad-début` is like the `(` template, it also has a sense index parameter. 
--- src/wiktextract/extractor/fr/models.py | 3 +++ src/wiktextract/extractor/fr/translation.py | 12 +++++++----- tests/test_fr_translation.py | 1 + 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py index c3040625e..4814b2fd6 100644 --- a/src/wiktextract/extractor/fr/models.py +++ b/src/wiktextract/extractor/fr/models.py @@ -48,6 +48,9 @@ class Translation(FrenchBaseModel): lang_name: str = Field("", description="Translation language name") word: str = Field("", description="Translation term") sense: str = Field("", description="Translation gloss") + sense_index: int = Field( + 0, ge=0, description="Number of the definition, start from 1" + ) tags: list[str] = [] roman: str = "" traditional_writing: str = Field( diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index 086cfa111..ef96dc000 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ b/src/wiktextract/extractor/fr/translation.py @@ -70,11 +70,13 @@ def process_translation_templates( return elif template_node.template_name == "trad-début": # translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début - sense_parameter = template_node.template_parameters.get(1) - if sense_parameter is not None: - sense_text = clean_node(wxr, None, sense_parameter) - if len(sense_text) > 0: - base_translation_data.sense = sense_text + sense_parameter = template_node.template_parameters.get(1, "") + sense_text = clean_node(wxr, None, sense_parameter) + base_translation_data.sense = sense_text + base_translation_data.sense_index = int( + template_node.template_parameters.get(2, "0") + ) + elif template_node.template_name == "T": # Translation language: https://fr.wiktionary.org/wiki/Modèle:T base_translation_data.lang_code = template_node.template_parameters.get( diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py index 6feaf6c6f..f8d978f80 100644 
--- a/tests/test_fr_translation.py +++ b/tests/test_fr_translation.py @@ -164,6 +164,7 @@ def test_template_sense_parameter(self): "lang_name": "Croate", "word": "masa", "sense": "(Finance)", + "sense_index": 12, }, ], }, From 98667b19867566946b3754eb51402b284c54b843 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 28 Dec 2023 10:46:15 +0800 Subject: [PATCH 09/13] Extract subsections under the translation section Translation lists without sense are in a level 5 node under the level 4 translation node. --- src/wiktextract/extractor/fr/page.py | 2 +- src/wiktextract/extractor/fr/translation.py | 11 +++- tests/test_fr_gloss.py | 2 +- tests/test_fr_inflection.py | 4 +- tests/test_fr_translation.py | 72 ++++++++++++++------- 5 files changed, 61 insertions(+), 30 deletions(-) diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index f1d06d641..bf92801e9 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -90,7 +90,7 @@ def parse_section( wxr.config.capture_translations and section_type in wxr.config.OTHER_SUBTITLES["translations"] ): - extract_translation(wxr, page_data, level_node) + extract_translation(wxr, page_data, base_data, level_node) elif ( wxr.config.capture_inflections and section_type diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index ef96dc000..11221d04a 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ b/src/wiktextract/extractor/fr/translation.py @@ -1,7 +1,7 @@ from typing import Optional from wikitextprocessor import NodeKind, WikiNode -from wikitextprocessor.parser import TemplateNode +from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -9,7 +9,10 @@ def extract_translation( - wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode + wxr: WiktextractContext, + page_data: 
list[WordEntry], + base_data: WordEntry, + level_node: WikiNode, ) -> None: base_translation_data = Translation() for level_node_child in level_node.filter_empty_str_child(): @@ -38,6 +41,10 @@ def extract_translation( wxr, child_node, previous_node, page_data ) previous_node = child_node + elif level_node_child.kind in LEVEL_KIND_FLAGS: + from .page import parse_section + + parse_section(wxr, page_data, base_data, level_node_child) def process_italic_node( diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index bf27f5d6d..015bb17f1 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -302,7 +302,7 @@ def test_sandwich_tag(self): "glosses": [ "Autrice, femme qui a créé une œuvre littéraire. Écrivaine." ], - "tags": ["Littérature", "Rare", "Absolument"] + "tags": ["Littérature", "Rare", "Absolument"], } ], ) diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py index 20e66eb8a..bd505a676 100644 --- a/tests/test_fr_inflection.py +++ b/tests/test_fr_inflection.py @@ -185,7 +185,9 @@ def test_invalid_ipa(self, mock_node_to_wikitext): def test_no_column_headers(self, mock_node_to_wikitext): # https://fr.wiktionary.org/wiki/一万#Nom_commun # template "zh-formes" - page_data = [WordEntry(word="一万", lang_code="zh", lang_name="Chinois")] + page_data = [ + WordEntry(word="一万", lang_code="zh", lang_name="Chinois") + ] node = TemplateNode(0) self.wxr.wtp.start_page("一万") extract_inflection(self.wxr, page_data, node) diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py index f8d978f80..bcf4a2f0b 100644 --- a/tests/test_fr_translation.py +++ b/tests/test_fr_translation.py @@ -23,10 +23,11 @@ def test_italic_tag(self): root = self.wxr.wtp.parse( "=== Traductions ===\n* {{trad-début|Formule pour saluer}}\n* {{T|sq}} : {{trad+|sq|mirëdita}}, {{trad-|sq|mirë mëngjes}} ''(le matin)''" ) - page_data = [ - WordEntry(word="bonjour", lang_code="fr", lang_name="Français") - ] - extract_translation(self.wxr, page_data, 
root.children[0]) + base_data = WordEntry( + word="bonjour", lang_code="fr", lang_name="Français" + ) + page_data = [base_data.model_copy(deep=True)] + extract_translation(self.wxr, page_data, base_data, root.children[0]) self.assertEqual( page_data[-1].model_dump(exclude_defaults=True), { @@ -59,10 +60,11 @@ def test_template_tag(self): root = self.wxr.wtp.parse( "=== Traductions ===\n* {{T|ar}} : {{trad+|ar|مرحبا|dif=مرحبًا|tr={{transliterator|ar|مرحبا}}}} {{informel|nocat=1}}" ) - page_data = [ - WordEntry(word="bonjour", lang_code="fr", lang_name="Français") - ] - extract_translation(self.wxr, page_data, root.children[0]) + base_data = WordEntry( + word="bonjour", lang_code="fr", lang_name="Français" + ) + page_data = [base_data.model_copy(deep=True)] + extract_translation(self.wxr, page_data, base_data, root.children[0]) self.assertEqual( page_data[-1].model_dump(exclude_defaults=True), { @@ -87,10 +89,11 @@ def test_traditional_writing(self): root = self.wxr.wtp.parse( "=== Traductions ===\n* {{T|mn}} : {{trad+|mn|сайн байна уу|tr=sain baina uu|tradi=ᠰᠠᠶᠢᠨ ᠪᠠᠶᠢᠨ᠎ᠠ ᠤᠤ}}" ) - page_data = [ - WordEntry(word="bonjour", lang_code="fr", lang_name="Français") - ] - extract_translation(self.wxr, page_data, root.children[0]) + base_data = WordEntry( + word="bonjour", lang_code="fr", lang_name="Français" + ) + page_data = [base_data.model_copy(deep=True)] + extract_translation(self.wxr, page_data, base_data, root.children[0]) self.assertEqual( page_data[-1].model_dump(exclude_defaults=True), { @@ -117,10 +120,11 @@ def test_trad_template_gender_parameter(self): root = self.wxr.wtp.parse( "=== Traductions ===\n* {{T|de}} : {{trad|de|Kambium|n}}" ) - page_data = [ - WordEntry(word="cambium", lang_code="fr", lang_name="Français") - ] - extract_translation(self.wxr, page_data, root.children[0]) + base_data = WordEntry( + word="cambium", lang_code="fr", lang_name="Français" + ) + page_data = [base_data.model_copy(deep=True)] + extract_translation(self.wxr, page_data, 
base_data, root.children[0]) self.assertEqual( page_data[-1].model_dump(exclude_defaults=True), { @@ -140,18 +144,31 @@ def test_trad_template_gender_parameter(self): def test_template_sense_parameter(self): self.wxr.wtp.start_page("masse") - self.wxr.wtp.add_page("Modèle:info lex", 10, body="(Finance)") - self.wxr.wtp.add_page("Modèle:T", 10, body="Croate") - self.wxr.wtp.add_page("Modèle:trad+", 10, body="masa") + self.wxr.wtp.add_page("Modèle:S", 10, "{{{1}}}") + self.wxr.wtp.add_page("Modèle:info lex", 10, "(Finance)") + self.wxr.wtp.add_page( + "Modèle:T", + 10, + """{{#switch: {{{1}}} +| hr = Croate +| af = Afrikaans +}}""", + ) + self.wxr.wtp.add_page("Modèle:trad+", 10, "masa") root = self.wxr.wtp.parse( - """=== Traductions === + """==== {{S|traductions}} ==== {{trad-début|{{info lex|finance}}|12}} -* {{T|hr}} : {{trad+|hr|masa}}""" +* {{T|hr}} : {{trad+|hr|masa}} +{{trad-fin}} + +===== {{S|traductions à trier}} ===== +* {{T|af|trier}} : {{trad+|af|massa}}""" + ) + base_data = WordEntry( + word="masse", lang_code="fr", lang_name="Français" ) - page_data = [ - WordEntry(word="masse", lang_code="fr", lang_name="Français") - ] - extract_translation(self.wxr, page_data, root.children[0]) + page_data = [base_data.model_copy(deep=True)] + extract_translation(self.wxr, page_data, base_data, root.children[0]) self.assertEqual( page_data[-1].model_dump(exclude_defaults=True), { @@ -166,6 +183,11 @@ def test_template_sense_parameter(self): "sense": "(Finance)", "sense_index": 12, }, + { + "lang_code": "af", + "lang_name": "Afrikaans", + "word": "massa", + }, ], }, ) From 54c733ed5594deb9816133708ad5c10876b7b87c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 28 Dec 2023 15:26:04 +0800 Subject: [PATCH 10/13] Extract sense text and index data from description list(`;`) --- src/wiktextract/extractor/fr/linkage.py | 14 ++++++++++++++ tests/test_fr_linkage.py | 12 +++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git 
a/src/wiktextract/extractor/fr/linkage.py b/src/wiktextract/extractor/fr/linkage.py index 2494da816..77eb516bb 100644 --- a/src/wiktextract/extractor/fr/linkage.py +++ b/src/wiktextract/extractor/fr/linkage.py @@ -1,3 +1,5 @@ +import re + from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node @@ -71,6 +73,18 @@ def process_linkage_list( if sense_index_text.isdigit(): sense_index = int(sense_index_text) continue + # sense could also be in ";" description list + if ( + template_or_list_node.kind == NodeKind.LIST_ITEM + and template_or_list_node.sarg == ";" + ): + sense_text = clean_node(wxr, None, template_or_list_node.children) + index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$" + m = re.search(index_pattern, sense_text) + if m is not None: + sense_text = re.sub(index_pattern, "", sense_text) + sense_index = int(m.group(1)) + continue linkage_data = Linkage() if len(sense_text) > 0: diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py index bb2f57c3e..d7efa94fa 100644 --- a/tests/test_fr_linkage.py +++ b/tests/test_fr_linkage.py @@ -117,13 +117,18 @@ def test_sub_list(self): ) def test_sense(self): + # https://fr.wiktionary.org/wiki/autrice + # https://fr.wiktionary.org/wiki/embouteillage page_data = [ - WordEntry(word="test", lang_code="fr", lang_name="Français") + WordEntry(word="autrice", lang_code="fr", lang_name="Français") ] self.wxr.wtp.start_page("autrice") root = self.wxr.wtp.parse( """{{(|Celle qui est à l’origine de quelque chose|1}} * [[artisane]] + +; Mise en bouteille (sens 1) +* [[bouchonnerie]] """ ) extract_linkage(self.wxr, page_data, root, "synonymes") @@ -138,6 +143,11 @@ def test_sense(self): "sense": "Celle qui est à l’origine de quelque chose", "sense_index": 1, }, + { + "word": "bouchonnerie", + "sense": "Mise en bouteille", + "sense_index": 1, + }, ], ) From c994da3fd4e58c94083cd65c244a3841bc2f7c26 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Dec 
2023 14:46:48 +0800 Subject: [PATCH 11/13] Extract "en-adj" inflection tables in French Wiktionary These templates use the data wikitext for column header --- src/wiktextract/extractor/fr/inflection.py | 39 ++- tests/test_fr_inflection.py | 340 ++++++++++++--------- 2 files changed, 238 insertions(+), 141 deletions(-) diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py index f21d81f75..a9a1220ca 100644 --- a/src/wiktextract/extractor/fr/inflection.py +++ b/src/wiktextract/extractor/fr/inflection.py @@ -16,7 +16,10 @@ def extract_inflection( ) -> None: # inflection templates # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français - process_inflection_table(wxr, page_data, template_node) + if template_node.template_name.startswith("en-adj"): + process_en_adj_table(wxr, page_data, template_node) + else: + process_inflection_table(wxr, page_data, template_node) IGNORE_TABLE_HEADERS = frozenset( @@ -192,3 +195,37 @@ def insert_ipa(form: Form, ipa_text: str) -> None: if len(ipa_data) == 0: return form.ipas.extend(ipa_data) + + +def process_en_adj_table( + wxr: WiktextractContext, + page_data: list[WordEntry], + template_node: WikiNode, +) -> None: + # https://fr.wiktionary.org/wiki/Modèle:en-adj + # and other en-adj* templates + # these templates use normal table cell for column table header + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(template_node), expand_all=True + ) + table_nodes = list(expanded_node.find_child(NodeKind.TABLE)) + if len(table_nodes) == 0: + return + table_node = table_nodes[0] + for row_num, table_row in enumerate( + table_node.find_child(NodeKind.TABLE_ROW) + ): + if row_num == 0: + # skip header + continue + if len(table_row.children) > 1: + form_data = Form() + form_data.tags.append(clean_node(wxr, None, table_row.children[0])) + form_text = clean_node(wxr, None, table_row.children[1]) + for form_line in form_text.splitlines(): + if is_ipa_text(form_line): + 
insert_ipa(form_data, form_line) + else: + form_data.form = form_line + if form_data.form != page_data[-1].word: + page_data[-1].forms.append(form_data) diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py index bd505a676..e49d1a682 100644 --- a/tests/test_fr_inflection.py +++ b/tests/test_fr_inflection.py @@ -1,8 +1,6 @@ from unittest import TestCase -from unittest.mock import patch from wikitextprocessor import Wtp -from wikitextprocessor.parser import TemplateNode from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.inflection import extract_inflection from wiktextract.extractor.fr.models import WordEntry @@ -18,9 +16,14 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value=""" + def test_fr_reg(self): + page_data = [ + WordEntry(word="productrice", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:fr-rég", + 10, + """ {| ! Singulier !! 
Pluriel |- @@ -28,24 +31,25 @@ def tearDown(self) -> None: | [[productrices#fr|productrices]] |- |[[Annexe:Prononciation/français|\\pʁɔ.dyk.tʁis\\]] -|} - """, - ) - def test_fr_reg(self, mock_node_to_wikitext): - page_data = [ - WordEntry(word="productrice", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) +|}""", + ) self.wxr.wtp.start_page("productrice") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{fr-rég|pʁɔ.dyk.tʁis}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"form": "productrices", "tags": ["Pluriel"]}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class="flextable flextable-fr-mfsp" + def test_fr_accord_al(self): + # https://fr.wiktionary.org/wiki/animal#Adjectif + page_data = [ + WordEntry(word="animal", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:fr-accord-al", + 10, + """{|class="flextable flextable-fr-mfsp" |- !scope="col"| Singulier !scope="col"| Pluriel @@ -58,15 +62,10 @@ def test_fr_reg(self, mock_node_to_wikitext): |[[animale]]
[[Annexe:Prononciation/français|\\a.ni.mal\\]] |[[animales]]
[[Annexe:Prononciation/français|\\a.ni.mal\\]] |}""", - ) - def test_fr_accord_al(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/animal#Adjectif - page_data = [ - WordEntry(word="animal", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("animal") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{fr-accord-al|a.ni.m}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -88,24 +87,25 @@ def test_fr_accord_al(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class='flextable flextable-en' -! Singulier !! Pluriel -|- -| '''ration'''
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ən\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ən\\]] -| [[rations#en-flex-nom|rations]]
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ənz\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ənz\\]] -|}""", - ) - def test_multiple_lines_ipa(self, mock_node_to_wikitext): + def test_multiple_lines_ipa(self): # https://fr.wiktionary.org/wiki/ration#Nom_commun_2 # template "en-nom-rég" page_data = [ WordEntry(word="ration", lang_code="en", lang_name="Anglais") ] - node = TemplateNode(0) + self.wxr.wtp.add_page( + "Modèle:en-nom-rég", + 10, + """{| class='flextable flextable-en' +! Singulier !! Pluriel +|- +| '''ration'''
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ən\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ən\\]] +| [[rations#en-flex-nom|rations]]
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ənz\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ənz\\]] +|}""", + ) self.wxr.wtp.start_page("ration") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{en-nom-rég|ˈɹæʃ.ən|ˈɹeɪʃ.ən}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -117,25 +117,28 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class='flextable' + def test_single_line_multiple_ipa(self): + # https://fr.wiktionary.org/wiki/ration#Verbe + # template "en-conj-rég" + page_data = [ + WordEntry(word="ration", lang_code="en", lang_name="Anglais") + ] + self.wxr.wtp.add_page( + "Modèle:en-conj-rég", + 10, + """{|class='flextable' ! Temps ! Forme |- ! Infinitif | to '''ration'''
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ən\\]] ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ən\\]] |}""", - ) - def test_single_line_multiple_ipa(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/ration#Verbe - # template "en-conj-rég" - page_data = [ - WordEntry(word="ration", lang_code="en", lang_name="Anglais") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("ration") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + "{{en-conj-rég|inf.pron=ˈɹæʃ.ən|inf.pron2=ˈɹeɪʃ.ən}}" + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -147,33 +150,42 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| + def test_invalid_ipa(self): + # https://fr.wiktionary.org/wiki/animal#Nom_commun_3 + page_data = [ + WordEntry(word="animal", lang_code="en", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:ast-accord-mf", + 10, + """{| ! '''Singulier''' ! '''Pluriel''' |- | [[animal]]
\\[//fr.wiktionary.org/w/index.php?title=ration&action=edit Prononciation ?]\\
| [[animales]]
\\[//fr.wiktionary.org/w/index.php?title=ration&action=edit Prononciation ?]\\
|}""", - ) - def test_invalid_ipa(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/animal#Nom_commun_3 - # template "ast-accord-mf" - page_data = [ - WordEntry(word="animal", lang_code="en", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("animal") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + "{{ast-accord-mf|s=animal|ps=|p=animales|pp=}}" + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"tags": ["Pluriel"], "form": "animales"}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_no_column_headers(self): + # https://fr.wiktionary.org/wiki/一万#Nom_commun + # template "zh-formes" + page_data = [ + WordEntry(word="一万", lang_code="zh", lang_name="Chinois") + ] + self.wxr.wtp.add_page( + "Modèle:zh-formes", + 10, + """{| class="flextable" |- ! Simplifié | [[一万#zh|一万]] @@ -181,24 +193,24 @@ def test_invalid_ipa(self, mock_node_to_wikitext): ! 
Traditionnel | [[一萬#zh|一萬]] |}""", - ) - def test_no_column_headers(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/一万#Nom_commun - # template "zh-formes" - page_data = [ - WordEntry(word="一万", lang_code="zh", lang_name="Chinois") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("一万") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{zh-formes|一万|一萬}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"tags": ["Traditionnel"], "form": "一萬"}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_lt_décl_as(self): + # empty table cells should be ignored + page_data = [ + WordEntry(word="abadai", lang_code="lt", lang_name="Lituanien") + ] + self.wxr.wtp.add_page( + "Modèle:lt-décl-as", + 10, + """{| class="flextable" !Cas ! Singulier ! Pluriel @@ -207,23 +219,23 @@ def test_no_column_headers(self, mock_node_to_wikitext): || [[abadas#lt|abadas]] || '''abadai''' |}""", - ) - def test_lt_décl_as(self, mock_node_to_wikitext): - # empty table cells should be ignored - page_data = [ - WordEntry(word="abadai", lang_code="lt", lang_name="Lituanien") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("abadai") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{lt-décl-as|abad}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class="flextable flextable-fr-mfsp" + def test_fr_accord_s(self): + page_data = [ + WordEntry(word="aastais", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:fr-accord-s", + 10, + """{|class="flextable flextable-fr-mfsp" |- | class="invisible" | @@ -241,14 
+253,10 @@ def test_lt_décl_as(self, mock_node_to_wikitext): | [[aastaises]]
[[Annexe:Prononciation/français|\\a.a.stɛz\\]] |}""", - ) - def test_fr_accord_s(self, mock_node_to_wikitext): - page_data = [ - WordEntry(word="aastais", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("aastais") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{fr-accord-s|a.a.stɛ|ms=aastais}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -265,9 +273,17 @@ def test_fr_accord_s(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_fr_accord_personne(self): + # https://fr.wiktionary.org/wiki/enculé_de_ta_race + page_data = [ + WordEntry( + word="enculé de ta race", lang_code="fr", lang_name="Français" + ) + ] + self.wxr.wtp.add_page( + "Modèle:fr-accord-personne", + 10, + """{| class="flextable" | colspan="2" | ! Singulier !! Pluriel |- @@ -280,17 +296,12 @@ def test_fr_accord_s(self, mock_node_to_wikitext): | [[enculée de ma race]]
[[Annexe:Prononciation/français|\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\]] | [[enculées de notre race]]
[[Annexe:Prononciation/français|\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\]] |}""", - ) - def test_fr_accord_personne(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/enculé_de_ta_race - page_data = [ - WordEntry( - word="enculé de ta race", lang_code="fr", lang_name="Français" - ) - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("enculé de ta race") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + "{{fr-accord-personne|1ms = enculé de ma race}}" + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -317,9 +328,15 @@ def test_fr_accord_personne(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_ro_nom_tab(self): + # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4 + page_data = [ + WordEntry(word="fenil", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:ro-nom-tab", + 10, + """{| class="flextable" ! masculin ! colspan=2 | Singulier ! 
colspan=2 | Pluriel @@ -336,15 +353,17 @@ def test_fr_accord_personne(self, mock_node_to_wikitext): | colspan=2| [[fenilule#ro-nom|fenilule]] | colspan=2| [[fenililor#ro-nom|fenililor]] |}""", - ) - def test_ro_nom_tab(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4 - page_data = [ - WordEntry(word="fenil", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("fenil") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + """{{ro-nom-tab|gen=masculin +|ns=fenil |np=fenili +|as=fenilul |ap=fenilii +|ds=fenilului |dp=fenililor +|vs=fenilule |vp=fenililor +}}""", + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -365,9 +384,15 @@ def test_ro_nom_tab(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable flextable-sv" + def test_sv_nom_c_ar(self): + # https://fr.wiktionary.org/wiki/robot#Nom_commun_7 + page_data = [ + WordEntry(word="robot", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:sv-nom-c-ar", + 10, + """{| class="flextable flextable-sv" ! class="invisible" | |- ! 
Commun @@ -382,15 +407,10 @@ def test_ro_nom_tab(self, mock_node_to_wikitext): | class="plur-indef" |[[robotar#sv|robotar]] | class="plur-def" |[[robotarna#sv|robotarna]] |}""", - ) - def test_sv_nom_c_ar(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/robot#Nom_commun_7 - page_data = [ - WordEntry(word="robot", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("robot") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{sv-nom-c-ar}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -400,9 +420,15 @@ def test_sv_nom_c_ar(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class="flextable" + def test_cs_decl_nom_ma_dur(self): + # https://fr.wiktionary.org/wiki/robot#Nom_commun_1_2 + page_data = [ + WordEntry(word="robot", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:cs-décl-nom-ma-dur", + 10, + """{|class="flextable" |- !scope="col"| Cas !scope="col"| Singulier @@ -412,15 +438,10 @@ def test_sv_nom_c_ar(self, mock_node_to_wikitext): | [[robot#cs-nom|robot''' ''']] | [[roboti#cs-flex-nom|robot'''i ''']]
''ou'' [[robotové#cs-flex-nom|robot'''ové ''']] |}""", - ) - def test_cs_decl_nom_ma_dur(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/robot#Nom_commun_1_2 - page_data = [ - WordEntry(word="robot", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("robot") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{cs-décl-nom-ma-dur|rad=robot}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -428,3 +449,42 @@ def test_cs_decl_nom_ma_dur(self, mock_node_to_wikitext): {"form": "robotové", "tags": ["Pluriel", "Nominatif"]}, ], ) + + def test_en_adj(self): + # https://fr.wiktionary.org/wiki/new + page_data = [WordEntry(word="new", lang_code="en", lang_name="Anglais")] + self.wxr.wtp.start_page("new") + root = self.wxr.wtp.parse("{{en-adj-er|pron=ˈnu|pronGB=ˈnjuː}}") + self.wxr.wtp.add_page( + "Modèle:en-adj-er", + 10, + """{| class="flextable" +! Nature +! Forme +|- +| class="titre" | Positif +| '''new'''
[[Annexe:Prononciation/anglais|\\ˈnu\\]] ou [[Annexe:Prononciation/anglais|\\ˈnjuː\\]] +|- +| class="titre" | Comparatif +| [[newer#en|newer]]
[[Annexe:Prononciation/anglais|\\ˈnu.ɚ\\]] ou [[Annexe:Prononciation/anglais|\\ˈnjuː.ə\\]] +|- +| class="titre" | Superlatif +| [[newest#en|newest]]
[[Annexe:Prononciation/anglais|\\ˈnu.ɪst\\]] ou [[Annexe:Prononciation/anglais|\\ˈnjuː.ɪst\\]] +|}""", + ) + extract_inflection(self.wxr, page_data, root.children[0]) + self.assertEqual( + [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], + [ + { + "form": "newer", + "tags": ["Comparatif"], + "ipas": ["\\ˈnu.ɚ\\", "\\ˈnjuː.ə\\"], + }, + { + "form": "newest", + "tags": ["Superlatif"], + "ipas": ["\\ˈnu.ɪst\\", "\\ˈnjuː.ɪst\\"], + }, + ], + ) From 98c8a72fcd832f16f9cda8fd62c359f8f2b94a0d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Dec 2023 17:22:35 +0800 Subject: [PATCH 12/13] Add two new edition configuration options French Wiktionary has a unique "Conjugaison" namespace that contains conjugation tables. The new options allow each edition to save and process pages according to their needs. I don't know whether we need the "Category" namespace. There are 845761 category pages in the English dump file. The first stage will be faster and the db file size will be smaller if we don't save this namespace. --- src/wiktextract/config.py | 16 ++++++++++++++++ src/wiktextract/data/fr/config.json | 4 +++- src/wiktextract/wiktionary.py | 2 +- src/wiktextract/wiktwords.py | 16 ++-------------- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 1680902aa..0f75767d1 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -50,6 +50,8 @@ class WiktionaryConfig: "ZH_PRON_TAGS", "analyze_templates", "extract_thesaurus_pages", + "save_ns_names", + "extract_ns_names", ) def __init__( @@ -111,6 +113,20 @@ def __init__( self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json") self.analyze_templates = True # find templates that need pre-expand self.extract_thesaurus_pages = True + # these namespace pages will be copied from the XML dump file and + # saved to a SQLite db file + self.save_ns_names = [ + "Main", + "Category", # do we use this? 
+ "Appendix", + "Project", + "Thesaurus", + "Module", + "Template", + "Reconstruction", + ] + # these are extracted namespaces + self.extract_ns_names = ["Main", "Reconstruction"] self.load_edition_settings() def merge_return(self, ret: CollatedErrorReturnData): diff --git a/src/wiktextract/data/fr/config.json b/src/wiktextract/data/fr/config.json index 91a7ba446..eb4e717c6 100644 --- a/src/wiktextract/data/fr/config.json +++ b/src/wiktextract/data/fr/config.json @@ -1,4 +1,6 @@ { "analyze_templates": false, - "extract_thesaurus_pages": false + "extract_thesaurus_pages": false, + "save_ns_names": ["Main", "Template", "Module", "Conjugaison"], + "extract_ns_names": ["Main"] } diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py index 6b6e93138..6e3c68789 100644 --- a/src/wiktextract/wiktionary.py +++ b/src/wiktextract/wiktionary.py @@ -184,7 +184,7 @@ def reprocess_wiktionary( process_ns_ids = list( { wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0) - for ns in ["Main", "Reconstruction"] + for ns in wxr.config.extract_ns_names } ) start_time = time.time() diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 6a896d178..ef1702b49 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -45,18 +45,6 @@ from wiktextract.wiktionary import write_json_data from wiktextract.wxr_context import WiktextractContext -# Pages within these namespaces are captured. 
-RECOGNIZED_NAMESPACE_NAMES = [ - "Main", - "Category", - "Appendix", - "Project", - "Thesaurus", - "Module", - "Template", - "Reconstruction", -] - def process_single_page( path_or_title: str, @@ -438,8 +426,8 @@ def main(): try: if args.path is not None: namespace_ids = { - wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id") - for name in RECOGNIZED_NAMESPACE_NAMES + wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id", 0) + for name in wxr.config.save_ns_names } # Parse the normal full Wiktionary data dump parse_wiktionary( From 2e978d52172a666fc3812fc207d127be40ec99d6 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Dec 2023 17:58:45 +0800 Subject: [PATCH 13/13] Fix `ValueError` exception in some French Wiktionary pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some pages pass non-integer to the "trad-début" template, for example: page "près" uses "{{trad-début|À petite distance ou à peu de temps.|1, 2}}" page "mousse" has "{{trad-début|matériau alvéolaire|3a}}" page "déchirer" has "{{trad-début|Rompre en tirant avec force un côté|1 {{trans|nocat=1}}}}" --- src/wiktextract/extractor/fr/linkage.py | 2 +- src/wiktextract/extractor/fr/translation.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wiktextract/extractor/fr/linkage.py b/src/wiktextract/extractor/fr/linkage.py index 77eb516bb..55e8e390d 100644 --- a/src/wiktextract/extractor/fr/linkage.py +++ b/src/wiktextract/extractor/fr/linkage.py @@ -70,7 +70,7 @@ def process_linkage_list( sense_index_text = template_or_list_node.template_parameters.get( 2, "0" ) - if sense_index_text.isdigit(): + if isinstance(sense_index_text, str) and sense_index_text.isdigit(): sense_index = int(sense_index_text) continue # sense could also be in ";" description list diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index 11221d04a..cdb65980f 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ 
b/src/wiktextract/extractor/fr/translation.py @@ -80,9 +80,9 @@ def process_translation_templates( sense_parameter = template_node.template_parameters.get(1, "") sense_text = clean_node(wxr, None, sense_parameter) base_translation_data.sense = sense_text - base_translation_data.sense_index = int( - template_node.template_parameters.get(2, "0") - ) + sense_index_str = template_node.template_parameters.get(2, "0") + if isinstance(sense_index_str, str) and sense_index_str.isdigit(): + base_translation_data.sense_index = int(sense_index_str) elif template_node.template_name == "T": # Translation language: https://fr.wiktionary.org/wiki/Modèle:T