Skip to content

Commit

Permalink
Merge branch 'master' of github.com:tatuylonen/wiktextract
Browse files Browse the repository at this point in the history
  • Loading branch information
kristian-clausal committed Jan 2, 2024
2 parents f9db0e9 + e0524ae commit 9bd7131
Show file tree
Hide file tree
Showing 18 changed files with 532 additions and 229 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
make coverage_report
make github_pages REPO=${{ github.repository }} SHA=${{ github.sha }}
if: github.ref_name == 'master' && matrix.python-version == '3.12'
- uses: actions/upload-pages-artifact@v2
- uses: actions/upload-pages-artifact@v3
if: github.ref_name == 'master' && matrix.python-version == '3.12'

deploy:
Expand All @@ -52,4 +52,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- id: deployment
uses: actions/deploy-pages@v3
uses: actions/deploy-pages@v4
16 changes: 16 additions & 0 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class WiktionaryConfig:
"ZH_PRON_TAGS",
"analyze_templates",
"extract_thesaurus_pages",
"save_ns_names",
"extract_ns_names",
)

def __init__(
Expand Down Expand Up @@ -135,6 +137,20 @@ def __init__(
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
# these namespace pages will be copied from the XML dump file and
# saved to a SQLite db file
self.save_ns_names = [
"Main",
"Category", # do we use this?
"Appendix",
"Project",
"Thesaurus",
"Module",
"Template",
"Reconstruction",
]
# these are extracted namespaces
self.extract_ns_names = ["Main", "Reconstruction"]
self.load_edition_settings()

def merge_return(self, ret: CollatedErrorReturnData):
Expand Down
4 changes: 3 additions & 1 deletion src/wiktextract/data/fr/config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
"extract_thesaurus_pages": false,
"save_ns_names": ["Main", "Template", "Module", "Conjugaison"],
"extract_ns_names": ["Main"]
}
20 changes: 7 additions & 13 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,47 +21,41 @@ def extract_gloss(
)
)
gloss_data = Sense()
gloss_start = 0
# process modifier, theme templates before gloss text
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
tag_indexes = set()
for index, gloss_node in enumerate(gloss_nodes):
if isinstance(gloss_node, TemplateNode):
categories_data = defaultdict(list)
expanded_text = clean_node(wxr, categories_data, gloss_node)
if expanded_text.startswith("(") and expanded_text.endswith(
")"
):
gloss_start = index + 1
tag = expanded_text.strip("() \n")
if len(tag) > 0:
gloss_data.tags.append(tag)
if "categories" in categories_data:
gloss_data.categories.extend(
categories_data["categories"]
)

gloss_only_nodes = []
tag_indexes = set()
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
tag_indexes.add(index)
# if an italic node is between parentheses then it's a tag, also
# don't add the parenthesis strings to `gloss_only_nodes`
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and index > gloss_start
elif (
isinstance(gloss_node, WikiNode)
and gloss_node.kind == NodeKind.ITALIC
and isinstance(gloss_nodes[index - 1], str)
and gloss_nodes[index - 1].strip() == "("
and index + 1 < len(gloss_nodes)
and isinstance(gloss_nodes[index + 1], str)
and gloss_nodes[index + 1].strip() == ")"
):
gloss_data.tags.append(clean_node(wxr, None, node))
gloss_data.tags.append(clean_node(wxr, None, gloss_node))
tag_indexes |= {index - 1, index, index + 1}
continue

gloss_only_nodes = [
node
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
for index, node in enumerate(gloss_nodes)
if index not in tag_indexes
]
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
Expand Down
39 changes: 38 additions & 1 deletion src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ def extract_inflection(
) -> None:
# inflection templates
# https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
process_inflection_table(wxr, page_data, template_node)
if template_node.template_name.startswith("en-adj"):
process_en_adj_table(wxr, page_data, template_node)
else:
process_inflection_table(wxr, page_data, template_node)


IGNORE_TABLE_HEADERS = frozenset(
Expand Down Expand Up @@ -192,3 +195,37 @@ def insert_ipa(form: Form, ipa_text: str) -> None:
if len(ipa_data) == 0:
return
form.ipas.extend(ipa_data)


def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    # https://fr.wiktionary.org/wiki/Modèle:en-adj
    # Handles "en-adj" and the other "en-adj*" templates. Their expanded
    # tables use plain data cells (not header cells) for the first column,
    # so the generic inflection-table walker cannot be used.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    tables = list(expanded_node.find_child(NodeKind.TABLE))
    if not tables:
        return
    for row_index, row in enumerate(tables[0].find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            # first row only labels the columns
            continue
        if len(row.children) <= 1:
            continue
        form_data = Form()
        # first cell is the row label (e.g. comparative/superlative tag)
        form_data.tags.append(clean_node(wxr, None, row.children[0]))
        # second cell holds the form itself plus optional IPA lines
        for cell_line in clean_node(wxr, None, row.children[1]).splitlines():
            if is_ipa_text(cell_line):
                insert_ipa(form_data, cell_line)
            else:
                form_data.form = cell_line
        # skip the row that merely repeats the page title
        if form_data.form != page_data[-1].word:
            page_data[-1].forms.append(form_data)
30 changes: 23 additions & 7 deletions src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
Expand Down Expand Up @@ -33,12 +35,14 @@ def process_derives_autres_list(
for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
lang_code = ""
lang_name = ""
for template_node in list_item.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "L":
lang_code = template_node.template_parameters.get(1)
lang_name = clean_node(wxr, None, template_node)
elif template_node.template_name == "lien":
word = clean_node(wxr, None, template_node)
for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
if isinstance(node, TemplateNode) and node.template_name == "L":
lang_code = node.template_parameters.get(1)
lang_name = clean_node(wxr, None, node)
elif node.kind == NodeKind.LINK or (
isinstance(node, TemplateNode) and node.template_name == "lien"
):
word = clean_node(wxr, None, node)
page_data[-1].derived.append(
Linkage(lang_code=lang_code, lang_name=lang_name, word=word)
)
Expand Down Expand Up @@ -66,9 +70,21 @@ def process_linkage_list(
sense_index_text = template_or_list_node.template_parameters.get(
2, "0"
)
if sense_index_text.isdigit():
if isinstance(sense_index_text, str) and sense_index_text.isdigit():
sense_index = int(sense_index_text)
continue
# sense could also be in ";" description list
if (
template_or_list_node.kind == NodeKind.LIST_ITEM
and template_or_list_node.sarg == ";"
):
sense_text = clean_node(wxr, None, template_or_list_node.children)
index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
m = re.search(index_pattern, sense_text)
if m is not None:
sense_text = re.sub(index_pattern, "", sense_text)
sense_index = int(m.group(1))
continue

linkage_data = Linkage()
if len(sense_text) > 0:
Expand Down
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class Translation(FrenchBaseModel):
lang_name: str = Field("", description="Translation language name")
word: str = Field("", description="Translation term")
sense: str = Field("", description="Translation gloss")
sense_index: int = Field(
0, ge=0, description="Number of the definition, start from 1"
)
tags: list[str] = []
roman: str = ""
traditional_writing: str = Field(
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def parse_section(
wxr.config.capture_translations
and section_type in wxr.config.OTHER_SUBTITLES["translations"]
):
extract_translation(wxr, page_data, level_node)
extract_translation(wxr, page_data, base_data, level_node)
elif (
wxr.config.capture_inflections
and section_type
Expand All @@ -114,7 +114,7 @@ def process_pos_block(
pos_title: str,
):
pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"]
if len(page_data) == 0 or "pos" not in page_data[-1].model_fields_set:
if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:
page_data.append(base_data.model_copy(deep=True))
page_data[-1].pos = pos_type
page_data[-1].pos_title = pos_title
Expand Down
19 changes: 13 additions & 6 deletions src/wiktextract/extractor/fr/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -15,11 +15,18 @@ def extract_pronunciation(
) -> None:
sound_data = []
lang_code = base_data.lang_code
for list_node in level_node.find_child(NodeKind.LIST):
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
sound_data.extend(
process_pron_list_item(wxr, list_item_node, Sound(), lang_code)
)
for node in level_node.find_child(NodeKind.LIST | LEVEL_KIND_FLAGS):
if node.kind == NodeKind.LIST:
for list_item_node in node.find_child(NodeKind.LIST_ITEM):
sound_data.extend(
process_pron_list_item(
wxr, list_item_node, Sound(), lang_code
)
)
else:
from .page import parse_section

parse_section(wxr, page_data, base_data, node)

if len(sound_data) == 0:
return
Expand Down
25 changes: 18 additions & 7 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
from typing import Optional

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Translation, WordEntry


def extract_translation(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node: WikiNode,
) -> None:
base_translation_data = Translation()
for level_node_child in level_node.filter_empty_str_child():
Expand Down Expand Up @@ -38,6 +41,10 @@ def extract_translation(
wxr, child_node, previous_node, page_data
)
previous_node = child_node
elif level_node_child.kind in LEVEL_KIND_FLAGS:
from .page import parse_section

parse_section(wxr, page_data, base_data, level_node_child)


def process_italic_node(
Expand Down Expand Up @@ -70,11 +77,13 @@ def process_translation_templates(
return
elif template_node.template_name == "trad-début":
# translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
sense_parameter = template_node.template_parameters.get(1)
if sense_parameter is not None:
sense_text = clean_node(wxr, None, sense_parameter)
if len(sense_text) > 0:
base_translation_data.sense = sense_text
sense_parameter = template_node.template_parameters.get(1, "")
sense_text = clean_node(wxr, None, sense_parameter)
base_translation_data.sense = sense_text
sense_index_str = template_node.template_parameters.get(2, "0")
if isinstance(sense_index_str, str) and sense_index_str.isdigit():
base_translation_data.sense_index = int(sense_index_str)

elif template_node.template_name == "T":
# Translation language: https://fr.wiktionary.org/wiki/Modèle:T
base_translation_data.lang_code = template_node.template_parameters.get(
Expand All @@ -85,6 +94,8 @@ def process_translation_templates(
)
elif template_node.template_name.startswith("trad"):
# Translation term: https://fr.wiktionary.org/wiki/Modèle:trad
if 2 not in template_node.template_parameters: # required parameter
return
translation_term = clean_node(
wxr,
None,
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def reprocess_wiktionary(
process_ns_ids = list(
{
wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)
for ns in ["Main", "Reconstruction"]
for ns in wxr.config.extract_ns_names
}
)
start_time = time.time()
Expand Down
16 changes: 2 additions & 14 deletions src/wiktextract/wiktwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,6 @@
from wiktextract.wiktionary import write_json_data
from wiktextract.wxr_context import WiktextractContext

# Pages within these namespaces are captured.
RECOGNIZED_NAMESPACE_NAMES = [
"Main",
"Category",
"Appendix",
"Project",
"Thesaurus",
"Module",
"Template",
"Reconstruction",
]


def process_single_page(
path_or_title: str,
Expand Down Expand Up @@ -440,8 +428,8 @@ def main():
try:
if args.path is not None:
namespace_ids = {
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
for name in RECOGNIZED_NAMESPACE_NAMES
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id", 0)
for name in wxr.config.save_ns_names
}
# Parse the normal full Wiktionary data dump
parse_wiktionary(
Expand Down
26 changes: 26 additions & 0 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,29 @@ def test_nest_gloss(self):
},
],
)

def test_sandwich_tag(self):
    """Tag templates before, inside, and after the gloss text are all
    collected into ``tags`` and stripped from the gloss string.

    The entry mixes leading tag templates ({{lexique}}, {{rare}}), a
    {{lien}} link whose expansion is part of the gloss, and a tag
    template sandwiched mid-sentence ({{absolument}}).
    """
    # https://fr.wiktionary.org/wiki/autrice#Nom_commun_4
    self.wxr.wtp.start_page("autrice")
    # register the templates the wikitext below expands; the tag
    # templates expand to parenthesized italics, {{lien}} to plain text
    self.wxr.wtp.add_page("Modèle:lexique", 10, "''(Littérature)''")
    self.wxr.wtp.add_page("Modèle:rare", 10, "''(Rare)''")
    self.wxr.wtp.add_page("Modèle:lien", 10, "Autrice")
    self.wxr.wtp.add_page("Modèle:absolument", 10, "''(Absolument)''")
    root = self.wxr.wtp.parse(
        "# {{lexique|littérature|nl}} {{rare|nl}} {{lien|autrice|fr|dif=Autrice}}, femme qui a créé une œuvre littéraire. {{absolument}} [[écrivaine|Écrivaine]]."
    )
    page_data = [
        WordEntry(word="autrice", lang_code="nl", lang_name="Néerlandais")
    ]
    extract_gloss(self.wxr, page_data, root.children[0])
    self.assertEqual(
        [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
        [
            {
                "glosses": [
                    "Autrice, femme qui a créé une œuvre littéraire. Écrivaine."
                ],
                "tags": ["Littérature", "Rare", "Absolument"],
            }
        ],
    )
Loading

0 comments on commit 9bd7131

Please sign in to comment.