Skip to content

Commit

Permalink
Merge pull request #928 from xxyzz/fr
Browse files Browse the repository at this point in the history
[fr] improve etymology and pos section code
  • Loading branch information
xxyzz authored Nov 28, 2024
2 parents 462713f + f3bc491 commit 5d8b946
Show file tree
Hide file tree
Showing 11 changed files with 98 additions and 44 deletions.
8 changes: 2 additions & 6 deletions src/wiktextract/extractor/fr/etymology.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Optional

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
Expand Down Expand Up @@ -34,11 +33,8 @@ def extract_etymology(
for node_index, node in level_node.find_child(
NodeKind.LIST | LEVEL_KIND_FLAGS, True
):
if node.kind in LEVEL_KIND_FLAGS:
if node.kind in LEVEL_KIND_FLAGS and node_index < level_node_index:
level_node_index = node_index
title_text = clean_node(wxr, None, node.largs)
if title_text == "Attestations historiques":
extract_etymology_examples(wxr, node, base_data)
elif node.kind == NodeKind.LIST:
for etymology_item in node.find_child(NodeKind.LIST_ITEM):
etymology_data = find_pos_in_etymology_list(wxr, etymology_item)
Expand Down Expand Up @@ -88,7 +84,7 @@ def extract_etymology(

def find_pos_in_etymology_list(
wxr: WiktextractContext, list_item_node: WikiNode
) -> Optional[tuple[str, str, str, list[str]]]:
) -> tuple[str, str, str, list[str]] | None:
"""
Return tuple of POS id, title, etymology text, categories if the passed
list item node starts with italic POS node or POS template, otherwise
Expand Down
20 changes: 13 additions & 7 deletions src/wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from typing import Union

from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
Expand All @@ -17,7 +15,7 @@
def extract_form_line(
wxr: WiktextractContext,
page_data: list[WordEntry],
nodes: list[Union[WikiNode, str]],
nodes: list[WikiNode | str],
) -> None:
"""
Ligne de forme
Expand All @@ -32,7 +30,7 @@ def extract_form_line(

pre_template_name = ""
for index, node in enumerate(nodes):
if isinstance(node, WikiNode) and node.kind == NodeKind.TEMPLATE:
if isinstance(node, TemplateNode):
if node.template_name in IGNORE_TEMPLATES:
continue
elif node.template_name in PRON_TEMPLATES:
Expand All @@ -56,6 +54,11 @@ def extract_form_line(
continue
elif node.template_name == "lien pronominal":
process_lien_pronominal(wxr, node, page_data)
elif node.template_name == "note":
note = clean_node(wxr, page_data[-1], nodes[index + 1 :])
if note != "":
page_data[-1].notes.append(note)
break
else:
raw_tag = clean_node(wxr, page_data[-1], node)
expanded_template = wxr.wtp.parse(
Expand Down Expand Up @@ -94,7 +97,7 @@ def extract_form_line(

def process_equiv_pour_template(
wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> None:
) -> list[Form]:
# equivalent form: https://fr.wiktionary.org/wiki/Modèle:équiv-pour
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
Expand All @@ -109,7 +112,7 @@ def process_equiv_pour_template(
"une fille": "feminine",
"une personne non-binaire": "neuter",
}

forms = []
for child in expanded_node.find_child(NodeKind.ITALIC | NodeKind.HTML):
if child.kind == NodeKind.ITALIC:
raw_gender_tag = clean_node(wxr, None, child).strip("() ")
Expand All @@ -127,7 +130,10 @@ def process_equiv_pour_template(
else:
form_data.raw_tags.append(raw_gender_tag)
if len(form_data.form) > 0:
page_data[-1].forms.append(form_data)
if len(page_data) > 0:
page_data[-1].forms.append(form_data)
forms.append(form_data)
return forms


def process_zh_mot_template(
Expand Down
30 changes: 21 additions & 9 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from collections import defaultdict
from typing import Optional, Union

from wikitextprocessor import NodeKind, TemplateNode, WikiNode

Expand All @@ -13,7 +13,7 @@ def extract_gloss(
wxr: WiktextractContext,
page_data: list[WordEntry],
list_node: WikiNode,
parent_sense: Optional[Sense] = None,
parent_sense: Sense | None = None,
) -> None:
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
gloss_nodes = list(
Expand All @@ -31,7 +31,10 @@ def extract_gloss(
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
tag_indexes = set()
for index, gloss_node in enumerate(gloss_nodes):
if isinstance(gloss_node, TemplateNode):
if (
isinstance(gloss_node, TemplateNode)
and gloss_node.template_name != "équiv-pour"
):
categories_data = defaultdict(list)
expanded_text = clean_node(wxr, categories_data, gloss_node)
if (
Expand Down Expand Up @@ -74,7 +77,7 @@ def extract_gloss(
):
note_index = index
gloss_text = find_alt_of_form(
wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
)
if "form-of" in page_data[-1].tags:
find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
Expand Down Expand Up @@ -140,7 +143,7 @@ def extract_examples(
def process_exemple_template(
wxr: WiktextractContext,
node: TemplateNode,
gloss_data: Optional[Sense],
gloss_data: Sense | None,
time: str = "",
) -> Example:
# https://fr.wiktionary.org/wiki/Modèle:exemple
Expand Down Expand Up @@ -176,13 +179,14 @@ def process_exemple_template(

def find_alt_of_form(
wxr: WiktextractContext,
gloss_nodes: list[Union[str, WikiNode]],
pos_type: str,
gloss_nodes: list[str | WikiNode],
word_entry: WordEntry,
gloss_data: Sense,
) -> str:
"""
Return gloss text, remove tag template expanded from "variante *" templates.
"""
from .form_line import process_equiv_pour_template

alt_of = ""
filtered_gloss_nodes = []
Expand Down Expand Up @@ -216,10 +220,17 @@ def find_alt_of_form(
gloss_data.raw_tags.append(raw_tag)
else:
filtered_gloss_nodes.append(node)
elif (
isinstance(gloss_node, TemplateNode)
and gloss_node.template_name == "équiv-pour"
):
for form_data in process_equiv_pour_template(wxr, gloss_node, []):
form_data.sense_index = len(word_entry.senses) + 1
word_entry.forms.append(form_data)
else:
filtered_gloss_nodes.append(gloss_node)

if alt_of == "" and pos_type == "typographic variant":
if alt_of == "" and word_entry.pos == "typographic variant":
for gloss_node in filter(
lambda n: isinstance(n, WikiNode), gloss_nodes
):
Expand All @@ -236,6 +247,7 @@ def find_alt_of_form(
gloss_data.alt_of.append(AltForm(word=alt_of))

gloss_text = clean_node(wxr, gloss_data, filtered_gloss_nodes)
gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
brackets = 0
for char in gloss_text:
if char == "(":
Expand All @@ -249,7 +261,7 @@ def find_alt_of_form(

def find_form_of_word(
wxr: WiktextractContext,
gloss_nodes: list[Union[str, WikiNode]],
gloss_nodes: list[str | WikiNode],
gloss_data: Sense,
) -> None:
# https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
Expand Down
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ def process_inflection_table(
form_data.raw_tags.extend(row_headers)
if form_data.form != "":
for form in form_data.form.splitlines():
if form.startswith("(") and form.endswith(")"):
form_data.raw_tags.append(form.strip("()"))
continue
new_form_data = form_data.model_copy(deep=True)
new_form_data.form = form.removeprefix("ou ")
translate_raw_tags(
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class Form(FrenchBaseModel):
)
hiragana: str = ""
roman: str = ""
sense_index: int = Field(default=0, ge=0)


class Sound(FrenchBaseModel):
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Optional
from typing import Any

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
Expand Down Expand Up @@ -41,7 +41,7 @@ def parse_section(
page_data: list[WordEntry],
base_data: WordEntry,
level_node: WikiNode,
) -> Optional[EtymologyData]:
) -> EtymologyData | None:
etymology_data = None
for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
if level_node_template.template_name == "S":
Expand Down Expand Up @@ -230,7 +230,7 @@ def parse_page(
pos="unknown",
categories=categories.get("categories", []),
)
etymology_data: Optional[EtymologyData] = None
etymology_data: EtymologyData | None = None
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
new_etymology_data = parse_section(
wxr, page_data, base_data, level3_node
Expand Down
18 changes: 8 additions & 10 deletions src/wiktextract/extractor/fr/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
# https://fr.wiktionary.org/wiki/Annexe:Glossaire_grammatical
# List of templates:
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles
from typing import Union

from .models import WordEntry

# https://en.wikipedia.org/wiki/Grammatical_gender
GENDER_TAGS: dict[str, Union[str, list[str]]] = {
GENDER_TAGS: dict[str, str | list[str]] = {
"commun": "common",
"féminin": "feminine",
"masculin": "masculine",
Expand All @@ -23,7 +21,7 @@
}

# https://en.wikipedia.org/wiki/Grammatical_number
NUMBER_TAGS: dict[str, Union[str, list[str]]] = {
NUMBER_TAGS: dict[str, str | list[str]] = {
"singulier": "singular",
"pluriel": "plural",
"duel": "dual",
Expand Down Expand Up @@ -51,7 +49,7 @@
"volitif": "volitive",
}

VERB_FORM_TAGS: dict[str, Union[str, list[str]]] = {
VERB_FORM_TAGS: dict[str, str | list[str]] = {
"participe": "participle",
"imparfait": "imperfect",
"infinitif": "infinitive",
Expand All @@ -62,7 +60,7 @@
}

# https://en.wikipedia.org/wiki/Grammatical_case
CASE_TAGS: dict[str, Union[str, list[str]]] = {
CASE_TAGS: dict[str, str | list[str]] = {
"ablatif": "ablative",
"accusatif": "accusative",
"accusatif génitif": ["accusative", "genitive"],
Expand All @@ -78,7 +76,7 @@
}

# https://en.wikipedia.org/wiki/Grammatical_tense
TENSE_TAGS: dict[str, Union[str, list[str]]] = {
TENSE_TAGS: dict[str, str | list[str]] = {
"présent": "present",
"passé": "past",
"passé simple": "past",
Expand All @@ -96,7 +94,7 @@
}

# https://en.wikipedia.org/wiki/Grammatical_person
PERSON_TAGS: dict[str, Union[str, list[str]]] = {
PERSON_TAGS: dict[str, str | list[str]] = {
"1ᵉ personne": "first-person",
"1ʳᵉ personne": "first-person",
"2ᵉ personne": "second-person",
Expand Down Expand Up @@ -216,7 +214,7 @@
}

# https://en.wikipedia.org/wiki/Voice_(grammar)
VOICE_TAGS: dict[str, Union[str, list[str]]] = {
VOICE_TAGS: dict[str, str | list[str]] = {
# https://fr.wiktionary.org/wiki/Modèle:eo-conj
"participe actif": ["participle", "active"],
"participe passif": ["participle", "passive"],
Expand Down Expand Up @@ -285,7 +283,7 @@
"imperfectif": "imperfective", # Modèle:imperfectif
}

GRAMMATICAL_TAGS: dict[str, Union[str, list[str]]] = {
GRAMMATICAL_TAGS: dict[str, str | list[str]] = {
**GENDER_TAGS,
**NUMBER_TAGS,
**MOOD_TAGS,
Expand Down
10 changes: 4 additions & 6 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from typing import Optional

from mediawiki_langcodes import code_to_name
from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode

Expand Down Expand Up @@ -59,8 +57,8 @@ def extract_translation(
def process_italic_node(
wxr: WiktextractContext,
italic_node: WikiNode,
previous_node: Optional[WikiNode],
translation_data: Optional[Translation],
previous_node: WikiNode | None,
translation_data: Translation | None,
) -> None:
# add italic text after a "trad" template as a tag
tag = clean_node(wxr, None, italic_node)
Expand All @@ -83,8 +81,8 @@ def process_translation_templates(
template_node: TemplateNode,
page_data: list[WordEntry],
base_translation_data: Translation,
translation_data: Optional[Translation],
) -> Optional[Translation]:
translation_data: Translation | None,
) -> Translation | None:
if template_node.template_name == "trad-fin":
# ignore translation end template
return
Expand Down
7 changes: 4 additions & 3 deletions tests/test_fr_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
insert_etymology_data,
)
from wiktextract.extractor.fr.models import WordEntry
from wiktextract.extractor.fr.page import parse_section
from wiktextract.wxr_context import WiktextractContext


Expand Down Expand Up @@ -293,7 +294,7 @@ def test_etymology_examples(self):
word_entry = WordEntry(
lang="Français", lang_code="fr", word="autrice", pos="noun"
)
extract_etymology(self.wxr, root, word_entry)
parse_section(self.wxr, [], word_entry, root.children[0])
data = word_entry.model_dump(exclude_defaults=True)
self.assertEqual(
data["etymology_examples"],
Expand Down Expand Up @@ -398,7 +399,7 @@ def test_etymology_examples_nested_lists(self):
word_entry = WordEntry(
lang="Français", lang_code="fr", word="drone", pos="noun"
)
extract_etymology(self.wxr, root, word_entry)
parse_section(self.wxr, [], word_entry, root.children[0])
data = word_entry.model_dump(exclude_defaults=True)
self.assertEqual(
data["etymology_examples"],
Expand All @@ -423,7 +424,7 @@ def test_etymology_examples_text(self):
word_entry = WordEntry(
lang="Français", lang_code="fr", word="préavertir", pos="verb"
)
extract_etymology(self.wxr, root, word_entry)
parse_section(self.wxr, [], word_entry, root.children[0])
data = word_entry.model_dump(exclude_defaults=True)
self.assertEqual(
data["etymology_examples"],
Expand Down
7 changes: 7 additions & 0 deletions tests/test_fr_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,10 @@ def test_lien_pronominal(self):
[f.model_dump(exclude_defaults=True) for f in page_data[-1].forms],
[{"form": "se définir", "tags": ["pronominal"]}],
)

def test_note(self):
    # Form line under test: the bold headword, a bare {{note}} template,
    # and trailing free text that should end up in the entry's notes.
    form_line_wikitext = "'''autaire''' {{note}} note"
    self.wxr.wtp.start_page("autaire")
    entries = [WordEntry(word="autaire", lang_code="fr", lang="Français")]
    parsed_root = self.wxr.wtp.parse(form_line_wikitext)
    # extract_form_line mutates the last entry of the list in place.
    extract_form_line(self.wxr, entries, parsed_root.children)
    last_entry = entries[-1]
    self.assertEqual(last_entry.notes, ["note"])
Loading

0 comments on commit 5d8b946

Please sign in to comment.