Skip to content

Commit

Permalink
Merge pull request #496 from xxyzz/fr
Browse files Browse the repository at this point in the history
Update fr edition extractor
  • Loading branch information
xxyzz authored Feb 7, 2024
2 parents b169749 + 902774a commit 5c4c72d
Show file tree
Hide file tree
Showing 11 changed files with 115 additions and 14 deletions.
1 change: 1 addition & 0 deletions src/wiktextract/data/fr/linkage_subtitles.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"abrév": "abbreviation",
"abréviations": "abbreviation",
"antonymes": "antonyms",
"app": "related",
"apparentés": "related",
"apr": "related",
Expand Down
15 changes: 15 additions & 0 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def extract_gloss(
find_alt_of_form(
wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
)
if "form-of" in page_data[-1].tags:
find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
gloss_text = clean_node(
wxr, gloss_data, gloss_only_nodes[:note_index]
).strip(" ()")
Expand Down Expand Up @@ -192,3 +194,16 @@ def find_alt_of_form(
alt_of = clean_node(wxr, None, link)
if len(alt_of) > 0:
gloss_data.alt_of.append(AltForm(word=alt_of))


def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    gloss_data: Sense,
) -> None:
    """Record the word targeted by the last link of a gloss as its base form.

    Every link node found directly in ``gloss_nodes`` is converted to plain
    text with ``clean_node``; only the text of the final link is kept. When
    that text is non-empty it is appended to ``gloss_data.form_of`` wrapped
    in an ``AltForm``. Nothing is returned.
    """
    link_texts = [
        clean_node(wxr, None, child)
        for child in gloss_nodes
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK
    ]
    # Later links override earlier ones, so only the last one matters.
    base_word = link_texts[-1] if link_texts else ""
    if base_word:
        gloss_data.form_of.append(AltForm(word=base_word))
11 changes: 10 additions & 1 deletion src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ def extract_inflection(
"commun", # sv-nom-c-ar
}
)
# Prefixes of table header text to ignore: a header cell whose lower-cased
# text starts with any of these is skipped while processing an inflection
# table. Kept as a tuple so it can be passed directly to `str.startswith`.
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe",  # Modèle:fr-verbe-flexion
)
IGNORE_TABLE_CELL = frozenset(
{
"Déclinaisons", # de-adj
Expand Down Expand Up @@ -108,7 +111,13 @@ def process_inflection_table(
table_header_text = clean_node(
wxr, None, table_cell
).replace("\n", " ")
if table_header_text.lower() in IGNORE_TABLE_HEADERS:
if (
table_header_text.lower() in IGNORE_TABLE_HEADERS
or table_header_text.lower().startswith(
IGNORE_TABLE_HEADER_PREFIXES
)
or len(table_header_text.strip()) == 0
):
continue
if not current_row_has_data_cell:
# if all cells of the row are header cells
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class Sense(FrenchBaseModel):
examples: list[Example] = []
note: str = ""
alt_of: list[AltForm] = []
form_of: list[AltForm] = []


class WordEntry(FrenchBaseModel):
Expand All @@ -104,6 +105,7 @@ class WordEntry(FrenchBaseModel):
forms: list[Form] = Field([], description="Inflection forms list")
sounds: list[Sound] = []
translations: list[Translation] = []
antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
hyponyms: list[Linkage] = []
hypernyms: list[Linkage] = []
Expand Down
10 changes: 8 additions & 2 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Any, Optional

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -109,7 +109,7 @@ def process_pos_block(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
pos_title_node: TemplateNode,
pos_title_node: WikiNode,
pos_argument: str,
pos_title: str,
):
Expand All @@ -120,6 +120,12 @@ def process_pos_block(
page_data[-1].pos = pos_type
page_data[-1].pos_title = pos_title
page_data[-1].tags.extend(pos_data.get("tags", []))
for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
if (
level_node_template.template_name == "S"
and level_node_template.template_parameters.get(3) == "flexion"
):
page_data[-1].tags.append("form-of")
child_nodes = list(pos_title_node.filter_empty_str_child())
form_line_start = 0 # Ligne de forme
gloss_start = len(child_nodes)
Expand Down
7 changes: 5 additions & 2 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ def process_italic_node(
and previous_node.template_name.startswith("trad")
and len(page_data[-1].translations) > 0
):
page_data[-1].translations[-1].tags.append(tag.strip("()"))
tag = tag.strip("()")
if len(tag) > 0:
page_data[-1].translations[-1].tags.append(tag)


def process_translation_templates(
Expand Down Expand Up @@ -143,4 +145,5 @@ def process_translation_templates(
page_data[-1].translations.append(translation_data)
elif len(page_data[-1].translations) > 0:
tag = clean_node(wxr, None, template_node).strip("()")
page_data[-1].translations[-1].tags.append(tag)
if len(tag) > 0:
page_data[-1].translations[-1].tags.append(tag)
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/share.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
if len(not_captured) > 0:
rest_parts.append(not_captured)
last_group_end = m.end()
capture_text_list.append(m.group()[1:-1])
text = m.group()[1:-1].strip()
if len(text) > 0:
capture_text_list.append(text)

rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
return capture_text_list, rest_text
Expand Down
49 changes: 47 additions & 2 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,20 @@
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.gloss import extract_gloss
from wiktextract.extractor.fr.models import WordEntry
from wiktextract.extractor.fr.page import process_pos_block
from wiktextract.extractor.fr.page import parse_page, process_pos_block
from wiktextract.wxr_context import WiktextractContext


class TestFrGloss(TestCase):
maxDiff = None

def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
Wtp(lang_code="fr"),
WiktionaryConfig(
dump_file_lang_code="fr",
capture_language_codes=None,
),
)

def tearDown(self) -> None:
Expand Down Expand Up @@ -392,3 +398,42 @@ def test_variante_de_dif(self):
}
],
)

def test_form_of(self):
    """A POS section marked "flexion" gets "form-of" data on entry and senses.

    Parses a page whose POS title template is ``{{S|verbe|fr|flexion}}`` and
    checks that the entry carries the "form-of" tag and that each sense
    lists the linked verb under ``form_of``.
    """
    self.wxr.wtp.start_page("dièse")
    self.wxr.wtp.add_page("Modèle:langue", 10, "Français")
    self.wxr.wtp.add_page("Modèle:S", 10, "Forme de verbe")
    wikitext = """== {{langue|fr}} ==
=== {{S|verbe|fr|flexion}} ===
# ''Première personne du singulier de l’indicatif présent du verbe'' [[diéser]].
# ''Troisième personne du singulier de l’indicatif présent du verbe'' [[diéser]]."""
    parsed_entries = parse_page(self.wxr, "dièse", wikitext)
    expected_senses = [
        {
            "form_of": [{"word": "diéser"}],
            "glosses": [
                "Première personne du singulier de l’indicatif présent du verbe diéser."
            ],
        },
        {
            "form_of": [{"word": "diéser"}],
            "glosses": [
                "Troisième personne du singulier de l’indicatif présent du verbe diéser."
            ],
        },
    ]
    expected_entries = [
        {
            "lang": "Français",
            "lang_code": "fr",
            "pos": "verb",
            "pos_title": "Forme de verbe",
            "senses": expected_senses,
            "tags": ["form-of"],
            "word": "dièse",
        }
    ]
    self.assertEqual(parsed_entries, expected_entries)
15 changes: 15 additions & 0 deletions tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,18 @@ def test_italic_sense_node(self):
{"word": "more", "sense": "selon les adjectifs"},
],
)

def test_no_linkage_empty_tag(self):
    """Space-padded parentheses around an italic link yield only a sense.

    The holonym extracted from ``* [[korpo]] ( ''[[corps]]'' )`` must dump
    to just its word and sense; no other non-default field (such as a tag)
    may be present.
    """
    entry = WordEntry(word="gambo", lang_code="eo", lang="Espéranto")
    page_data = [entry]
    self.wxr.wtp.start_page("gambo")
    root = self.wxr.wtp.parse("* [[korpo]] ( ''[[corps]]'' )")
    extract_linkage(self.wxr, page_data, root, "holonymes")
    dumped_holonyms = []
    for linkage in page_data[-1].holonyms:
        dumped_holonyms.append(linkage.model_dump(exclude_defaults=True))
    expected_holonyms = [
        {"word": "korpo", "sense": "corps"},
    ]
    self.assertEqual(dumped_holonyms, expected_holonyms)
13 changes: 8 additions & 5 deletions tests/test_fr_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,16 @@


class TestFrPage(TestCase):
maxDiff = None

def setUp(self):
self.maxDiff = None
conf1 = WiktionaryConfig(
dump_file_lang_code="fr",
capture_language_codes=None,
self.wxr = WiktextractContext(
Wtp(lang_code="fr"),
WiktionaryConfig(
dump_file_lang_code="fr",
capture_language_codes=None,
),
)
self.wxr = WiktextractContext(Wtp(lang_code="fr"), conf1)

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
Expand Down
2 changes: 1 addition & 1 deletion tests/test_fr_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_no_ipa(self):
self.wxr.wtp.add_page(
"Modèle:écouter",
10,
'<span><span>Suède</span>&nbsp;: écouter «&nbsp;<span>mars</span> <span><span>[<small><span>[//fr.wiktionary.org Prononciation ?]</span></small>]</span></span>&nbsp;» <span>[[File:LL-Q9027 (swe)-Moonhouse-mars.wav]]</span></span>',
"<span><span>Suède</span>&nbsp;: écouter «&nbsp;<span>mars</span> <span><span>[<small><span>[//fr.wiktionary.org Prononciation ?]</span></small>]</span></span>&nbsp;» <span>[[File:LL-Q9027 (swe)-Moonhouse-mars.wav]]</span></span>",
)
extract_pronunciation(
self.wxr,
Expand Down

0 comments on commit 5c4c72d

Please sign in to comment.