Skip to content

Commit

Permalink
Extract "Onglets conjugaison" template and transclude pages
Browse files Browse the repository at this point in the history
Some IPA links and the final "Impératif" table can't be parsed because
the parser can't handle `<nowiki />` in these cases:

- `[[ignore|<span>\\<nowiki /> sə <nowiki/>ipa\\</span>]]`
- `<nowiki />` tag in table cell
  • Loading branch information
xxyzz committed Jan 9, 2024
1 parent a279eb5 commit 13c2181
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 123 deletions.
40 changes: 34 additions & 6 deletions src/wiktextract/extractor/fr/conjugation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import HTMLNode, TemplateNode
from wiktextract.page import clean_node
Expand All @@ -6,7 +8,12 @@
from .models import Form, WordEntry


def extract_conjugation(wxr: WiktextractContext, entry: WordEntry) -> None:
def extract_conjugation(
wxr: WiktextractContext,
entry: WordEntry,
word: str = "",
select_template: str = "1",
) -> None:
"""
Find and extract conjugation page.
Expand All @@ -15,16 +22,21 @@ def extract_conjugation(wxr: WiktextractContext, entry: WordEntry) -> None:
https://fr.wiktionary.org/wiki/Aide:Conjugaisons
"""
conj_ns = wxr.wtp.NAMESPACE_DATA["Conjugaison"]
conj_page_title = (
f"{conj_ns['name']}:{entry.lang.lower()}/{entry.word}"
)
if len(word) == 0:
word = entry.word
conj_page_title = f"{conj_ns['name']}:{entry.lang.lower()}/{word}"
conj_page = wxr.wtp.get_page_body(conj_page_title, conj_ns["id"])
if conj_page is None:
return
conj_root = wxr.wtp.parse(conj_page)
for conj_template in conj_root.find_child(NodeKind.TEMPLATE):
if conj_template.template_name.startswith("fr-conj-"):
process_fr_conj_template(wxr, entry, conj_template)
elif conj_template.template_name == "Onglets conjugaison":
process_onglets_template(wxr, entry, conj_template, select_template)
elif conj_template.template_name.startswith(":Conjugaison:"):
word = conj_template.template_name.rsplit("/", 1)[-1]
extract_conjugation(wxr, entry, word, "2")


def process_fr_conj_template(
Expand Down Expand Up @@ -154,9 +166,25 @@ def process_fr_conj_wiki_table(
cell_text = clean_node(wxr, None, cell)
if cell_index < 2:
form.form += cell_text
if cell_index == 0:
if cell_index == 0 and len(cell_text) > 0:
form.form += " "
else:
form.ipas.append(cell_text)

entry.forms.append(form)
if len(form.form) > 0 and form.form != "—":
entry.forms.append(form)


def process_onglets_template(
wxr: WiktextractContext,
entry: WordEntry,
template_node: TemplateNode,
select: str,
) -> None:
# https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
# this template expands to two tabs of tables
selected_template = template_node.template_parameters.get(
f"contenu{select}"
)
if selected_template is not None:
process_fr_conj_template(wxr, entry, selected_template)
59 changes: 59 additions & 0 deletions tests/test_fr_conj.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,62 @@ def test_fr_conj_1(self):
},
],
)

def test_onglets_conjugaison(self):
    """Pronominal verb page transcluding the plain verb's conjugation page.

    https://fr.wiktionary.org/wiki/Conjugaison:français/s’abattre

    "Conjugaison:français/s’abattre" transcludes
    "Conjugaison:français/abattre" with ``sél=2``; the extractor must then
    follow the transclusion and pick the second tab (``contenu2``, the
    pronominal table) of the "Onglets conjugaison" template.
    """
    self.wxr.wtp.start_page("s’abattre")
    # Base verb page: two tabs, active ("contenu1") and pronominal
    # ("contenu2") conjugations.
    self.wxr.wtp.add_page(
        "Conjugaison:français/abattre",
        116,
        """{{Onglets conjugaison
| onglet1 =Conjugaison active
| contenu1 ={{fr-conj-3-attre|ab|a.b|'=oui}}
| onglet2 =Conjugaison pronominale
| contenu2 ={{fr-conj-3-attre|ab|a.b|'=oui|réfl=1}}
| sél ={{{sél|1}}}
}}""",
    )
    # Pronominal page is a bare transclusion of the base verb's page.
    self.wxr.wtp.add_page(
        "Conjugaison:français/s’abattre",
        116,
        "{{:Conjugaison:français/abattre|sél=2}}",
    )
    # Minimal expansion of the conjugation table template: one header row
    # and one "Infinitif" row with present and past forms plus IPA cells.
    self.wxr.wtp.add_page(
        "Modèle:fr-conj-3-attre",
        10,
        """<h3> Modes impersonnels </h3>
<div>
{|
|-[[mode|Mode]]
!colspan=\"3\"|[[présent|Présent]]
!colspan=\"3\"|[[passé|Passé]]
|-
|'''[[infinitif|Infinitif]]'''
|s’
|[[abattre]]
|<span>\\s‿a.batʁ\\</span>
|s’être
|[[abattu]]
|<span>\\s‿ɛtʁ‿a.ba.ty\\</span>
|}
</div>""",
    )
    entry = WordEntry(lang_code="fr", lang="Français", word="s’abattre")
    extract_conjugation(self.wxr, entry)
    # The two form cells of each column are joined ("s’" + "abattre"
    # without an extra space after the elided "s’"), and each form keeps
    # its IPA cell plus the header-derived tags.
    self.assertEqual(
        [f.model_dump(exclude_defaults=True) for f in entry.forms],
        [
            {
                "form": "s’abattre",
                "ipas": ["\\s‿a.batʁ\\"],
                "source": "Conjugaison page",
                "tags": ["Modes impersonnels", "Infinitif", "Présent"],
            },
            {
                "form": "s’être abattu",
                "ipas": ["\\s‿ɛtʁ‿a.ba.ty\\"],
                "source": "Conjugaison page",
                "tags": ["Modes impersonnels", "Infinitif", "Passé"],
            },
        ],
    )
20 changes: 5 additions & 15 deletions tests/test_fr_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@ def test_ipa(self):
self.wxr.wtp.start_page("bonjour")
self.wxr.wtp.add_page("Modèle:pron", 10, "\\bɔ̃.ʒuʁ\\")
root = self.wxr.wtp.parse("'''bonjour''' {{pron|bɔ̃.ʒuʁ|fr}}")
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="bonjour", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].sounds],
Expand All @@ -37,9 +35,7 @@ def test_gender(self):
self.wxr.wtp.start_page("bonjour")
self.wxr.wtp.add_page("Modèle:m", 10, "masculin")
root = self.wxr.wtp.parse("'''bonjour''' {{m}}")
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="bonjour", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(page_data[-1].tags, ["masculin"])

Expand All @@ -49,9 +45,7 @@ def test_zh_mot(self):
self.wxr.wtp.add_page("Modèle:lang", 10, body="mǎ")
self.wxr.wtp.add_page("Modèle:pron", 10, body="\\ma̠˨˩˦\\")
root = self.wxr.wtp.parse("{{zh-mot|马|mǎ}}")
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
process_zh_mot_template(self.wxr, root.children[0], page_data)
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].sounds],
Expand Down Expand Up @@ -98,9 +92,7 @@ def test_template_in_pron_argument(self):
root = self.wxr.wtp.parse(
"'''minéral argileux''' {{pron|mi.ne.ʁa.l{{liaison|fr}}aʁ.ʒi.lø|fr}}"
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
page_data[-1].sounds[0].model_dump(exclude_defaults=True),
Expand All @@ -117,9 +109,7 @@ def test_equiv_pour_template(self, mock_node_to_wikitext):
root = self.wxr.wtp.parse(
"{{équiv-pour|un homme|auteur|2egenre=une personne non-binaire|2egenre1=autaire|2egenre2=auteurice|2egenre3=auteur·ice|lang=fr}}"
)
page_data = [
WordEntry(word="autrice", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="autrice", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
Expand Down
36 changes: 9 additions & 27 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ def tearDown(self) -> None:
def test_theme_templates(self, mock_get_page):
self.wxr.wtp.start_page("")
root = self.wxr.wtp.parse("# {{sportifs|fr}} gloss.\n#* example")
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -50,9 +48,7 @@ def test_example_template(self):
root = self.wxr.wtp.parse(
"# gloss.\n#* {{exemple|text|translation|roman|source=source}}"
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand Down Expand Up @@ -80,9 +76,7 @@ def test_example_source_template(self, mock_node_to_html):
root = self.wxr.wtp.parse(
"# gloss.\n#* example {{source|source_title}}"
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand Down Expand Up @@ -151,9 +145,7 @@ def test_variante_de(self):
root = self.wxr.wtp.parse(
"# {{désuet|en}} {{sports|en}} {{indénombrable|en}} {{variante de|basketball|en}}."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -171,9 +163,7 @@ def test_italic_tag(self):
root = self.wxr.wtp.parse(
"# (''localement'') [[bassin#Nom_commun|Bassin]], [[lavoir#Nom_commun|lavoir]]."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -186,9 +176,7 @@ def test_not_italic_tag(self):
root = self.wxr.wtp.parse(
"# [[oiseau|Oiseau]] aquatique de taille moyenne du genre ''[[Rhynchops]]''."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -206,9 +194,7 @@ def test_preserve_space_between_tags(self):
# the space between italic node and the link node should be preserved
self.wxr.wtp.start_page("becs-en-ciseaux")
root = self.wxr.wtp.parse("# ''Pluriel de'' [[bec-en-ciseaux]].")
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -225,9 +211,7 @@ def test_template_is_not_tag(self, mock_get_page):
root = self.wxr.wtp.parse(
"# {{lien|autrice|fr|dif=Autrice}}, [[celle]] qui est à l’[[origine]] de [[quelque chose]]."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -250,9 +234,7 @@ def test_nest_gloss(self):
##* nest example
"""
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand Down
Loading

0 comments on commit 13c2181

Please sign in to comment.