Skip to content

Commit

Permalink
Extract French Wiktionary etymology lists
Browse files Browse the repository at this point in the history
Unlike English Wiktionary, French Wiktionary writes the etymology data
of all POS types inside the same section, and the data of each POS is
marked with a list ("*") or an indent (":").
  • Loading branch information
xxyzz committed Sep 25, 2023
1 parent 2c4523e commit 152cc43
Show file tree
Hide file tree
Showing 7 changed files with 346 additions and 119 deletions.
154 changes: 149 additions & 5 deletions tests/test_fr_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.page import extract_etymology
from wiktextract.extractor.fr.etymology import (
extract_etymology,
insert_etymology_data,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext

Expand All @@ -23,9 +26,150 @@ def tearDown(self) -> None:

def test_ebauche_etym(self):
    """A section holding only the "ébauche-étym" template yields None."""
    # https://fr.wiktionary.org/wiki/Hörsaal
    # missing etymology template "ébauche-étym" should be ignored
    self.wxr.wtp.start_page("")
    root = self.wxr.wtp.parse(": {{ébauche-étym|de}}")
    # extract_etymology() takes only the context and the section nodes and
    # returns the extracted data, so no page_data/base_data are needed here
    etymology_data = extract_etymology(self.wxr, root.children)
    self.assertIsNone(etymology_data)

def test_list_etymologies(self):
    """Etymology lines placed under "*" POS headings are grouped by title.

    Checks both the dictionary returned by extract_etymology() and the
    "etymology_texts" field written by insert_etymology_data().
    """
    # https://fr.wiktionary.org/wiki/lenn
    self.wxr.wtp.start_page("lenn")
    wikitext = """* [[#br-nom-1|Nom commun 1 :]]
: Du vieux breton lin (« lac, étang ; liquide, humeur »).
: Du moyen breton lenn.
* [[#br-nom-2|Nom commun 2 :]]
:Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)."""
    parsed_root = self.wxr.wtp.parse(wikitext)

    first_noun_texts = [
        "Du vieux breton lin (« lac, étang ; liquide, humeur »).",
        "Du moyen breton lenn.",
    ]
    second_noun_texts = [
        "Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)."
    ]

    etymology_data = extract_etymology(self.wxr, parsed_root.children)
    expected_etymology = {
        "Nom commun 1": first_noun_texts,
        "Nom commun 2": second_noun_texts,
    }
    self.assertEqual(etymology_data, expected_etymology)

    # one word entry per POS title, as the page parser would have built
    page_data = [
        defaultdict(
            list,
            {"lang_code": "fr", "pos": "noun", "pos_title": title},
        )
        for title in ("Nom commun 1", "Nom commun 2")
    ]
    insert_etymology_data("fr", page_data, etymology_data)
    expected_page_data = [
        {
            "lang_code": "fr",
            "pos": "noun",
            "pos_title": "Nom commun 1",
            "etymology_texts": first_noun_texts,
        },
        {
            "lang_code": "fr",
            "pos": "noun",
            "pos_title": "Nom commun 2",
            "etymology_texts": second_noun_texts,
        },
    ]
    self.assertEqual(page_data, expected_page_data)

def test_indent_etymology_with_pos_template(self):
    """Indented lines starting with "lien-ancre-étym" are keyed by POS.

    The template is stubbed to expand to "(<POS title> <index>)". The
    third entry checks that an etymology key carrying the index " 1"
    still matches a word entry whose POS title has no index.
    """
    # https://fr.wiktionary.org/wiki/dame
    # the page title now matches the page the sample is taken from
    self.wxr.wtp.start_page("dame")
    self.wxr.wtp.add_page("Modèle:lien-ancre-étym", 10, "({{{2}}} {{{3}}})")
    root = self.wxr.wtp.parse(
        """: {{lien-ancre-étym|fr|Nom commun|1}} Du latin domina (« maîtresse de maison »).
: {{lien-ancre-étym|fr|Nom commun|2}} Du moyen néerlandais dam (« digue »).
: {{lien-ancre-étym|fr|Interjection|1}} Abréviation de « [[Notre-Dame]] ! » ou de « dame Dieu ! » (« [[Seigneur Dieu]] ! »).
"""
    )
    etymology_data = extract_etymology(self.wxr, root.children)
    self.assertEqual(
        etymology_data,
        {
            "Nom commun 1": ["Du latin domina (« maîtresse de maison »)."],
            "Nom commun 2": ["Du moyen néerlandais dam (« digue »)."],
            "Interjection 1": [
                "Abréviation de « Notre-Dame ! » ou de « dame Dieu ! » (« Seigneur Dieu ! »)."
            ],
        },
    )
    page_data = [
        defaultdict(
            list,
            {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 1"},
        ),
        defaultdict(
            list,
            {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 2"},
        ),
        defaultdict(
            list,
            {"lang_code": "fr", "pos": "intj", "pos_title": "Interjection"},
        ),
    ]
    insert_etymology_data("fr", page_data, etymology_data)
    self.assertEqual(
        page_data,
        [
            {
                "lang_code": "fr",
                "pos": "noun",
                "pos_title": "Nom commun 1",
                "etymology_texts": [
                    "Du latin domina (« maîtresse de maison »)."
                ],
            },
            {
                "lang_code": "fr",
                "pos": "noun",
                "pos_title": "Nom commun 2",
                "etymology_texts": [
                    "Du moyen néerlandais dam (« digue »)."
                ],
            },
            {
                "lang_code": "fr",
                "pos": "intj",
                "pos_title": "Interjection",
                "etymology_texts": [
                    "Abréviation de « Notre-Dame ! » ou de « dame Dieu ! » (« Seigneur Dieu ! »)."
                ],
            },
        ],
    )

def test_indent_etymology_with_italic_pos(self):
    """Indented lines starting with an italic POS in parentheses are keyed
    by that POS; the short title "Nom" is expected as "Nom commun".
    """
    # https://fr.wiktionary.org/wiki/hélas
    self.wxr.wtp.start_page("hélas")
    wikitext = """: (''[[#Interjection|Interjection]]'') XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux ».
: (''[[#fr-nom|Nom]]'') Par [[substantivation]] de l’interjection.
"""
    parsed_root = self.wxr.wtp.parse(wikitext)
    expected_etymology = {
        "Interjection": [
            "XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux »."
        ],
        "Nom commun": ["Par substantivation de l’interjection."],
    }
    etymology_data = extract_etymology(self.wxr, parsed_root.children)
    self.assertEqual(etymology_data, expected_etymology)
8 changes: 7 additions & 1 deletion tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,19 @@ def test_zh_exemple_template(self):
)
page_data = [defaultdict(list)]
process_pos_block(
self.wxr, page_data, defaultdict(list), root.children[0], "nom"
self.wxr,
page_data,
defaultdict(list),
root.children[0],
"nom",
"Nom commun",
)
self.assertEqual(
page_data,
[
{
"pos": "noun",
"pos_title": "Nom commun",
"senses": [
{
"glosses": ["Cheval."],
Expand Down
7 changes: 1 addition & 6 deletions tests/test_fr_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,6 @@ def setUp(self):
conf1 = WiktionaryConfig(
dump_file_lang_code="fr",
capture_language_codes=None,
capture_translations=True,
capture_pronunciation=True,
capture_linkages=True,
capture_compounds=True,
capture_redirects=True,
capture_examples=True,
)
self.wxr = WiktextractContext(Wtp(lang_code="fr"), conf1)

Expand Down Expand Up @@ -52,6 +46,7 @@ def test_fr_parse_page(self):
"lang": "Français",
"lang_code": "fr",
"pos": "noun",
"pos_title": "Nom commun",
"word": "exemple",
}
],
Expand Down
1 change: 1 addition & 0 deletions wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ def append_base_data(
# append new dictionary if the last dictionary has sense data and
# also has the same key
page_data.append(copy.deepcopy(base_data))
page_data[-1][field] = value
elif isinstance(page_data[-1].get(field), list):
page_data[-1][field] += value
else:
Expand Down
118 changes: 118 additions & 0 deletions wiktextract/extractor/fr/etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode, TemplateNode

from wiktextract.page import LEVEL_KINDS, clean_node
from wiktextract.wxr_context import WiktextractContext

# Maps a POS title to its etymology texts. The key is None for texts that
# are not attached to any POS title; such texts apply to every POS.
EtymologyData = Dict[Optional[str], List[str]]


def extract_etymology(
    wxr: WiktextractContext,
    nodes: List[Union[WikiNode, str]],
) -> Optional[EtymologyData]:
    """
    Collect the etymology texts of an etymology section, grouped by POS title.

    Only the nodes placed before the first level (sub-section) node are
    read. A "*" list sets the POS title used for the ":" lists that follow
    it. Each item of a ":" list is stored under the POS title found at the
    start of the item, or under the current "*" title (None if no "*" list
    was seen yet).

    Returns None as soon as a ":" list contains the "ébauche-étym" template
    (placeholder for a missing etymology), otherwise the collected mapping.
    """
    texts_by_pos: EtymologyData = defaultdict(list)

    # keep the nodes after the etymology subtitle and before the next
    # level node
    section_nodes = []
    for candidate in nodes:
        if isinstance(candidate, WikiNode) and candidate.kind in LEVEL_KINDS:
            break
        section_nodes.append(candidate)

    current_pos_title: Optional[str] = None
    for list_node in section_nodes:
        if not isinstance(list_node, WikiNode):
            continue
        if list_node.kind != NodeKind.LIST:
            continue

        if list_node.sarg == "*":
            # POS heading: drop the list marker and the trailing colon
            raw_title = clean_node(wxr, None, list_node)
            current_pos_title = raw_title.removeprefix("* ").removesuffix(
                " :"
            )
            continue
        if list_node.sarg != ":":
            continue

        # ignore missing etymology template "ébauche-étym"
        if any(
            template.template_name == "ébauche-étym"
            for template in list_node.find_child_recursively(
                NodeKind.TEMPLATE
            )
        ):
            return None

        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            pos_and_text = find_pos_in_etymology_list(wxr, list_item)
            if pos_and_text is None:
                # no POS marker in the item: use the current "*" title
                item_text = clean_node(wxr, None, list_item.children)
                texts_by_pos[current_pos_title].append(item_text)
            else:
                item_pos_title, item_text = pos_and_text
                texts_by_pos[item_pos_title].append(item_text)

    return texts_by_pos


def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> Optional[Tuple[str, str]]:
    """
    Return tuple of POS title and etymology text if the passed list item node
    starts with italic POS node or POS template, otherwise return None.

    Two leading forms are recognized among the non-empty children:
    - a "lien-ancre-étym" template as the first child;
    - an italic node as the second child, placed between a string ending
      with "(" and a string starting with ")". The italic title "Nom" is
      returned as "Nom commun".
    """
    child_nodes = list(list_item_node.filter_empty_str_child())
    if not child_nodes:
        return None

    first_node = child_nodes[0]
    if (
        isinstance(first_node, TemplateNode)
        and first_node.template_name == "lien-ancre-étym"
    ):
        # strip the parentheses around the expanded template text
        return clean_node(wxr, None, first_node).strip("()"), clean_node(
            wxr, None, child_nodes[1:]
        )

    # The italic form needs three children: "(", the italic node and ")".
    # Without this check an item ending right after the italic node would
    # raise IndexError on the third child.
    if len(child_nodes) < 3:
        return None

    opening, italic_node, closing = child_nodes[:3]
    if (
        isinstance(italic_node, WikiNode)
        and italic_node.kind == NodeKind.ITALIC
        and isinstance(opening, str)
        and opening.endswith("(")
        and isinstance(closing, str)
        and closing.startswith(")")
    ):
        # italic pos
        pos_title = clean_node(wxr, None, italic_node)
        if pos_title == "Nom":
            pos_title = "Nom commun"
        return pos_title, clean_node(
            wxr, None, child_nodes[2:]
        ).removeprefix(") ")

    return None


def insert_etymology_data(
    lang_code: str, page_data: List[Dict], etymology_data: EtymologyData
) -> None:
    """
    Insert list of etymology data extracted from the level 3 node to each
    sense dictionary that matches the language and POS.

    Only dictionaries whose "lang_code" equals `lang_code` are touched; the
    texts are stored under the "etymology_texts" key. Matching rules for
    each key of `etymology_data`:
    - None: the texts are added to every dictionary of the language;
    - a POS title: added to every dictionary with that "pos_title";
    - a POS title ending with " 1" that matches nothing: retried without
      the " 1" suffix.
    Keys matching nothing are ignored. `page_data` is modified in place.
    """
    # Group by pos title. Several dictionaries may share one title (or have
    # none at all), so keep all of them instead of only the last one.
    sense_dict = defaultdict(list)
    for sense_data in page_data:
        if sense_data.get("lang_code") == lang_code:
            sense_dict[sense_data.get("pos_title")].append(sense_data)

    for pos_title, etymology_texts in etymology_data.items():
        if pos_title is None:  # add to all sense dictionaries
            matched = [
                sense_data
                for same_title in sense_dict.values()
                for sense_data in same_title
            ]
        elif pos_title in sense_dict:
            matched = sense_dict[pos_title]
        else:
            # an index number is added in the etymology section but not
            # added in POS title; .get() avoids creating a defaultdict key
            matched = sense_dict.get(pos_title.removesuffix(" 1"), [])

        for sense_data in matched:
            # give each dictionary its own list so that a later change to
            # one entry does not leak into the others
            sense_data["etymology_texts"] = list(etymology_texts)
2 changes: 1 addition & 1 deletion wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .pronunciation import is_ipa_text, insert_ipa
from .pronunciation import insert_ipa, is_ipa_text


def extract_inflection(
Expand Down
Loading

0 comments on commit 152cc43

Please sign in to comment.