Skip to content

Commit

Permalink
Merge branch 'master' of github.com:tatuylonen/wiktextract
Browse files Browse the repository at this point in the history
  • Loading branch information
kristian-clausal committed Jan 2, 2024
2 parents f9db0e9 + e0524ae commit 9bd7131
Show file tree
Hide file tree
Showing 18 changed files with 532 additions and 229 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
make coverage_report
make github_pages REPO=${{ github.repository }} SHA=${{ github.sha }}
if: github.ref_name == 'master' && matrix.python-version == '3.12'
- uses: actions/upload-pages-artifact@v2
- uses: actions/upload-pages-artifact@v3
if: github.ref_name == 'master' && matrix.python-version == '3.12'

deploy:
Expand All @@ -52,4 +52,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- id: deployment
uses: actions/deploy-pages@v3
uses: actions/deploy-pages@v4
16 changes: 16 additions & 0 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class WiktionaryConfig:
"ZH_PRON_TAGS",
"analyze_templates",
"extract_thesaurus_pages",
"save_ns_names",
"extract_ns_names",
)

def __init__(
Expand Down Expand Up @@ -135,6 +137,20 @@ def __init__(
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
# these namespace pages will be copied from the XML dump file and
# saved to a SQLite db file
self.save_ns_names = [
"Main",
"Category", # do we use this?
"Appendix",
"Project",
"Thesaurus",
"Module",
"Template",
"Reconstruction",
]
# these are extracted namespaces
self.extract_ns_names = ["Main", "Reconstruction"]
self.load_edition_settings()

def merge_return(self, ret: CollatedErrorReturnData):
Expand Down
4 changes: 3 additions & 1 deletion src/wiktextract/data/fr/config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
"extract_thesaurus_pages": false,
"save_ns_names": ["Main", "Template", "Module", "Conjugaison"],
"extract_ns_names": ["Main"]
}
20 changes: 7 additions & 13 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,47 +21,41 @@ def extract_gloss(
)
)
gloss_data = Sense()
gloss_start = 0
# process modifier, theme templates before gloss text
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
tag_indexes = set()
for index, gloss_node in enumerate(gloss_nodes):
if isinstance(gloss_node, TemplateNode):
categories_data = defaultdict(list)
expanded_text = clean_node(wxr, categories_data, gloss_node)
if expanded_text.startswith("(") and expanded_text.endswith(
")"
):
gloss_start = index + 1
tag = expanded_text.strip("() \n")
if len(tag) > 0:
gloss_data.tags.append(tag)
if "categories" in categories_data:
gloss_data.categories.extend(
categories_data["categories"]
)

gloss_only_nodes = []
tag_indexes = set()
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
tag_indexes.add(index)
# if an italic node is between parentheses then it's a tag, also
# don't add the parenthesis strings to `gloss_only_nodes`
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and index > gloss_start
elif (
isinstance(gloss_node, WikiNode)
and gloss_node.kind == NodeKind.ITALIC
and isinstance(gloss_nodes[index - 1], str)
and gloss_nodes[index - 1].strip() == "("
and index + 1 < len(gloss_nodes)
and isinstance(gloss_nodes[index + 1], str)
and gloss_nodes[index + 1].strip() == ")"
):
gloss_data.tags.append(clean_node(wxr, None, node))
gloss_data.tags.append(clean_node(wxr, None, gloss_node))
tag_indexes |= {index - 1, index, index + 1}
continue

gloss_only_nodes = [
node
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
for index, node in enumerate(gloss_nodes)
if index not in tag_indexes
]
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
Expand Down
39 changes: 38 additions & 1 deletion src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ def extract_inflection(
) -> None:
# inflection templates
# https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
process_inflection_table(wxr, page_data, template_node)
if template_node.template_name.startswith("en-adj"):
process_en_adj_table(wxr, page_data, template_node)
else:
process_inflection_table(wxr, page_data, template_node)


IGNORE_TABLE_HEADERS = frozenset(
Expand Down Expand Up @@ -192,3 +195,37 @@ def insert_ipa(form: Form, ipa_text: str) -> None:
if len(ipa_data) == 0:
return
form.ipas.extend(ipa_data)


def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    # https://fr.wiktionary.org/wiki/Modèle:en-adj
    # Handles "en-adj" and the other "en-adj*" templates. Their expanded
    # tables use plain data cells (not header cells) for the first column,
    # so the generic inflection-table walker cannot be used.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    tables = list(expanded_node.find_child(NodeKind.TABLE))
    if not tables:
        return
    for row_index, row in enumerate(tables[0].find_child(NodeKind.TABLE_ROW)):
        if row_index == 0:
            # first row only labels the columns
            continue
        if len(row.children) <= 1:
            continue
        form_data = Form()
        # first cell is the row label (e.g. comparative/superlative tag)
        form_data.tags.append(clean_node(wxr, None, row.children[0]))
        # second cell holds the form itself plus optional IPA lines
        for cell_line in clean_node(wxr, None, row.children[1]).splitlines():
            if is_ipa_text(cell_line):
                insert_ipa(form_data, cell_line)
            else:
                form_data.form = cell_line
        # skip the row that merely repeats the page title
        if form_data.form != page_data[-1].word:
            page_data[-1].forms.append(form_data)
30 changes: 23 additions & 7 deletions src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
Expand Down Expand Up @@ -33,12 +35,14 @@ def process_derives_autres_list(
for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
lang_code = ""
lang_name = ""
for template_node in list_item.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "L":
lang_code = template_node.template_parameters.get(1)
lang_name = clean_node(wxr, None, template_node)
elif template_node.template_name == "lien":
word = clean_node(wxr, None, template_node)
for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
if isinstance(node, TemplateNode) and node.template_name == "L":
lang_code = node.template_parameters.get(1)
lang_name = clean_node(wxr, None, node)
elif node.kind == NodeKind.LINK or (
isinstance(node, TemplateNode) and node.template_name == "lien"
):
word = clean_node(wxr, None, node)
page_data[-1].derived.append(
Linkage(lang_code=lang_code, lang_name=lang_name, word=word)
)
Expand Down Expand Up @@ -66,9 +70,21 @@ def process_linkage_list(
sense_index_text = template_or_list_node.template_parameters.get(
2, "0"
)
if sense_index_text.isdigit():
if isinstance(sense_index_text, str) and sense_index_text.isdigit():
sense_index = int(sense_index_text)
continue
# sense could also be in ";" description list
if (
template_or_list_node.kind == NodeKind.LIST_ITEM
and template_or_list_node.sarg == ";"
):
sense_text = clean_node(wxr, None, template_or_list_node.children)
index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
m = re.search(index_pattern, sense_text)
if m is not None:
sense_text = re.sub(index_pattern, "", sense_text)
sense_index = int(m.group(1))
continue

linkage_data = Linkage()
if len(sense_text) > 0:
Expand Down
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class Translation(FrenchBaseModel):
lang_name: str = Field("", description="Translation language name")
word: str = Field("", description="Translation term")
sense: str = Field("", description="Translation gloss")
sense_index: int = Field(
0, ge=0, description="Number of the definition, start from 1"
)
tags: list[str] = []
roman: str = ""
traditional_writing: str = Field(
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def parse_section(
wxr.config.capture_translations
and section_type in wxr.config.OTHER_SUBTITLES["translations"]
):
extract_translation(wxr, page_data, level_node)
extract_translation(wxr, page_data, base_data, level_node)
elif (
wxr.config.capture_inflections
and section_type
Expand All @@ -114,7 +114,7 @@ def process_pos_block(
pos_title: str,
):
pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"]
if len(page_data) == 0 or "pos" not in page_data[-1].model_fields_set:
if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:
page_data.append(base_data.model_copy(deep=True))
page_data[-1].pos = pos_type
page_data[-1].pos_title = pos_title
Expand Down
19 changes: 13 additions & 6 deletions src/wiktextract/extractor/fr/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -15,11 +15,18 @@ def extract_pronunciation(
) -> None:
sound_data = []
lang_code = base_data.lang_code
for list_node in level_node.find_child(NodeKind.LIST):
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
sound_data.extend(
process_pron_list_item(wxr, list_item_node, Sound(), lang_code)
)
for node in level_node.find_child(NodeKind.LIST | LEVEL_KIND_FLAGS):
if node.kind == NodeKind.LIST:
for list_item_node in node.find_child(NodeKind.LIST_ITEM):
sound_data.extend(
process_pron_list_item(
wxr, list_item_node, Sound(), lang_code
)
)
else:
from .page import parse_section

parse_section(wxr, page_data, base_data, node)

if len(sound_data) == 0:
return
Expand Down
25 changes: 18 additions & 7 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
from typing import Optional

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Translation, WordEntry


def extract_translation(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node: WikiNode,
) -> None:
base_translation_data = Translation()
for level_node_child in level_node.filter_empty_str_child():
Expand Down Expand Up @@ -38,6 +41,10 @@ def extract_translation(
wxr, child_node, previous_node, page_data
)
previous_node = child_node
elif level_node_child.kind in LEVEL_KIND_FLAGS:
from .page import parse_section

parse_section(wxr, page_data, base_data, level_node_child)


def process_italic_node(
Expand Down Expand Up @@ -70,11 +77,13 @@ def process_translation_templates(
return
elif template_node.template_name == "trad-début":
# translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
sense_parameter = template_node.template_parameters.get(1)
if sense_parameter is not None:
sense_text = clean_node(wxr, None, sense_parameter)
if len(sense_text) > 0:
base_translation_data.sense = sense_text
sense_parameter = template_node.template_parameters.get(1, "")
sense_text = clean_node(wxr, None, sense_parameter)
base_translation_data.sense = sense_text
sense_index_str = template_node.template_parameters.get(2, "0")
if isinstance(sense_index_str, str) and sense_index_str.isdigit():
base_translation_data.sense_index = int(sense_index_str)

elif template_node.template_name == "T":
# Translation language: https://fr.wiktionary.org/wiki/Modèle:T
base_translation_data.lang_code = template_node.template_parameters.get(
Expand All @@ -85,6 +94,8 @@ def process_translation_templates(
)
elif template_node.template_name.startswith("trad"):
# Translation term: https://fr.wiktionary.org/wiki/Modèle:trad
if 2 not in template_node.template_parameters: # required parameter
return
translation_term = clean_node(
wxr,
None,
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def reprocess_wiktionary(
process_ns_ids = list(
{
wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)
for ns in ["Main", "Reconstruction"]
for ns in wxr.config.extract_ns_names
}
)
start_time = time.time()
Expand Down
16 changes: 2 additions & 14 deletions src/wiktextract/wiktwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,6 @@
from wiktextract.wiktionary import write_json_data
from wiktextract.wxr_context import WiktextractContext

# Pages within these namespaces are captured.
RECOGNIZED_NAMESPACE_NAMES = [
"Main",
"Category",
"Appendix",
"Project",
"Thesaurus",
"Module",
"Template",
"Reconstruction",
]


def process_single_page(
path_or_title: str,
Expand Down Expand Up @@ -440,8 +428,8 @@ def main():
try:
if args.path is not None:
namespace_ids = {
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
for name in RECOGNIZED_NAMESPACE_NAMES
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id", 0)
for name in wxr.config.save_ns_names
}
# Parse the normal full Wiktionary data dump
parse_wiktionary(
Expand Down
26 changes: 26 additions & 0 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,29 @@ def test_nest_gloss(self):
},
],
)

def test_sandwich_tag(self):
    """Tag templates before, inside, and after the gloss text are all
    collected into ``tags`` and stripped from the gloss string.

    The entry mixes leading tag templates ({{lexique}}, {{rare}}), a
    {{lien}} link whose expansion is part of the gloss, and a tag
    template sandwiched mid-sentence ({{absolument}}).
    """
    # https://fr.wiktionary.org/wiki/autrice#Nom_commun_4
    self.wxr.wtp.start_page("autrice")
    # register the templates the wikitext below expands; the tag
    # templates expand to parenthesized italics, {{lien}} to plain text
    self.wxr.wtp.add_page("Modèle:lexique", 10, "''(Littérature)''")
    self.wxr.wtp.add_page("Modèle:rare", 10, "''(Rare)''")
    self.wxr.wtp.add_page("Modèle:lien", 10, "Autrice")
    self.wxr.wtp.add_page("Modèle:absolument", 10, "''(Absolument)''")
    root = self.wxr.wtp.parse(
        "# {{lexique|littérature|nl}} {{rare|nl}} {{lien|autrice|fr|dif=Autrice}}, femme qui a créé une œuvre littéraire. {{absolument}} [[écrivaine|Écrivaine]]."
    )
    page_data = [
        WordEntry(word="autrice", lang_code="nl", lang_name="Néerlandais")
    ]
    extract_gloss(self.wxr, page_data, root.children[0])
    self.assertEqual(
        [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
        [
            {
                "glosses": [
                    "Autrice, femme qui a créé une œuvre littéraire. Écrivaine."
                ],
                "tags": ["Littérature", "Rare", "Absolument"],
            }
        ],
    )
Loading

0 comments on commit 9bd7131

Please sign in to comment.