Skip to content

Commit

Permalink
Don't use append_base_data() in French extractor
Browse files Browse the repository at this point in the history
The append_base_data() function is intended for the Chinese Wiktionary extractor.
  • Loading branch information
xxyzz committed Nov 29, 2023
1 parent 1ccc547 commit 201c4d5
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 19 deletions.
28 changes: 11 additions & 17 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.datautils import append_base_data
from wiktextract.page import LEVEL_KINDS, clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -45,7 +44,11 @@ def parse_section(
# https://fr.wiktionary.org/wiki/Modèle:S
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
section_type = level_node_template.template_parameters.get(1)
subtitle = clean_node(wxr, page_data[-1], level_node.largs)
subtitle = clean_node(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node.largs,
)
wxr.wtp.start_subsection(subtitle)
if section_type in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
pass
Expand Down Expand Up @@ -104,7 +107,8 @@ def process_pos_block(
pos_title: str,
):
pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"]
append_base_data(page_data, "pos", pos_type, base_data)
page_data.append(copy.deepcopy(base_data))
page_data[-1]["pos"] = pos_type
page_data[-1]["pos_title"] = pos_title
child_nodes = list(pos_title_node.filter_empty_str_child())
form_line_start = 0 # Ligne de forme
Expand Down Expand Up @@ -163,27 +167,17 @@ def parse_page(
# https://fr.wiktionary.org/wiki/Modèle:langue
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
if subtitle_template.template_name == "langue":
categories_and_links = defaultdict(list)
base_data = defaultdict(list, {"word": wxr.wtp.title})
lang_code = subtitle_template.template_parameters.get(1)
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue
lang_name = clean_node(
wxr, categories_and_links, subtitle_template
)
lang_name = clean_node(wxr, base_data, subtitle_template)
wxr.wtp.start_section(lang_name)
base_data = defaultdict(
list,
{
"lang_name": lang_name,
"lang_code": lang_code,
"word": wxr.wtp.title,
},
)
base_data.update(categories_and_links)
page_data.append(copy.deepcopy(base_data))
base_data["lang_name"] = lang_name
base_data["lang_code"] = lang_code
etymology_data: Optional[EtymologyData] = None
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
new_etymology_data = parse_section(
Expand Down
4 changes: 2 additions & 2 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestFormLine(unittest.TestCase):
class TestFrGloss(unittest.TestCase):
def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
Expand Down Expand Up @@ -115,7 +115,7 @@ def test_zh_exemple_template(self):
root = self.wxr.wtp.parse(
"=== {{S|nom|zh}} ===\n# Cheval.\n{{zh-exemple|这匹'''马'''很大。|Ce cheval est grand.|Zhè pǐ '''mǎ''' hěn dà.<br/>⠌⠢⠆ ⠏⠊⠄ ⠍⠔⠄ ⠓⠴⠄ ⠙⠔⠆⠐⠆}}"
)
page_data = [defaultdict(list)]
page_data = []
process_pos_block(
self.wxr,
page_data,
Expand Down

0 comments on commit 201c4d5

Please sign in to comment.