Skip to content

Commit

Permalink
Merge pull request #374 from xxyzz/fr
Browse files Browse the repository at this point in the history
Extract French Wiktionary notes section
  • Loading branch information
xxyzz authored Oct 20, 2023
2 parents bcef60e + 065cc4a commit 36dc65d
Show file tree
Hide file tree
Showing 14 changed files with 102 additions and 12 deletions.
6 changes: 6 additions & 0 deletions json_schema/fr.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@
"items": {
"type": "string"
}
},
"notes": {
"type": "array",
"items": {
"type": "string"
}
}
},
"$defs": {
Expand Down
6 changes: 5 additions & 1 deletion src/wiktextract/data/fr/other_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
"conjugaison",
"conjug"
],
"notes": [
"notes",
"note"
],
"pronunciation": [
"prononciation",
"pron",
Expand All @@ -41,4 +45,4 @@
"trad-trier",
"trad trier"
]
}
}
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def parse_page(
)
continue
if (
wxr.config.capture_language_codes
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import LEVEL_KINDS, clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/fr/note.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from typing import Any, Dict, List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_note(
    wxr: WiktextractContext,
    page_data: List[Dict[str, Any]],
    level_node: WikiNode,
) -> None:
    """Append the note texts found under ``level_node`` to the last entry.

    The children of ``level_node`` are handled in order:

    * a template whose name starts with ``note-`` is handed to
      ``process_note_template``;
    * every list item of a list node becomes one note;
    * any other child is buffered as part of the current paragraph.  The
      buffer is turned into a single note when a string child ending with a
      newline is reached, and once more after the last child so that a final
      paragraph without a trailing newline is not lost.

    Texts that are empty after ``clean_node`` are skipped.  Notes are
    appended to ``page_data[-1]["notes"]``; nothing is returned.
    """
    # NOTE(review): ``page_data[-1]["notes"]`` is indexed without a check, so
    # the last entry presumably is a ``defaultdict(list)`` or already holds a
    # "notes" list - confirm against the caller.
    paragraph_nodes = []

    def flush_paragraph() -> None:
        # Emit the buffered paragraph as one note and reset the buffer.
        if not paragraph_nodes:
            return
        paragraph_text = clean_node(wxr, page_data[-1], paragraph_nodes)
        if len(paragraph_text) > 0:
            page_data[-1]["notes"].append(paragraph_text)
        paragraph_nodes.clear()

    for child in level_node.children:
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            "note-"
        ):
            process_note_template(wxr, page_data, child)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item_node in child.find_child(NodeKind.LIST_ITEM):
                note_text = clean_node(
                    wxr, page_data[-1], list_item_node.children
                )
                if len(note_text) > 0:
                    page_data[-1]["notes"].append(note_text)
            continue

        paragraph_nodes.append(child)
        # A string child ending with a newline closes the current paragraph.
        if isinstance(child, str) and child.endswith("\n"):
            flush_paragraph()

    # The last paragraph may not end with a newline (typical for expanded
    # template text); without this flush it would be silently dropped.
    flush_paragraph()


def process_note_template(
    wxr: WiktextractContext,
    page_data: List[Dict[str, Any]],
    template_node: TemplateNode,
) -> None:
    """Expand a note template and extract notes from the expanded result.

    The template node is converted back to wikitext, parsed again with
    ``expand_all=True``, and the resulting node is passed to
    ``extract_note`` together with ``wxr`` and ``page_data``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    extract_note(wxr, page_data, expanded_root)
6 changes: 4 additions & 2 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.datautils import append_base_data
from wiktextract.page import LEVEL_KINDS, clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -15,6 +14,7 @@
from .gloss import extract_gloss, process_exemple_template
from .inflection import extract_inflection
from .linkage import extract_linkage
from .note import extract_note
from .pronunciation import extract_pronunciation
from .translation import extract_translation

Expand Down Expand Up @@ -91,6 +91,8 @@ def parse_section(
in wxr.config.OTHER_SUBTITLES["inflection_sections"]
):
pass
elif section_type in wxr.config.OTHER_SUBTITLES["notes"]:
extract_note(wxr, page_data, level_node)


def process_pos_block(
Expand Down Expand Up @@ -164,7 +166,7 @@ def parse_page(
categories_and_links = defaultdict(list)
lang_code = subtitle_template.template_parameters.get(1)
if (
wxr.config.capture_language_codes
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def parse_page(
)
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
if (
wxr.config.capture_language_codes
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue
Expand Down
38 changes: 38 additions & 0 deletions tests/test_fr_note.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import unittest
from collections import defaultdict

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.note import extract_note
from wiktextract.wxr_context import WiktextractContext


class TestNotes(unittest.TestCase):
    """Tests for ``extract_note`` of the French extractor."""

    def setUp(self) -> None:
        """Create a French-configured context for each test."""
        wtp = Wtp(lang_code="fr")
        config = WiktionaryConfig(dump_file_lang_code="fr")
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        """Close the database connection opened by the context."""
        self.wxr.wtp.close_db_conn()

    def test_list_notes(self):
        """A paragraph followed by a list-producing note template."""
        # list created from template "note-féminisation"
        # https://fr.wiktionary.org/wiki/autrice
        wtp = self.wxr.wtp
        wtp.add_page("Modèle:note-féminisation", 10, "* list 1\n* list 2")
        wtp.start_page("autrice")
        root = wtp.parse("""==== {{S|notes}} ====
paragrapy 1
{{note-féminisation}}""")
        section_node = root.children[0]
        page_data = [defaultdict(list)]
        extract_note(self.wxr, page_data, section_node)
        expected_notes = ["paragrapy 1", "list 1", "list 2"]
        self.assertEqual(page_data, [{"notes": expected_notes}])

0 comments on commit 36dc65d

Please sign in to comment.