Skip to content

Commit

Permalink
Merge pull request #563 from xxyzz/de
Browse files Browse the repository at this point in the history
Some changes to the de edition's `page.py` and `gloss.py` files
  • Loading branch information
xxyzz authored Mar 27, 2024
2 parents f65abdf + 35ac365 commit 39ac701
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 219 deletions.
24 changes: 12 additions & 12 deletions src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.models import Sense, WordEntry
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Sense, WordEntry
from .utils import find_and_remove_child, match_senseid


def extract_glosses(
wxr: WiktextractContext,
Expand Down Expand Up @@ -37,7 +38,7 @@ def process_gloss_list_item(
item_type = list_item_node.sarg
if item_type == "*":
handle_sense_modifier(wxr, base_sense, list_item_node)
elif item_type in [":", "::"]:
elif item_type.endswith(":"):
if any(
[
template_node.template_name
Expand All @@ -60,9 +61,6 @@ def process_gloss_list_item(
find_and_remove_child(list_item_node, NodeKind.LIST)
)

raw_gloss = clean_node(wxr, {}, list_item_node.children)
sense_data.raw_glosses = [raw_gloss]

process_K_template(wxr, sense_data, list_item_node)

gloss_text = clean_node(wxr, sense_data, list_item_node.children)
Expand All @@ -81,8 +79,8 @@ def process_gloss_list_item(
# XXX: Extract tags from nodes instead using Italic and Template
gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text)

if gloss_text or not sub_glosses_list_nodes:
sense_data.glosses = [gloss_text]
if len(gloss_text) > 0:
sense_data.glosses.append(gloss_text)
word_entry.senses.append(sense_data)

for sub_list_node in sub_glosses_list_nodes:
Expand All @@ -92,7 +90,7 @@ def process_gloss_list_item(
base_sense,
sub_list_node,
senseid,
sense_data if not gloss_text else None,
sense_data,
)

else:
Expand All @@ -112,23 +110,25 @@ def handle_sense_modifier(
f"Found more than one child in sense modifier: {list_item_node.children}",
sortid="extractor/de/gloss/handle_sense_modifier/114",
)
modifier = clean_node(wxr, None, list_item_node.children)
modifier = clean_node(wxr, None, list_item_node.children).removesuffix(":")
if modifier != "":
sense.raw_tags = [modifier]


def process_K_template(
wxr: WiktextractContext,
sense_data: Sense,
list_item_node: NodeKind.LIST_ITEM,
list_item_node: WikiNode,
) -> None:
for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "K":
categories = {"categories": []}
text = clean_node(wxr, categories, template_node).removesuffix(":")
sense_data.categories.extend(categories["categories"])
tags = re.split(r";|,", text)
sense_data.raw_tags.extend([t.strip() for t in tags])
sense_data.raw_tags.extend(
[t.strip() for t in tags if len(t.strip()) > 0]
)

# Prepositional and case information is sometimes only expanded to
# category links and not present in cleaned node. We still want it
Expand Down
5 changes: 1 addition & 4 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,6 @@ class Sense(BaseModelWrap):
default=[],
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
)
raw_glosses: list[str] = Field(
default=[],
description="list of uncleaned raw glosses for the word sense (usually only one).",
)
raw_tags: list[str] = Field(
default=[],
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
Expand Down Expand Up @@ -144,6 +140,7 @@ class WordEntry(BaseModelWrap):

word: str = Field(description="word string")
pos: str = Field(default="", description="Part of speech type")
other_pos: list[str] = []
# pos_title: str = Field(default=None, description="Original POS title")
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
Expand Down
116 changes: 47 additions & 69 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import logging
from typing import Union
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.models import WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .example import extract_examples
from .gloss import extract_glosses
from .linkage import extract_linkages
from .models import WordEntry
from .pronunciation import extract_pronunciation
from .section_titles import LINKAGE_TITLES, POS_SECTIONS
from .translation import extract_translation
Expand All @@ -31,65 +32,36 @@ def parse_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node_or_children: Union[WikiNode, list[Union[WikiNode, str]]],
level_node: WikiNode,
) -> None:
# Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
if isinstance(level_node_or_children, list):
for x in level_node_or_children:
parse_section(wxr, page_data, base_data, x)
return

elif not isinstance(level_node_or_children, WikiNode):
if (
not isinstance(level_node_or_children, str)
or not level_node_or_children.strip() == ""
):
wxr.wtp.debug(
f"Unexpected node type in parse_section: {level_node_or_children}",
sortid="extractor/de/page/parse_section/31",
)
return

# Level 3 headings are used to start POS sections like
# === {{Wortart|Verb|Deutsch}} ===
elif level_node_or_children.kind == NodeKind.LEVEL3:
for template_node in level_node_or_children.find_content(
NodeKind.TEMPLATE
):
# German Wiktionary uses a `Wortart` template to define the POS
if template_node.template_name == "Wortart":
process_pos_section(
wxr,
page_data,
base_data,
level_node_or_children,
template_node,
)
return

if level_node.kind == NodeKind.LEVEL3:
process_pos_section(wxr, page_data, base_data, level_node)
# Level 4 headings were introduced by overriding the default templates.
# See overrides/de.json for details.
elif level_node_or_children.kind == NodeKind.LEVEL4:
section_name = level_node_or_children.largs[0][0]
elif level_node.kind == NodeKind.LEVEL4:
section_name = clean_node(wxr, None, level_node.largs)
wxr.wtp.start_subsection(section_name)
if not len(page_data) > 0:
wxr.wtp.debug(
f"Reached section without extracting some page data first: {level_node_or_children}",
f"Reached section without extracting some page data first: {level_node}",
sortid="extractor/de/page/parse_section/55",
)
return
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data[-1], level_node_or_children)
extract_glosses(wxr, page_data[-1], level_node)
elif wxr.config.capture_pronunciation and section_name == "Aussprache":
extract_pronunciation(wxr, page_data[-1], level_node_or_children)
extract_pronunciation(wxr, page_data[-1], level_node)
elif wxr.config.capture_examples and section_name == "Beispiele":
extract_examples(wxr, page_data[-1], level_node_or_children)
extract_examples(wxr, page_data[-1], level_node)
elif (
wxr.config.capture_translations and section_name == "Übersetzungen"
):
extract_translation(wxr, page_data[-1], level_node_or_children)
extract_translation(wxr, page_data[-1], level_node)
elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
extract_linkages(wxr, page_data[-1], level_node_or_children)
extract_linkages(wxr, page_data[-1], level_node)


FORM_POS = {
Expand All @@ -116,29 +88,36 @@ def process_pos_section(
page_data: list[WordEntry],
base_data: WordEntry,
level_node: LevelNode,
pos_template_node: WikiNode,
) -> None:
# Extract the POS
pos_argument = pos_template_node.template_parameters.get(1)
if pos_argument in IGNORE_POS:
return
if pos_argument in FORM_POS:
# XXX: Extract form from form pages. Investigate first if this is needed
# at all or redundant with form tables.
pos_arguments = []
for template_node in level_node.find_content(NodeKind.TEMPLATE):
if template_node.template_name == "Wortart":
pos_argument = template_node.template_parameters.get(1, "")
if pos_argument in IGNORE_POS:
continue
if pos_argument in FORM_POS:
# XXX: Extract form from form pages. Investigate first if this is needed
# at all or redundant with form tables.
continue
if pos_argument in POS_SECTIONS:
pos_arguments.append(pos_argument)
else:
wxr.wtp.debug(
f"Unknown Wortart template POS argument: {pos_argument}",
sortid="extractor/de/page/process_pos_section/55",
)
if len(pos_arguments) == 0:
return

pos = ""
if pos_argument in POS_SECTIONS:
for pos_index, pos_argument in enumerate(pos_arguments):
pos = POS_SECTIONS[pos_argument]["pos"]
else:
wxr.wtp.debug(
f"Unknown POS type: {pos_argument}",
sortid="extractor/de/page/process_pos_section/55",
)
base_data.pos = pos
pos_tags = POS_SECTIONS[pos_argument].get("tags", [])
base_data.tags.extend(pos_tags)
if pos_index == 0:
base_data.pos = pos
else:
base_data.other_pos.append(pos)
page_data.append(base_data.model_copy(deep=True))

wxr.wtp.start_section(page_data[-1].lang_code + "_" + pos)
wxr.wtp.start_subsection(clean_node(wxr, None, level_node.largs))

# There might be other templates in the level node that define grammatical
# features other than the POS. Extract them here.
Expand Down Expand Up @@ -242,12 +221,11 @@ def process_pos_section(
f"Unexpected node in pos section: {non_l4_node}",
sortid="extractor/de/page/process_pos_section/41",
)
return


def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, any]]:
) -> list[dict[str, Any]]:
if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")

Expand All @@ -270,22 +248,22 @@ def parse_page(
# where <title> is the title of the page and <lang> is the
# German name of the language of the section.
if subtitle_template.template_name == "Sprache":
lang = subtitle_template.template_parameters.get(1)
lang_code = name_to_code(lang, "de")
lang_name = subtitle_template.template_parameters.get(1, "")
lang_code = name_to_code(lang_name, "de")
if lang_code == "":
wxr.wtp.warning(
f"Unknown language: {lang}",
f"Unknown language: {lang_name}",
sortid="extractor/de/page/parse_page/76",
)
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue

base_data = WordEntry(
lang=lang, lang_code=lang_code, word=wxr.wtp.title
lang=lang_name, lang_code=lang_code, word=page_title
)
parse_section(wxr, page_data, base_data, level2_node.children)
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)

return [d.model_dump(exclude_defaults=True) for d in page_data]
16 changes: 8 additions & 8 deletions src/wiktextract/extractor/de/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

# argument of title template https://de.wiktionary.org/wiki/Vorlage:Wortart
POS_SECTIONS: POSSubtitleData = {
"Abkürzung (Deutsch)": {"pos": "abbrev"},
"Abkürzung": {"pos": "abbrev"},
"Abkürzung (Deutsch)": {"pos": "abbrev", "tags": ["abbreviation"]},
"Abkürzung": {"pos": "abbrev", "tags": ["abbreviation"]},
"Abtönungspartikel": {"pos": "particle"},
"Adjektiv ": {"pos": "adj"},
"Adjektiv": {"pos": "adj"},
Expand All @@ -17,7 +17,7 @@
"Demonstrativpronomen": {"pos": "pron"},
"Eigenname ": {"pos": "name"},
"Eigenname": {"pos": "name"},
"Enklitikon": {"pos": "suffix"},
"Enklitikon": {"pos": "suffix", "tags": ["morpheme"]},
"Fokuspartikel": {"pos": "particle"},
"Formel": {"pos": "phrase"},
"Gebundenes Lexem": {"pos": "lexeme"},
Expand Down Expand Up @@ -59,8 +59,8 @@
"Possessivpronomen ": {"pos": "pron"},
"Possessivpronomen": {"pos": "pron"},
"Postposition": {"pos": "postp"},
"Präfix": {"pos": "prefix"},
"Präfixoid": {"pos": "prefix"},
"Präfix": {"pos": "prefix", "tags": ["morpheme"]},
"Präfixoid": {"pos": "prefix", "tags": ["morpheme"]},
"Präposition ": {"pos": "prep"},
"Präposition": {"pos": "prep"},
"Pronomen": {"pos": "pron"},
Expand All @@ -75,8 +75,8 @@
"Straßenname": {"pos": "name"},
"Subjunktion": {"pos": "conj"},
"Substantiv": {"pos": "noun"},
"Suffix": {"pos": "suffix"},
"Suffixoid": {"pos": "suffix"},
"Suffix": {"pos": "suffix", "tags": ["morpheme"]},
"Suffixoid": {"pos": "suffix", "tags": ["morpheme"]},
"Symbol": {"pos": "symbol"},
"Temporaladverb": {"pos": "adv"},
"Temporaldverb": {"pos": "adv"},
Expand All @@ -89,7 +89,7 @@
"Wortverbindung": {"pos": "phrase"},
"Zahlklassifikator": {"pos": "noun"},
"Zahlzeichen": {"pos": "num"},
"Zirkumfix": {"pos": "circumfix"},
"Zirkumfix": {"pos": "circumfix", "tags": ["morpheme"]},
"Zirkumposition": {"pos": "circumpos"},
}

Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def extract_translation(
)
else:
sense_translations = []
sense_id = level_node_child.template_parameters.get(1, "")
sense_id = str(level_node_child.template_parameters.get(1, ""))
base_translation_data = Translation(sense_id=sense_id)
if sense_id == "":
# XXX: Sense-disambiguate where senseids are in Ü-Liste (ca. 0.03% of pages), e.g.:
Expand Down
Loading

0 comments on commit 39ac701

Please sign in to comment.