Commit

Merge pull request #564 from xxyzz/de
Simplify de edition's `page.py` and `gloss.py` files
xxyzz authored Mar 28, 2024
2 parents 39ac701 + e75666c commit e15ea73
Showing 5 changed files with 154 additions and 327 deletions.
154 changes: 58 additions & 96 deletions src/wiktextract/extractor/de/gloss.py
@@ -1,23 +1,22 @@
import copy
import re

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wikitextprocessor.parser import LevelNode, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Sense, WordEntry
from .utils import find_and_remove_child, match_senseid
from .utils import match_senseid


def extract_glosses(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: LevelNode,
) -> None:
base_sense = Sense()
sense = Sense()
for list_node in level_node.find_child(NodeKind.LIST):
process_gloss_list_item(wxr, word_entry, base_sense, list_node)
sense = process_gloss_list_item(wxr, word_entry, list_node, sense)

for non_list_node in level_node.invert_find_child(NodeKind.LIST):
wxr.wtp.debug(
@@ -29,67 +28,81 @@ def extract_glosses(
def process_gloss_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
base_sense: Sense,
list_node: WikiNode,
parent_senseid: str = "",
parent_gloss_data: Sense = None,
parent_sense: Sense,
) -> None:
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
item_type = list_item_node.sarg
if item_type == "*":
handle_sense_modifier(wxr, base_sense, list_item_node)
if item_type == "*": # only contains modifier template
for template in list_item_node.find_child(NodeKind.TEMPLATE):
raw_tag = clean_node(wxr, parent_sense, template).removesuffix(
":"
)
parent_sense = Sense()
parent_sense.raw_tags.append(raw_tag)
elif item_type.endswith(":"):
if any(
[
template_node.template_name
in ["QS Herkunft", "QS Bedeutungen"]
for template_node in list_item_node.find_child_recursively(
NodeKind.TEMPLATE
sense_data = parent_sense.model_copy(deep=True)
gloss_nodes = []
for gloss_node in list_item_node.children:
if isinstance(gloss_node, TemplateNode):
if gloss_node.template_name == "K":
for (
k_arg,
k_arg_value,
) in gloss_node.template_parameters.items():
if k_arg == "ft":
gloss_nodes.append(
clean_node(wxr, None, k_arg_value)
)
gloss_nodes.append(":")
elif isinstance(k_arg, int):
raw_tag = clean_node(wxr, None, k_arg_value)
sense_data.raw_tags.append(raw_tag)
clean_node(wxr, sense_data, gloss_node)
elif gloss_node.template_name in (
"QS Herkunft",
"QS Bedeutungen",
):
continue
elif (
isinstance(gloss_node, WikiNode)
and gloss_node.kind == NodeKind.ITALIC
):
raw_tag = clean_node(wxr, None, gloss_node).removesuffix(
":"
)
]
):
continue

sense_data = (
copy.deepcopy(base_sense)
if parent_gloss_data is None
else copy.deepcopy(parent_gloss_data)
)

# Extract sub-glosses for later processing
sub_glosses_list_nodes = list(
find_and_remove_child(list_item_node, NodeKind.LIST)
)

process_K_template(wxr, sense_data, list_item_node)

gloss_text = clean_node(wxr, sense_data, list_item_node.children)

sense_data.raw_tags.append(raw_tag)
elif not (
isinstance(gloss_node, WikiNode)
and gloss_node.kind == NodeKind.LIST
):
gloss_nodes.append(gloss_node)

gloss_text = clean_node(wxr, sense_data, gloss_nodes)
senseid, gloss_text = match_senseid(gloss_text)
if senseid != "":
if not senseid[0].isnumeric():
senseid = parent_senseid + senseid
if (
not senseid[0].isnumeric()
and parent_sense is not None
and len(parent_sense.senseid) != ""
):
senseid = parent_sense.senseid + senseid
sense_data.senseid = senseid
elif len(gloss_text.strip()) > 0:
wxr.wtp.debug(
f"Failed to extract sense number from gloss node: {list_item_node}",
"Failed to extract sense number from gloss node",
sortid="extractor/de/glosses/extract_glosses/28",
)

# XXX: Extract tags from nodes instead using Italic and Template
gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text)

if len(gloss_text) > 0:
sense_data.glosses.append(gloss_text)
word_entry.senses.append(sense_data)

for sub_list_node in sub_glosses_list_nodes:
for sub_list_node in list_item_node.find_child(NodeKind.LIST):
process_gloss_list_item(
wxr,
word_entry,
base_sense,
sub_list_node,
senseid,
sense_data,
)

@@ -99,58 +112,7 @@ def process_gloss_list_item(
sortid="extractor/de/glosses/extract_glosses/29",
)
continue


def handle_sense_modifier(
wxr: WiktextractContext, sense: Sense, list_item_node: WikiNode
):
if len(list(list_item_node.filter_empty_str_child())) > 1:
# XXX: Clean up sense modifier where there is more than one modifier
wxr.wtp.debug(
f"Found more than one child in sense modifier: {list_item_node.children}",
sortid="extractor/de/gloss/handle_sense_modifier/114",
)
modifier = clean_node(wxr, None, list_item_node.children).removesuffix(":")
if modifier != "":
sense.raw_tags = [modifier]


def process_K_template(
wxr: WiktextractContext,
sense_data: Sense,
list_item_node: WikiNode,
) -> None:
for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "K":
categories = {"categories": []}
text = clean_node(wxr, categories, template_node).removesuffix(":")
sense_data.categories.extend(categories["categories"])
tags = re.split(r";|,", text)
sense_data.raw_tags.extend(
[t.strip() for t in tags if len(t.strip()) > 0]
)

# Prepositional and case information is sometimes only expanded to
# category links and not present in cleaned node. We still want it
# as a tag.
prep = template_node.template_parameters.get("Prä")
case = template_node.template_parameters.get("Kas")
category = (prep if prep else "") + (" + " + case if case else "")
if category:
sense_data.raw_tags.append(category)

# XXX: Investigate better ways to handle free text in K template
ft = template_node.template_parameters.get("ft")
if ft:
wxr.wtp.debug(
f"Found ft '{ft}' in K template which could be considered part of the gloss. Moved to tags for now.",
sortid="extractor/de/glosses/extract_glosses/63",
)

# Remove the template_node from the children of list_item_node
list_item_node.children = [
c for c in list_item_node.children if c != template_node
]
return parent_sense


def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None:
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
@@ -165,3 +165,4 @@ class WordEntry(BaseModelWrap):
proverbs: list[Linkage] = []
synonyms: list[Linkage] = []
tags: list[str] = []
raw_tags: list[str] = []
74 changes: 4 additions & 70 deletions src/wiktextract/extractor/de/page.py
@@ -106,6 +106,10 @@ def process_pos_section(
f"Unknown Wortart template POS argument: {pos_argument}",
sortid="extractor/de/page/process_pos_section/55",
)
elif template_node.template_name != "Geschlecht":
# ignore placeholder gender template "Geschlecht"
base_data.raw_tags.append(clean_node(wxr, base_data, template_node))

if len(pos_arguments) == 0:
return
for pos_index, pos_argument in enumerate(pos_arguments):
@@ -119,76 +123,6 @@
page_data.append(base_data.model_copy(deep=True))
wxr.wtp.start_subsection(clean_node(wxr, None, level_node.largs))

# There might be other templates in the level node that define grammatical
# features other than the POS. Extract them here.
for template_node in level_node.find_content(NodeKind.TEMPLATE):
template_name = template_node.template_name

GENDER_TAGS_TEMPLATES = {
"m",
"f",
"f ",
"n",
"n ",
"mf",
"mn.",
"fn",
"fm",
"nf",
"nm",
"mfn",
"u",
"un",
"Geschlecht", # placeholder template
}

VERB_TAGS_TEMPLATES = {
"unreg.",
"intrans.",
"trans.",
"refl.",
}

ARAB_VERB_STEM_TEMPLATES = {
"Grundstamm",
"I",
"II",
"III",
"IV",
"V",
"VI",
"VII",
"VIII",
}

NOUN_TAGS_TEMPLATES = {
"adjektivische Deklination",
"kPl.",
"Pl.",
"mPl.",
"fPl.",
"nPl.",
"Sachklasse",
"Personenklasse",
"indekl.",
"Suaheli Klassen",
}

if template_name == "Wortart":
continue

elif template_name in GENDER_TAGS_TEMPLATES.union(
ARAB_VERB_STEM_TEMPLATES
).union(NOUN_TAGS_TEMPLATES).union(VERB_TAGS_TEMPLATES):
# XXX: de: Extract additional grammatical markers
pass

else:
wxr.wtp.debug(
f"Unexpected template in POS section heading: {template_node}",
sortid="extractor/de/page/process_pos_section/31",
)

for level_4_node in level_node.find_child(NodeKind.LEVEL4):
parse_section(wxr, page_data, base_data, level_4_node)

