Skip to content

Commit

Permalink
Append nested gloss to parent gloss text in de edition code
Browse files Browse the repository at this point in the history
- Don't add empty gloss text
- Use real wikitext from Wiktionary pages in tests
- Remove `Sense.raw_glosses`; this field is not very useful
  • Loading branch information
xxyzz committed Mar 27, 2024
1 parent 640f496 commit 35ac365
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 65 deletions.
24 changes: 12 additions & 12 deletions src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.models import Sense, WordEntry
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Sense, WordEntry
from .utils import find_and_remove_child, match_senseid


def extract_glosses(
wxr: WiktextractContext,
Expand Down Expand Up @@ -37,7 +38,7 @@ def process_gloss_list_item(
item_type = list_item_node.sarg
if item_type == "*":
handle_sense_modifier(wxr, base_sense, list_item_node)
elif item_type in [":", "::"]:
elif item_type.endswith(":"):
if any(
[
template_node.template_name
Expand All @@ -60,9 +61,6 @@ def process_gloss_list_item(
find_and_remove_child(list_item_node, NodeKind.LIST)
)

raw_gloss = clean_node(wxr, {}, list_item_node.children)
sense_data.raw_glosses = [raw_gloss]

process_K_template(wxr, sense_data, list_item_node)

gloss_text = clean_node(wxr, sense_data, list_item_node.children)
Expand All @@ -81,8 +79,8 @@ def process_gloss_list_item(
# XXX: Extract tags from nodes instead using Italic and Template
gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text)

if gloss_text or not sub_glosses_list_nodes:
sense_data.glosses = [gloss_text]
if len(gloss_text) > 0:
sense_data.glosses.append(gloss_text)
word_entry.senses.append(sense_data)

for sub_list_node in sub_glosses_list_nodes:
Expand All @@ -92,7 +90,7 @@ def process_gloss_list_item(
base_sense,
sub_list_node,
senseid,
sense_data if not gloss_text else None,
sense_data,
)

else:
Expand All @@ -112,23 +110,25 @@ def handle_sense_modifier(
f"Found more than one child in sense modifier: {list_item_node.children}",
sortid="extractor/de/gloss/handle_sense_modifier/114",
)
modifier = clean_node(wxr, None, list_item_node.children)
modifier = clean_node(wxr, None, list_item_node.children).removesuffix(":")
if modifier != "":
sense.raw_tags = [modifier]


def process_K_template(
wxr: WiktextractContext,
sense_data: Sense,
list_item_node: NodeKind.LIST_ITEM,
list_item_node: WikiNode,
) -> None:
for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "K":
categories = {"categories": []}
text = clean_node(wxr, categories, template_node).removesuffix(":")
sense_data.categories.extend(categories["categories"])
tags = re.split(r";|,", text)
sense_data.raw_tags.extend([t.strip() for t in tags])
sense_data.raw_tags.extend(
[t.strip() for t in tags if len(t.strip()) > 0]
)

# Prepositional and case information is sometimes only expanded to
# category links and not present in cleaned node. We still want it
Expand Down
4 changes: 0 additions & 4 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,6 @@ class Sense(BaseModelWrap):
default=[],
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
)
raw_glosses: list[str] = Field(
default=[],
description="list of uncleaned raw glosses for the word sense (usually only one).",
)
raw_tags: list[str] = Field(
default=[],
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
Expand Down
92 changes: 46 additions & 46 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,81 +45,81 @@ def test_de_extract_glosses(self):
[
{
"glosses": ["gloss1"],
"raw_glosses": ["[1] gloss1"],
"senseid": "1",
},
{
"glosses": ["gloss2"],
"raw_glosses": ["[2] gloss2"],
"senseid": "2",
},
],
)

def test_de_extract_glosses_with_subglosses(self):
self.wxr.wtp.start_page("")
def test_nested_gloss(self):
self.wxr.wtp.start_page("Keim")
self.wxr.wtp.add_page("Vorlage:K", 10, "{{{1|}}}, {{{2|}}}:")
root = self.wxr.wtp.parse(
":[1] gloss1\n::[a] subglossA\n::[b] subglossB"
"""===Bedeutungen===
:[2] das erste [[Entwicklungsstadium]]
::[a] {{K|Botanik}} erster [[Trieb]] einer Pflanze
::[b] {{K|Biologie|Medizin}} befruchtete [[Eizelle]], [[Embryo]]"""
)

word_entry = self.get_default_word_entry()

extract_glosses(self.wxr, word_entry, root)

senses = [
s.model_dump(exclude_defaults=True) for s in word_entry.senses
]

word_entry = WordEntry(
lang="Deutsch", lang_code="de", word="Keim", pos="noun"
)
extract_glosses(self.wxr, word_entry, root.children[0])
self.assertEqual(
senses,
[s.model_dump(exclude_defaults=True) for s in word_entry.senses],
[
{
"glosses": ["gloss1"],
"raw_glosses": ["[1] gloss1"],
"senseid": "1",
"glosses": ["das erste Entwicklungsstadium"],
"senseid": "2",
},
{
"glosses": ["subglossA"],
"raw_glosses": ["[a] subglossA"],
"senseid": "1a",
"glosses": [
"das erste Entwicklungsstadium",
"erster Trieb einer Pflanze",
],
"raw_tags": ["Botanik"],
"senseid": "2a",
},
{
"glosses": ["subglossB"],
"raw_glosses": ["[b] subglossB"],
"senseid": "1b",
"glosses": [
"das erste Entwicklungsstadium",
"befruchtete Eizelle, Embryo",
],
"raw_tags": ["Biologie", "Medizin"],
"senseid": "2b",
},
],
)

def test_de_extract_glosses_with_only_subglosses(self):
self.wxr.wtp.add_page("Vorlage:K", 10, "tag")
self.wxr.wtp.start_page("")
def test_nested_gloss_without_parent_gloss(self):
self.wxr.wtp.add_page("Vorlage:K", 10, "{{{1}}}:")
self.wxr.wtp.start_page("eingeben")
root = self.wxr.wtp.parse(
":[1] {{K|tag}}\n::[a] subglossA\n::[1b] subglossB"
"""===Bedeutungen===
*{{K|fachsprachlich}}
:[4] {{K|Technik}} etwas, was eine Maschine bearbeiten soll, an diese übergeben
:[5] {{K|EDV}} etwas in einen Computer übertragen"""
)

word_entry = self.get_default_word_entry()

extract_glosses(self.wxr, word_entry, root)

senses = [
s.model_dump(exclude_defaults=True) for s in word_entry.senses
]

word_entry = WordEntry(
lang="Deutsch", lang_code="de", word="eingeben", pos="verb"
)
extract_glosses(self.wxr, word_entry, root.children[0])
self.assertEqual(
senses,
[s.model_dump(exclude_defaults=True) for s in word_entry.senses],
[
{
"raw_tags": ["tag"],
"glosses": ["subglossA"],
"raw_glosses": ["[a] subglossA"],
"senseid": "1a",
"raw_tags": ["fachsprachlich", "Technik"],
"glosses": [
"etwas, was eine Maschine bearbeiten soll, an diese übergeben"
],
"senseid": "4",
},
{
"raw_tags": ["tag"],
"glosses": ["subglossB"],
"raw_glosses": ["[1b] subglossB"],
"senseid": "1b",
"raw_tags": ["fachsprachlich", "EDV"],
"glosses": ["etwas in einen Computer übertragen"],
"senseid": "5",
},
],
)
Expand Down
3 changes: 0 additions & 3 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,6 @@ def test_multiple_pos(self):
"glosses": [
"Staat in Südosteuropa, im Süden der Balkanhalbinsel"
],
"raw_glosses": [
"[1] Staat in Südosteuropa, im Süden der Balkanhalbinsel"
],
"senseid": "1",
}
],
Expand Down

0 comments on commit 35ac365

Please sign in to comment.