Skip to content

Commit

Permalink
Merge pull request #342 from empiriker/de
Browse files Browse the repository at this point in the history
Add German Wiktionary extractor code to parse page and extract glosses
  • Loading branch information
kristian-clausal authored Oct 5, 2023
2 parents 3a5822b + e3af868 commit ca9e913
Show file tree
Hide file tree
Showing 10 changed files with 1,496 additions and 0 deletions.
46 changes: 46 additions & 0 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import unittest
from collections import defaultdict

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.gloss import extract_glosses
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class TestGlossList(unittest.TestCase):
    """Tests for extracting numbered gloss lists from German Wiktionary
    wikitext."""

    def setUp(self) -> None:
        config = WiktionaryConfig(dump_file_lang_code="de")
        self.wxr = WiktextractContext(Wtp(lang_code="de"), config)

    def tearDown(self) -> None:
        # Close the parser database first, then the thesaurus database.
        self.wxr.wtp.close_db_conn()
        close_thesaurus_db(
            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
        )

    def test_de_extract_glosses(self):
        self.wxr.wtp.start_page("")
        parsed = self.wxr.wtp.parse(":[1] gloss1 \n:[2] gloss2")

        page_data = [defaultdict(list)]
        extract_glosses(self.wxr, page_data, parsed.children[0])

        expected = [
            {
                "senses": [
                    {"glosses": ["gloss1"]},
                    {"glosses": ["gloss2"]},
                ]
            }
        ]
        self.assertEqual(page_data, expected)
213 changes: 213 additions & 0 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# Tests for parsing a page from the German Wiktionary

import unittest

from collections import defaultdict

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.page import (
parse_page,
parse_section,
fix_level_hierarchy_of_subsections,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class DePageTests(unittest.TestCase):
    """Tests for parsing a whole page from the German Wiktionary."""

    def setUp(self) -> None:
        # Only the dump-file language code matters for these tests; every
        # capture_* option keeps its default value.
        self.wxr = WiktextractContext(
            Wtp(lang_code="de"),
            WiktionaryConfig(dump_file_lang_code="de"),
        )

    def tearDown(self) -> None:
        # Close the parser database first, then the thesaurus database.
        self.wxr.wtp.close_db_conn()
        close_thesaurus_db(
            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
        )

    def test_de_parse_page(self):
        # A minimal page: one language heading produces one word entry.
        self.wxr.wtp.add_page("Vorlage:Sprache", 10, "")
        lst = parse_page(
            self.wxr,
            "Beispiel",
            """
== Beispiel ({{Sprache|Deutsch}}) ==
""",
        )
        self.assertEqual(
            lst,
            [
                {
                    "lang": "Deutsch",
                    "lang_code": "de",
                    "word": "Beispiel",
                }
            ],
        )

    def test_de_parse_page_skipping_head_templates(self):
        # Templates appearing before the first language heading must be
        # ignored and must not affect the extracted entry.
        self.wxr.wtp.add_page("Vorlage:Wort der Woche", 10, "")
        self.wxr.wtp.add_page("Vorlage:Siehe auch", 10, "")
        self.wxr.wtp.add_page("Vorlage:Sprache", 10, "")
        lst = parse_page(
            self.wxr,
            "Beispiel",
            """
{{Wort der Woche|46|2020}}
{{Siehe auch|[[cát]]}}
== Beispiel ({{Sprache|Deutsch}}) ==
""",
        )
        self.assertEqual(
            lst,
            [
                {
                    "lang": "Deutsch",
                    "lang_code": "de",
                    "word": "Beispiel",
                }
            ],
        )

    # The way append_base_data() works requires the presence of a sense
    # dictionary before starting a new pos section. Therefore, we need to add
    # at least one sense data point to the test case.
    def test_de_parse_section(self):
        # A heading with two {{Wortart}} templates (adjective + adverb) must
        # yield two page-data entries sharing the same senses; each further
        # POS heading starts a fresh entry.
        self.wxr.wtp.add_page("Vorlage:Wortart", 10, "")
        self.wxr.wtp.add_page("Vorlage:Bedeutungen", 10, "")
        page_text = """
=== {{Wortart|Adjektiv|Englisch}}, {{Wortart|Adverb|Englisch}} ===
{{Bedeutungen}}
:[1] gloss1
=== {{Wortart|Verb|Englisch}} ===
{{Bedeutungen}}
:[1] gloss2
=== {{Wortart|Substantiv|Englisch}} ===
{{Bedeutungen}}
:[1] gloss3
"""
        self.wxr.wtp.start_page("")
        root = self.wxr.wtp.parse(
            page_text,
            pre_expand=True,
        )

        base_data = defaultdict(list, {"lang_code": "de"})
        page_data = [defaultdict(list, {"lang_code": "de"})]
        parse_section(self.wxr, page_data, base_data, root.children)

        self.assertEqual(
            page_data,
            [
                {
                    "lang_code": "de",
                    "pos": "adj",
                    "senses": [
                        {
                            "glosses": ["gloss1"],
                        },
                    ],
                },
                {
                    "lang_code": "de",
                    "pos": "adv",
                    "senses": [
                        {
                            "glosses": ["gloss1"],
                        },
                    ],
                },
                {
                    "lang_code": "de",
                    "pos": "verb",
                    "senses": [
                        {
                            "glosses": ["gloss2"],
                        },
                    ],
                },
                {
                    "lang_code": "de",
                    "pos": "noun",
                    "senses": [
                        {
                            "glosses": ["gloss3"],
                        },
                    ],
                },
            ],
        )

    def test_de_fix_level_hierarchy_of_subsections(self):
        # Templates that mark subsections but are not wrapped in a heading
        # must be promoted to level-4 headings, matching the explicitly
        # written target wikitext below.
        self.wxr.wtp.add_page("Vorlage:Englisch Substantiv Übersicht", 10, "")
        self.wxr.wtp.add_page("Vorlage:Worttrennung", 10, "")
        self.wxr.wtp.add_page("Vorlage:Aussprache", 10, "")
        self.wxr.wtp.add_page("Vorlage:Übersetzungen", 10, "")
        self.wxr.wtp.add_page("Vorlage:Ü-Tabelle", 10, "")
        self.wxr.wtp.add_page("Vorlage:Referenzen", 10, "")

        page_text = """
{{Englisch Substantiv Übersicht
|args=args}}
{{Worttrennung}}
:item
{{Aussprache}}
:item
==== {{Übersetzungen}} ====
{{Ü-Tabelle|1|G=arg|Ü-Liste=
:item
}}
{{Referenzen}}
:item
"""
        self.wxr.wtp.start_page("")
        root = self.wxr.wtp.parse(
            page_text,
            pre_expand=True,
        )

        subsections = fix_level_hierarchy_of_subsections(
            self.wxr, root.children
        )

        target_page_text = """==== {{Englisch Substantiv Übersicht\n|args=args}} ====
==== {{Worttrennung}} ====
:item
==== {{Aussprache}} ====
:item
==== {{Übersetzungen}} ====
{{Ü-Tabelle|1|G=arg|Ü-Liste=
:item
}}
==== {{Referenzen}} ====
:item
"""
        root = self.wxr.wtp.parse(
            target_page_text,
            pre_expand=True,
        )

        # Compare string renderings; WikiNode instances themselves do not
        # compare structurally here.
        self.assertEqual(
            [str(s) for s in subsections],
            [str(t) for t in root.children],
        )
72 changes: 72 additions & 0 deletions usertools/de_language_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Export German Wiktionary language data to JSON.
#
# Usage:
#
# python de_language_data.py de dewiktionary_dump_file [--languages languages_output_file]

import argparse
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.wxr_context import WiktextractContext
from wiktextract.page import clean_node
from wikitextprocessor.dumpparser import process_dump
from wikitextprocessor import NodeKind, WikiNode

import json


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Export Wiktionary language data to JSON"
    )
    parser.add_argument("lang_code", type=str, help="Dump file language code")
    parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
    parser.add_argument(
        "--languages",
        type=str,
        default="languages.json",
        help="Language data output file path",
    )
    args = parser.parse_args()

    # Build the context exactly once.  (The original code first created a
    # throwaway WiktextractContext — and with it a Wtp database connection —
    # only to overwrite it immediately, leaking that connection.)
    wxr = WiktextractContext(
        Wtp(
            lang_code=args.lang_code, db_path="wikt-db_de_language_data_temp.db"
        ),
        WiktionaryConfig(),
    )
    # Only the Help and Template namespaces are needed from the dump.
    help_ns_id = wxr.wtp.NAMESPACE_DATA["Help"]["id"]
    template_ns_id = wxr.wtp.NAMESPACE_DATA["Template"]["id"]
    process_dump(wxr.wtp, args.dump, {help_ns_id, template_ns_id})

    # The page 'Hilfe:Sprachkürzel' seems to be the only central collection of
    # language codes and their German expansions. We will use this until we
    # find perhaps a more authoritative source.
    sprachkuerzel = wxr.wtp.get_page("Hilfe:Sprachkürzel")

    wxr.config.word = sprachkuerzel.title
    wxr.wtp.start_page(sprachkuerzel.title)
    tree = wxr.wtp.parse(
        sprachkuerzel.body,
        pre_expand=True,
    )

    languages = {}
    # Language entries are grouped under level-3 headings on the help page.
    for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
        if node.kind != NodeKind.LEVEL3:
            continue

        for table_row in node.find_child_recursively(NodeKind.TABLE_ROW):
            # The third table cell is expected to hold a {{<lang_code>}}
            # template whose expansion is the German language name; rows
            # whose third cell is plain text or another node kind are
            # skipped.
            third_row_content = table_row.children[2].children[0]
            if (
                isinstance(third_row_content, str)
                or third_row_content.kind != NodeKind.TEMPLATE
            ):
                continue
            lang_code = third_row_content.template_name

            # Stored as a one-element list to match the shape used by the
            # other language-data JSON files.
            languages[lang_code] = [clean_node(wxr, None, third_row_content)]

    with open(args.languages, "w", encoding="utf-8") as fout:
        json.dump(languages, fout, indent=2, ensure_ascii=False, sort_keys=True)
6 changes: 6 additions & 0 deletions wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ class WiktionaryConfig:
"POS_TYPES",
"OTHER_SUBTITLES",
"ZH_PRON_TAGS",
"FR_FORM_TABLES",
"DE_FORM_TABLES",
"LANGUAGES_BY_NAME",
"LANGUAGES_BY_CODE",
"FORM_OF_TEMPLATES",
Expand Down Expand Up @@ -123,6 +125,10 @@ def __init__(
self.set_attr_from_json(
"FORM_OF_TEMPLATES", "form_of_templates.json"
)
if dump_file_lang_code == "fr":
self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json")
if dump_file_lang_code == "de":
self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json")

def to_kwargs(self):
return {
Expand Down
Loading

0 comments on commit ca9e913

Please sign in to comment.