-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #342 from empiriker/de
Add German Wiktionary extractor code to parse page and extract glosses
- Loading branch information
Showing
10 changed files
with
1,496 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import unittest | ||
from collections import defaultdict | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.de.gloss import extract_glosses | ||
from wiktextract.thesaurus import close_thesaurus_db | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestGlossList(unittest.TestCase):
    """Check gloss extraction in the German Wiktionary extractor."""

    def setUp(self) -> None:
        # Build a fresh context configured for the "de" edition per test.
        wtp = Wtp(lang_code="de")
        config = WiktionaryConfig(dump_file_lang_code="de")
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        # Release the page database first, then the thesaurus database.
        wxr = self.wxr
        wxr.wtp.close_db_conn()
        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)

    def test_de_extract_glosses(self):
        self.wxr.wtp.start_page("")
        parsed = self.wxr.wtp.parse(":[1] gloss1 \n:[2] gloss2")
        page_data = [defaultdict(list)]

        # The first child of the parsed root is handed to the extractor.
        extract_glosses(self.wxr, page_data, parsed.children[0])

        expected = [
            {
                "senses": [
                    {"glosses": ["gloss1"]},
                    {"glosses": ["gloss2"]},
                ]
            }
        ]
        self.assertEqual(page_data, expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
# Tests for parsing a page from the German Wiktionary | ||
|
||
import unittest | ||
|
||
from collections import defaultdict | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.de.page import ( | ||
parse_page, | ||
parse_section, | ||
fix_level_hierarchy_of_subsections, | ||
) | ||
from wiktextract.thesaurus import close_thesaurus_db | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class DePageTests(unittest.TestCase):
    """Tests for parsing a page from the German Wiktionary."""

    def setUp(self):
        # Only the dump language is configured; none of the capture_*
        # options (translations, pronunciation, linkages, ...) is overridden.
        config = WiktionaryConfig(dump_file_lang_code="de")
        self.wxr = WiktextractContext(Wtp(lang_code="de"), config)

    def tearDown(self) -> None:
        # Release the page database first, then the thesaurus database.
        wxr = self.wxr
        wxr.wtp.close_db_conn()
        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)

    def _add_empty_pages(self, *titles):
        # Register each title, in order, via add_page(title, 10, "").
        for title in titles:
            self.wxr.wtp.add_page(title, 10, "")

    def test_de_parse_page(self):
        self._add_empty_pages("Vorlage:Sprache")
        wikitext = """
== Beispiel ({{Sprache|Deutsch}}) ==
"""
        parsed_entries = parse_page(self.wxr, "Beispiel", wikitext)

        expected = [
            {"lang": "Deutsch", "lang_code": "de", "word": "Beispiel"},
        ]
        self.assertEqual(parsed_entries, expected)

    def test_de_parse_page_skipping_head_templates(self):
        self._add_empty_pages(
            "Vorlage:Wort der Woche",
            "Vorlage:Siehe auch",
            "Vorlage:Sprache",
        )
        wikitext = """
{{Wort der Woche|46|2020}}
{{Siehe auch|[[cát]]}}
== Beispiel ({{Sprache|Deutsch}}) ==
"""
        parsed_entries = parse_page(self.wxr, "Beispiel", wikitext)

        expected = [
            {"lang": "Deutsch", "lang_code": "de", "word": "Beispiel"},
        ]
        self.assertEqual(parsed_entries, expected)

    # The way append_base_data() works requires the presence of a sense
    # dictionary before starting a new pos section. Therefore, every pos
    # section in this test case carries at least one sense data point.
    def test_de_parse_section(self):
        self._add_empty_pages("Vorlage:Wortart", "Vorlage:Bedeutungen")
        page_text = """
=== {{Wortart|Adjektiv|Englisch}}, {{Wortart|Adverb|Englisch}} ===
{{Bedeutungen}}
:[1] gloss1
=== {{Wortart|Verb|Englisch}} ===
{{Bedeutungen}}
:[1] gloss2
=== {{Wortart|Substantiv|Englisch}} ===
{{Bedeutungen}}
:[1] gloss3
"""
        self.wxr.wtp.start_page("")
        parsed = self.wxr.wtp.parse(page_text, pre_expand=True)

        base_data = defaultdict(list, {"lang_code": "de"})
        page_data = [defaultdict(list, {"lang_code": "de"})]
        parse_section(self.wxr, page_data, base_data, parsed.children)

        # One entry per part of speech; the combined adjective/adverb
        # heading is expected to yield two entries sharing "gloss1".
        pos_and_gloss = (
            ("adj", "gloss1"),
            ("adv", "gloss1"),
            ("verb", "gloss2"),
            ("noun", "gloss3"),
        )
        expected = [
            {
                "lang_code": "de",
                "pos": pos,
                "senses": [{"glosses": [gloss]}],
            }
            for pos, gloss in pos_and_gloss
        ]
        self.assertEqual(page_data, expected)

    def test_de_fix_level_hierarchy_of_subsections(self):
        self._add_empty_pages(
            "Vorlage:Englisch Substantiv Übersicht",
            "Vorlage:Worttrennung",
            "Vorlage:Aussprache",
            "Vorlage:Übersetzungen",
            "Vorlage:Ü-Tabelle",
            "Vorlage:Referenzen",
        )

        page_text = """
{{Englisch Substantiv Übersicht
|args=args}}
{{Worttrennung}}
:item
{{Aussprache}}
:item
==== {{Übersetzungen}} ====
{{Ü-Tabelle|1|G=arg|Ü-Liste=
:item
}}
{{Referenzen}}
:item
"""
        self.wxr.wtp.start_page("")
        parsed = self.wxr.wtp.parse(page_text, pre_expand=True)
        subsections = fix_level_hierarchy_of_subsections(
            self.wxr, parsed.children
        )

        # Same content, but with every section template already wrapped in
        # a level-4 heading; parsing it gives the reference node list.
        target_page_text = """==== {{Englisch Substantiv Übersicht\n|args=args}} ====
==== {{Worttrennung}} ====
:item
==== {{Aussprache}} ====
:item
==== {{Übersetzungen}} ====
{{Ü-Tabelle|1|G=arg|Ü-Liste=
:item
}}
==== {{Referenzen}} ====
:item
"""
        expected_root = self.wxr.wtp.parse(target_page_text, pre_expand=True)

        actual_repr = [str(node) for node in subsections]
        expected_repr = [str(node) for node in expected_root.children]
        self.assertEqual(actual_repr, expected_repr)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# Export German Wiktionary language data to JSON. | ||
# | ||
# Usage: | ||
# | ||
# python language_data.py de dewiktionary_dump_file [--languages languages_output_file] | ||
|
||
import argparse | ||
from wikitextprocessor import Wtp | ||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.wxr_context import WiktextractContext | ||
from wiktextract.page import clean_node | ||
from wikitextprocessor.dumpparser import process_dump | ||
from wikitextprocessor import NodeKind, WikiNode | ||
|
||
import json | ||
|
||
|
||
def main() -> None:
    """Export language codes found on 'Hilfe:Sprachkürzel' to a JSON file.

    Command-line arguments:
        lang_code: dump file language code, passed to ``Wtp``.
        dump: path of the Wiktionary XML dump file.
        --languages: output file path (default ``languages.json``).

    The output maps each template name found in the third cell of a table
    row to a one-element list holding the cleaned text of that template
    (presumably the German language name; verify against the page).

    Raises:
        SystemExit: if the page 'Hilfe:Sprachkürzel' is not in the dump.
    """
    parser = argparse.ArgumentParser(
        description="Export Wiktionary language data to JSON"
    )
    parser.add_argument("lang_code", type=str, help="Dump file language code")
    parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
    parser.add_argument(
        "--languages",
        type=str,
        default="languages.json",
        help="Language data output file path",
    )
    args = parser.parse_args()

    # Build the context exactly once; a second, throwaway context would
    # only open an additional database that is never used or closed.
    wxr = WiktextractContext(
        Wtp(
            lang_code=args.lang_code, db_path="wikt-db_de_language_data_temp.db"
        ),
        WiktionaryConfig(),
    )
    try:
        help_ns_id = wxr.wtp.NAMESPACE_DATA["Help"]["id"]
        template_ns_id = wxr.wtp.NAMESPACE_DATA["Template"]["id"]
        process_dump(wxr.wtp, args.dump, {help_ns_id, template_ns_id})

        # The page 'Hilfe:Sprachkürzel' seems to be the only central
        # collection of language codes and their German expansions. We will
        # use this until we find perhaps a more authoritative source.
        sprachkuerzel = wxr.wtp.get_page("Hilfe:Sprachkürzel")
        if sprachkuerzel is None:
            raise SystemExit(
                f"Page 'Hilfe:Sprachkürzel' not found in dump {args.dump}"
            )

        wxr.config.word = sprachkuerzel.title
        wxr.wtp.start_page(sprachkuerzel.title)
        tree = wxr.wtp.parse(
            sprachkuerzel.body,
            pre_expand=True,
        )

        languages = {}
        for node in tree.children:
            # Only level-3 sections are scanned for language tables.
            if not isinstance(node, WikiNode) or node.kind != NodeKind.LEVEL3:
                continue

            for table_row in node.find_child_recursively(NodeKind.TABLE_ROW):
                # Skip rows that have no usable third cell instead of
                # crashing on short rows or empty cells.
                if len(table_row.children) < 3:
                    continue
                third_cell = table_row.children[2]
                if not isinstance(third_cell, WikiNode):
                    continue
                if not third_cell.children:
                    continue

                third_cell_content = third_cell.children[0]
                if (
                    isinstance(third_cell_content, str)
                    or third_cell_content.kind != NodeKind.TEMPLATE
                ):
                    continue

                # The template name serves as the language code key.
                lang_code = third_cell_content.template_name
                languages[lang_code] = [
                    clean_node(wxr, None, third_cell_content)
                ]

        with open(args.languages, "w", encoding="utf-8") as fout:
            json.dump(
                languages, fout, indent=2, ensure_ascii=False, sort_keys=True
            )
    finally:
        # Always release the database connection opened by Wtp.
        wxr.wtp.close_db_conn()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.