Skip to content

Commit

Permalink
Merge pull request #370 from empiriker/master
Browse files: browse the repository at this point in the history
Add config.json for German extractor
  • Loading branch information
xxyzz authored Oct 20, 2023
2 parents a8787ef + bda937f commit 2f5edec
Show file tree
Hide file tree
Showing 10 changed files with 26 additions and 160 deletions.
2 changes: 0 additions & 2 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,6 @@ def __init__(
self.set_attr_from_json(
"FORM_OF_TEMPLATES", "form_of_templates.json"
)
if dump_file_lang_code == "de":
self.set_attr_from_json("DE_FORM_TABLES", "form_tables.json")
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
self.load_edition_settings()
Expand Down
4 changes: 4 additions & 0 deletions src/wiktextract/data/de/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
}
125 changes: 0 additions & 125 deletions src/wiktextract/data/de/form_tables.json

This file was deleted.

32 changes: 20 additions & 12 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@

from wiktextract.datautils import append_base_data
from wiktextract.extractor.de.pronunciation import extract_pronunciation
from wiktextract.extractor.de.translation import extract_translation
from wiktextract.wxr_context import WiktextractContext

from .example import extract_examples
from .gloss import extract_glosses
from .pronunciation import extract_pronunciation
from .translation import extract_translation

# Templates that are used to form panels on pages and that should be ignored in
# various positions
Expand All @@ -24,14 +25,7 @@
PANEL_PREFIXES = set()

# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = set()


# Templates that should not be pre-expanded
DO_NOT_PRE_EXPAND_TEMPLATES = {
"Ü-Tabelle", # Translation table
"Übersetzungen umleiten", # Translation table redirect
}
ADDITIONAL_EXPAND_TEMPLATES = {"NoCat"}


def parse_section(
Expand Down Expand Up @@ -210,7 +204,23 @@ def process_pos_section(
and non_l4_node.kind == NodeKind.TEMPLATE
and "Übersicht" in non_l4_node.template_name
):
# XXX: de: Extract form tables
# XXX: de: Extract form table templates
pass
elif (
isinstance(non_l4_node, WikiNode)
and non_l4_node.kind == NodeKind.TABLE
and "inflection-table" in non_l4_node.attrs.get("class")
):
# XXX: de: Extract html form table
pass
elif (
isinstance(non_l4_node, WikiNode)
and non_l4_node.kind == NodeKind.LINK
and len(non_l4_node.largs) > 0
and len(non_l4_node.largs[0]) > 0
and "Kategorie" in non_l4_node.largs[0][0]
):
# XXX Process categories
pass
else:
wxr.wtp.debug(
Expand All @@ -231,12 +241,10 @@ def parse_page(

# Parse the page, pre-expanding those templates that are likely to
# influence parsing
DO_NOT_PRE_EXPAND_TEMPLATES.update(wxr.config.DE_FORM_TABLES)
tree = wxr.wtp.parse(
page_text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
)

page_data = []
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,8 @@ def reprocess_wiktionary(
last_time = estimate_progress(
processed_pages, all_page_nums, start_time, last_time
)

emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
if wxr.config.extract_thesaurus_pages:
emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
logging.info("Reprocessing wiktionary complete")


Expand Down
4 changes: 0 additions & 4 deletions tests/test_de_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.example import extract_examples, extract_reference
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -19,9 +18,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_de_extract_examples(self):
self.wxr.wtp.start_page("")
Expand Down
3 changes: 0 additions & 3 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_de_extract_glosses(self):
self.wxr.wtp.start_page("")
Expand Down
4 changes: 0 additions & 4 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.page import parse_page, parse_section
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -28,9 +27,6 @@ def setUp(self):

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_de_parse_page(self):
self.wxr.wtp.add_page("Vorlage:Sprache", 10, "")
Expand Down
4 changes: 0 additions & 4 deletions tests/test_de_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
process_hoerbeispiele,
process_ipa,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -22,9 +21,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_de_process_ipa(self):
test_cases = [
Expand Down
4 changes: 0 additions & 4 deletions tests/test_de_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
extract_translation,
process_translation_list,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -28,9 +27,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_de_extract_translation(self):
test_cases = [
Expand Down

0 comments on commit 2f5edec

Please sign in to comment.