Merge pull request #370 from empiriker/master

Add config.json for German extractor
tatuylonen · Oct 20, 2023 · 2f5edec
2 parents a8787ef + bda937f
commit 2f5edec
Show file tree

Hide file tree

Showing 10 changed files with 26 additions and 160 deletions.
diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
@@ -119,8 +119,6 @@ def __init__(
             self.set_attr_from_json(
                 "FORM_OF_TEMPLATES", "form_of_templates.json"
             )
-        if dump_file_lang_code == "de":
-            self.set_attr_from_json("DE_FORM_TABLES", "form_tables.json")
         self.analyze_templates = True  # find templates that need pre-expand
         self.extract_thesaurus_pages = True
         self.load_edition_settings()

diff --git a/src/wiktextract/data/de/config.json b/src/wiktextract/data/de/config.json
@@ -0,0 +1,4 @@
+{
+  "analyze_templates": false,
+  "extract_thesaurus_pages": false
+}
diff --git a/src/wiktextract/data/de/form_tables.json b/src/wiktextract/data/de/form_tables.json
diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py
@@ -8,11 +8,12 @@
 
 from wiktextract.datautils import append_base_data
 from wiktextract.extractor.de.pronunciation import extract_pronunciation
-from wiktextract.extractor.de.translation import extract_translation
 from wiktextract.wxr_context import WiktextractContext
 
 from .example import extract_examples
 from .gloss import extract_glosses
+from .pronunciation import extract_pronunciation
+from .translation import extract_translation
 
 # Templates that are used to form panels on pages and that should be ignored in
 # various positions
@@ -24,14 +25,7 @@
 PANEL_PREFIXES = set()
 
 # Additional templates to be expanded in the pre-expand phase
-ADDITIONAL_EXPAND_TEMPLATES = set()
-
-
-# Templates that should not be pre-expanded
-DO_NOT_PRE_EXPAND_TEMPLATES = {
-    "Ü-Tabelle",  # Translation table
-    "Übersetzungen umleiten",  # Translation table redirect
-}
+ADDITIONAL_EXPAND_TEMPLATES = {"NoCat"}
 
 
 def parse_section(
@@ -210,7 +204,23 @@ def process_pos_section(
             and non_l4_node.kind == NodeKind.TEMPLATE
             and "Übersicht" in non_l4_node.template_name
         ):
-            # XXX: de: Extract form tables
+            # XXX: de: Extract form table templates
+            pass
+        elif (
+            isinstance(non_l4_node, WikiNode)
+            and non_l4_node.kind == NodeKind.TABLE
+            and "inflection-table" in non_l4_node.attrs.get("class")
+        ):
+            # XXX: de: Extract html form table
+            pass
+        elif (
+            isinstance(non_l4_node, WikiNode)
+            and non_l4_node.kind == NodeKind.LINK
+            and len(non_l4_node.largs) > 0
+            and len(non_l4_node.largs[0]) > 0
+            and "Kategorie" in non_l4_node.largs[0][0]
+        ):
+            # XXX Process categories
             pass
         else:
             wxr.wtp.debug(
@@ -231,12 +241,10 @@ def parse_page(
 
     # Parse the page, pre-expanding those templates that are likely to
     # influence parsing
-    DO_NOT_PRE_EXPAND_TEMPLATES.update(wxr.config.DE_FORM_TABLES)
     tree = wxr.wtp.parse(
         page_text,
         pre_expand=True,
         additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
-        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
     )
 
     page_data = []

diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py
@@ -225,8 +225,8 @@ def reprocess_wiktionary(
             last_time = estimate_progress(
                 processed_pages, all_page_nums, start_time, last_time
             )
-
-    emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
+    if wxr.config.extract_thesaurus_pages:
+        emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
     logging.info("Reprocessing wiktionary complete")
 
 

diff --git a/tests/test_de_example.py b/tests/test_de_example.py
@@ -5,7 +5,6 @@
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.example import extract_examples, extract_reference
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -19,9 +18,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_de_extract_examples(self):
         self.wxr.wtp.start_page("")

diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
@@ -25,9 +25,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_de_extract_glosses(self):
         self.wxr.wtp.start_page("")

diff --git a/tests/test_de_page.py b/tests/test_de_page.py
@@ -7,7 +7,6 @@
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.page import parse_page, parse_section
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -28,9 +27,6 @@ def setUp(self):
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_de_parse_page(self):
         self.wxr.wtp.add_page("Vorlage:Sprache", 10, "")

diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py
@@ -8,7 +8,6 @@
     process_hoerbeispiele,
     process_ipa,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -22,9 +21,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_de_process_ipa(self):
         test_cases = [

diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py
@@ -8,7 +8,6 @@
     extract_translation,
     process_translation_list,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -28,9 +27,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_de_extract_translation(self):
         test_cases = [