Merge pull request #391 from empiriker/es

Add languages.json for Spanish Wiktionary
tatuylonen · Oct 31, 2023 · 595ff1d · 595ff1d
2 parents fdb3c43 + a64e729
commit 595ff1d
Show file tree

Hide file tree

Showing 5 changed files with 3,715 additions and 16 deletions.
diff --git a/languages/get_de_data.py b/languages/get_data_de.py
@@ -5,28 +5,21 @@
 # python language_data.py de dewiktionary_dump_file [--languages languages_output_file]
 
 import argparse
-from wikitextprocessor import Wtp
-from wiktextract.config import WiktionaryConfig
-from wiktextract.wxr_context import WiktextractContext
-from wiktextract.page import clean_node
-from wikitextprocessor.dumpparser import process_dump
-from wikitextprocessor import NodeKind, WikiNode
-
 import json
 
+from wikitextprocessor import NodeKind, WikiNode, Wtp
+from wikitextprocessor.dumpparser import process_dump
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Export Wiktionary language data to JSON"
     )
     parser.add_argument("lang_code", type=str, help="Dump file language code")
     parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
-    parser.add_argument(
-        "--languages",
-        type=str,
-        default="languages.json",
-        help="Language data output file path",
-    )
     args = parser.parse_args()
     wxr = WiktextractContext(Wtp(lang_code=args.lang_code), WiktionaryConfig())
 
@@ -40,7 +33,7 @@
     template_ns_id = wxr.wtp.NAMESPACE_DATA["Template"]["id"]
     process_dump(wxr.wtp, args.dump, {help_ns_id, template_ns_id})
 
-    # The page 'Hilfe:Sprachkürzel seems to be the only central collection of 
+    # The page 'Hilfe:Sprachkürzel' seems to be the only central collection of
     # language codes and their German expansions. We will use this until we find
     #  perhaps a more authoritative source.
     sprachkuerzel = wxr.wtp.get_page("Hilfe:Sprachkürzel")
@@ -68,5 +61,9 @@
 
             languages[lang_code] = [clean_node(wxr, None, third_row_content)]
 
-    with open(args.languages, "w", encoding="utf-8") as fout:
+    with open(
+        f"src/wiktextract/data/{args.lang_code}/languages.json",
+        "w",
+        encoding="utf-8",
+    ) as fout:
         json.dump(languages, fout, indent=2, ensure_ascii=False, sort_keys=True)
diff --git a/languages/get_data_es.py b/languages/get_data_es.py
@@ -0,0 +1,60 @@
+# Export Spanish Wiktionary language data to JSON.
+#
+# Usage:
+#
+# python get_data_es.py es eswiktionary_dump_file
+
+import argparse
+import json
+
+from wikitextprocessor import NodeKind, WikiNode, Wtp
+from wikitextprocessor.dumpparser import process_dump
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Export Wiktionary language data to JSON"
+    )
+    parser.add_argument("lang_code", type=str, help="Dump file language code")
+    parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
+    args = parser.parse_args()
+    wxr = WiktextractContext(Wtp(lang_code=args.lang_code), WiktionaryConfig())
+
+    wxr = WiktextractContext(
+        Wtp(
+            lang_code=args.lang_code, db_path="wikt-db_es_language_data_temp.db"
+        ),
+        WiktionaryConfig(),
+    )
+    appendix_ns_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
+    process_dump(wxr.wtp, args.dump, {appendix_ns_id})
+
+    # https://es.wiktionary.org/wiki/Ap%C3%A9ndice:C%C3%B3digos_de_idioma
+    codigos_de_idioma = wxr.wtp.get_page("Apéndice:Códigos de idioma")
+
+    wxr.config.word = codigos_de_idioma.title
+    wxr.wtp.start_page(codigos_de_idioma.title)
+    tree = wxr.wtp.parse(
+        codigos_de_idioma.body,
+        pre_expand=True,
+    )
+    languages = {}
+    for table in tree.find_child_recursively(NodeKind.TABLE):
+        for table_row in table.find_child(NodeKind.TABLE_ROW):
+            lang_code_language = []
+            for table_cell in table_row.find_child(NodeKind.TABLE_CELL):
+                lang_code_language.append(table_cell.children[0])
+
+            if lang_code_language:
+                languages[clean_node(wxr, None, lang_code_language[0])] = [
+                    clean_node(wxr, None, lang_code_language[1])
+                ]
+    with open(
+        f"src/wiktextract/data/{args.lang_code}/languages.json",
+        "w",
+        encoding="utf-8",
+    ) as fout:
+        json.dump(languages, fout, indent=2, ensure_ascii=False, sort_keys=True)
diff --git a/src/wiktextract/data/es/config.json b/src/wiktextract/data/es/config.json
@@ -0,0 +1,4 @@
+{
+  "analyze_templates": false,
+  "extract_thesaurus_pages": false
+}