Skip to content

Commit

Permalink
Remove LANGUAGES_BY_NAME and LANGUAGES_BY_CODE
Browse files Browse the repository at this point in the history
Use the mediawiki-langcodes package to convert between language names and
language codes.
  • Loading branch information
xxyzz committed Nov 7, 2023
1 parent 46f5213 commit 52dccc1
Show file tree
Hide file tree
Showing 25 changed files with 86 additions and 208 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,6 @@ The following command-line options can be used to control its operation:
* --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout)
* --all-languages: extract words for all available languages
* --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted)
* --list-languages: prints a list of supported language names
* --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added)
* --all: causes all data to be captured for the selected languages
* --translations: causes translations to be captured
Expand Down
4 changes: 2 additions & 2 deletions json_schema/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
"lang_code": {
"description": "Wiktionary language code",
"type": ["string", "null"]
"type": "string"
},
"word": {
"description": "word string",
Expand Down Expand Up @@ -285,7 +285,7 @@
"properties": {
"lang_code": {
"description": "Wiktionary language code of the translation term",
"type": ["string", "null"]
"type": "string"
},
"lang_name": {
"description": "Translation language name",
Expand Down
76 changes: 0 additions & 76 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ class WiktionaryConfig:
"ZH_PRON_TAGS",
"FR_FORM_TABLES",
"DE_FORM_TABLES",
"LANGUAGES_BY_NAME",
"LANGUAGES_BY_CODE",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
Expand Down Expand Up @@ -113,7 +111,6 @@ def __init__(
self.redirects = {}
self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
self.init_subtitles()
self.init_languages()
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
if dump_file_lang_code == "zh":
self.set_attr_from_json(
Expand Down Expand Up @@ -161,79 +158,6 @@ def init_subtitles(self) -> None:
assert isinstance(v["tags"], (list, tuple))
self.set_attr_from_json("OTHER_SUBTITLES", "other_subtitles.json")

def init_languages(self) -> None:
    """Load the language-code table and build the inverse name -> code map.

    Reads ``languages.json`` into ``self.LANGUAGES_BY_CODE`` (presumably a
    mapping of language code to a list of names whose first entry is the
    canonical name — TODO confirm against languages.json) and derives
    ``self.LANGUAGES_BY_NAME``.  Canonical names take priority over
    aliases; when two codes claim the same name, the shorter code wins.
    """

    def canon_warn(name, use_code, not_use_code):
        # Unconditional warning: two language codes share one canonical name.
        print(
            f"WARNING: Non-unique language canonical name '{name}'."
            f" Mapping to '{use_code}' instead of '{not_use_code}'."
        )

    def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
        # Verbose-only note about an alias colliding with an existing entry.
        # ``kind`` is " canonical name" or "n alias" so the message reads
        # "...is already a canonical name..." / "...is already an alias...".
        if self.verbose:
            print(
                f"Language alias '{name}' for code '{new_code}'"
                f" is already a{kind} for {old_code}."
                f" Mapping to '{use_code}' instead of '{not_use_code}'."
            )

    self.set_attr_from_json("LANGUAGES_BY_CODE", "languages.json")

    self.LANGUAGES_BY_NAME = {}

    # Pass 1: add canonical names first so aliases cannot overwrite them.
    canonical_names = {}
    for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
        canonical_name = lang_names[0]
        if canonical_name in canonical_names:
            lang_code0 = canonical_names[canonical_name]
            if len(lang_code) < len(lang_code0):
                # Collision: the shorter code wins; replace and warn.
                canon_warn(canonical_name, lang_code, lang_code0)
                canonical_names[canonical_name] = lang_code
                self.LANGUAGES_BY_NAME[canonical_name] = lang_code
            else:
                # Keep the previously stored (shorter or equal) code.
                canon_warn(canonical_name, lang_code0, lang_code)
        else:
            canonical_names[canonical_name] = lang_code
            self.LANGUAGES_BY_NAME[canonical_name] = lang_code

    # Pass 2: add the remaining (non-canonical) names as aliases.
    for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
        for lang_name in lang_names[1:]:
            if lang_name in canonical_names:
                # Never let an alias shadow a canonical name.
                lang_code0 = canonical_names[lang_name]
                alias_info(
                    lang_name,
                    lang_code,
                    " canonical name",
                    lang_code0,
                    lang_code0,
                    lang_code,
                )
                continue
            if lang_name in self.LANGUAGES_BY_NAME:
                # Alias vs. alias collision: again the shorter code wins.
                lang_code0 = self.LANGUAGES_BY_NAME[lang_name]
                if len(lang_code) < len(lang_code0):
                    alias_info(
                        lang_name,
                        lang_code,
                        "n alias",
                        lang_code0,
                        lang_code,
                        lang_code0,
                    )
                    self.LANGUAGES_BY_NAME[lang_name] = lang_code
                else:
                    alias_info(
                        lang_name,
                        lang_code,
                        "n alias",
                        lang_code0,
                        lang_code0,
                        lang_code,
                    )
            else:
                self.LANGUAGES_BY_NAME[lang_name] = lang_code

def load_edition_settings(self):
file_path = self.data_folder / "config.json"
if file_path.exists():
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/de/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/de/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import split_senseids
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down
7 changes: 3 additions & 4 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.datautils import append_base_data
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -263,13 +263,12 @@ def parse_page(
# German name of the language of the section.
if subtitle_template.template_name == "Sprache":
lang_name = subtitle_template.template_parameters.get(1)
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
if not lang_code:
lang_code = name_to_code(lang_name, "de")
if lang_code == "":
wxr.wtp.warning(
f"Unknown language: {lang_name}",
sortid="extractor/de/page/parse_page/76",
)
continue
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import code_to_name
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down Expand Up @@ -94,7 +94,7 @@ def process_lautschrift_template(

lang_code = template_parameters.get("spr")
if lang_code:
language = wxr.wtp.LANGUAGES_BY_CODE[lang_code]
language = code_to_name(lang_code, "de")
add_sound_data_without_appending_to_existing_properties(
sound_data,
{
Expand Down
10 changes: 4 additions & 6 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import code_to_name
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -101,12 +101,10 @@ def process_translation_list(

lang_code = node.template_parameters.get(1)
translation_data["code"] = lang_code
languages = wxr.wtp.LANGUAGES_BY_CODE.get(lang_code)
if languages:
translation_data["lang"] = languages[0]
else:
translation_data["lang"] = code_to_name(lang_code, "de")
if translation_data["lang"] == "":
wxr.wtp.debug(
f"Unknown language code: {lang_code}",
f"Unknown language code: {translation_data['lang']}",
sortid="extractor/de/translation/process_translation_list/70",
)
if node.template_name[-1] == "?":
Expand Down
69 changes: 40 additions & 29 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,47 @@
#
# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org

import re
import sys
import copy
import html
import logging

import re
import sys
from collections import defaultdict
from functools import partial
from typing import Dict, List, Optional, Set, Union

from wikitextprocessor import WikiNode, NodeKind
from mediawiki_langcodes import get_all_names, name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.core import TemplateArgs
from wiktextract.wxr_context import WiktextractContext
from wiktextract.parts_of_speech import PARTS_OF_SPEECH
from wiktextract.linkages import parse_linkage_item_text
from wiktextract.translations import parse_translation_item_text
from wiktextract.clean import clean_template_args
from wiktextract.datautils import data_append, data_extend, ns_title_prefix_tuple
from wiktextract.tags import valid_tags
from wiktextract.page import (
clean_node, recursively_extract, LEVEL_KINDS, is_panel_template
from wiktextract.datautils import (
data_append,
data_extend,
ns_title_prefix_tuple,
)

from wiktextract.form_descriptions import (
decode_tags, parse_word_head, parse_sense_qualifier,
distw, parse_alt_or_inflection_of, classify_desc)
from wiktextract.inflection import parse_inflection_section, TableContext
classify_desc,
decode_tags,
distw,
parse_alt_or_inflection_of,
parse_sense_qualifier,
parse_word_head,
)
from wiktextract.inflection import TableContext, parse_inflection_section
from wiktextract.linkages import parse_linkage_item_text
from wiktextract.page import (
LEVEL_KINDS,
clean_node,
is_panel_template,
recursively_extract,
)
from wiktextract.parts_of_speech import PARTS_OF_SPEECH
from wiktextract.tags import valid_tags
from wiktextract.translations import parse_translation_item_text
from wiktextract.wxr_context import WiktextractContext

from ..ruby import extract_ruby, parse_ruby
from ..share import strip_nodes

from .unsupported_titles import unsupported_title_map

# Matches head tag
Expand Down Expand Up @@ -532,7 +542,7 @@ def init_head_tag_re(wxr):
r"^(head|Han char|arabic-noun|arabic-noun-form|"
r"hangul-symbol|syllable-hangul)$|" +
r"^(latin|" +
"|".join(wxr.wtp.LANGUAGES_BY_CODE) + r")-(" +
"|".join(lang_name for _, lang_name in get_all_names("en")) + r")-(" +
"|".join([
"abbr",
"adj",
Expand Down Expand Up @@ -3356,7 +3366,6 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
"""Fix subtitle hierarchy to be strict Language -> Etymology ->
Part-of-Speech -> Translation/Linkage."""

# Known language names are in languages_by_name
# Known lowercase PoS names are in part_of_speech_map
# Known lowercase linkage section names are in linkage_map

Expand All @@ -3381,7 +3390,7 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
.format(title, left, right),
sortid="page/2904")
lc = title.lower()
if title in wxr.config.LANGUAGES_BY_NAME:
if name_to_code(title, "en") != "":
if level > 2:
wxr.wtp.debug("subtitle has language name {} at level {}"
.format(title, level),
Expand Down Expand Up @@ -3491,16 +3500,18 @@ def multitrans_post_fn(name, ht, text):
# Some pages have links at top level, e.g., "trees" in Wiktionary
continue
if langnode.kind != NodeKind.LEVEL2:
wxr.wtp.debug("unexpected top-level node: {}".format(langnode),
sortid="page/3014")
continue
lang = clean_node(wxr, None,
langnode.sarg if langnode.sarg else langnode.largs)
if lang not in wxr.config.LANGUAGES_BY_NAME:
wxr.wtp.debug("unrecognized language name at top-level {!r}"
.format(lang), sortid="page/3019")
wxr.wtp.debug(
f"unexpected top-level node: {langnode}", sortid="page/3014"
)
continue
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang)
lang = clean_node(
wxr, None, langnode.sarg if langnode.sarg else langnode.largs
)
lang_code = name_to_code(lang, "en")
if lang_code == "":
wxr.wtp.debug(
f"unrecognized language name: {lang}", sortid="page/3019"
)
if (
wxr.config.capture_language_codes
and lang_code not in wxr.config.capture_language_codes
Expand Down
8 changes: 4 additions & 4 deletions src/wiktextract/extractor/en/thesaurus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import re
from typing import List, Optional

from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import NodeKind, Page, WikiNode

from wiktextract.datautils import ns_title_prefix_tuple
from wiktextract.form_descriptions import parse_sense_qualifier
from wiktextract.page import LEVEL_KINDS, clean_node
Expand Down Expand Up @@ -98,7 +98,7 @@ def extract_thesaurus_page(
# {{ws header|lang=xx}}
m = re.search(r"(?s)\{\{ws header\|[^}]*lang=([^}|]*)", text)
if m:
lang = wxr.config.LANGUAGES_BY_CODE.get(m.group(1), [None])[0]
lang = code_to_name(m.group(1), "en")

def recurse(contents) -> Optional[List[ThesaurusTerm]]:
nonlocal lang
Expand Down Expand Up @@ -197,7 +197,7 @@ def qual_fn(m):
w1 = w1.removesuffix(" [⇒ thesaurus]")

if w1:
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang)
lang_code = name_to_code(lang, "en")
if lang_code is None:
logging.debug(
f"Linkage language {lang} not recognized"
Expand Down Expand Up @@ -230,7 +230,7 @@ def qual_fn(m):
subtitle = wxr.wtp.node_to_text(
contents.sarg if contents.sarg else contents.largs
)
if subtitle in wxr.config.LANGUAGES_BY_NAME:
if name_to_code(subtitle, "en") != "":
lang = subtitle
pos = None
sense = None
Expand Down
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wiktextract.datautils import append_base_data
from wiktextract.page import LEVEL_KINDS, clean_node
Expand Down Expand Up @@ -212,12 +213,12 @@ def parse_page(
for level2_node in tree.find_child(NodeKind.LEVEL2):
categories_and_links = defaultdict(list)
lang_name = clean_node(wxr, categories_and_links, level2_node.largs)
if lang_name not in wxr.config.LANGUAGES_BY_NAME:
if name_to_code(lang_name, "zh") == "":
wxr.wtp.warning(
f"Unrecognized language name: {lang_name}",
sortid="extractor/zh/page/parse_page/509",
)
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
lang_code = name_to_code(lang_name, "zh")
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
Expand Down
Loading

0 comments on commit 52dccc1

Please sign in to comment.