From 52dccc1748d033ed836ccf78c5ba7886568d12c6 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 7 Nov 2023 10:34:57 +0800 Subject: [PATCH] Remove `LANGUAGES_BY_NAME` and `LANGUAGES_BY_CODE` use the mediawiki-langcodes package to convert language names and codes --- README.md | 1 - json_schema/zh.json | 4 +- src/wiktextract/config.py | 76 ------------------- src/wiktextract/extractor/de/example.py | 1 - src/wiktextract/extractor/de/gloss.py | 1 - src/wiktextract/extractor/de/linkage.py | 1 - src/wiktextract/extractor/de/page.py | 7 +- src/wiktextract/extractor/de/pronunciation.py | 4 +- src/wiktextract/extractor/de/translation.py | 10 +-- src/wiktextract/extractor/en/page.py | 69 ++++++++++------- src/wiktextract/extractor/en/thesaurus.py | 8 +- src/wiktextract/extractor/zh/page.py | 5 +- src/wiktextract/extractor/zh/thesaurus.py | 3 +- src/wiktextract/extractor/zh/translation.py | 3 +- src/wiktextract/lang_specific_configs.py | 2 +- src/wiktextract/page.py | 6 +- src/wiktextract/thesaurus.py | 3 +- src/wiktextract/translations.py | 20 ++--- src/wiktextract/wiktionary.py | 2 +- src/wiktextract/wiktwords.py | 43 ----------- tests/test_de_pronunciation.py | 4 - tests/test_de_translation.py | 6 -- tests/test_long.py | 9 +-- tests/test_translations.py | 4 +- tests/test_zh_translation.py | 2 +- 25 files changed, 86 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index ab978870..bb600b06 100644 --- a/README.md +++ b/README.md @@ -409,7 +409,6 @@ The following command-line options can be used to control its operation: * --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout) * --all-languages: extract words for all available languages * --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted) -* --list-languages: prints a list of supported language names * --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added) * --all: causes all data to be captured for the selected languages * --translations: causes translations to be captured diff --git a/json_schema/zh.json b/json_schema/zh.json index 1e4af6ae..9de1c86c 100644 --- a/json_schema/zh.json +++ b/json_schema/zh.json @@ -11,7 +11,7 @@ }, "lang_code": { "description": "Wiktionary language code", - "type": ["string", "null"] + "type": "string" }, "word": { "description": "word string", @@ -285,7 +285,7 @@ "properties": { "lang_code": { "description": "Wiktionary language code of the translation term", - "type": ["string", "null"] + "type": "string" }, "lang_name": { "description": "Translation language name", diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 9a24bf5b..41829c13 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -50,8 +50,6 @@ class WiktionaryConfig: "ZH_PRON_TAGS", "FR_FORM_TABLES", "DE_FORM_TABLES", - "LANGUAGES_BY_NAME", - "LANGUAGES_BY_CODE", "FORM_OF_TEMPLATES", "analyze_templates", "extract_thesaurus_pages", @@ -113,7 +111,6 @@ def __init__( self.redirects = {} self.data_folder = files("wiktextract") / "data" / dump_file_lang_code self.init_subtitles() - self.init_languages() self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json") if dump_file_lang_code == "zh": self.set_attr_from_json( @@ -161,79 +158,6 @@ def init_subtitles(self) -> None: assert isinstance(v["tags"], (list, tuple)) 
self.set_attr_from_json("OTHER_SUBTITLES", "other_subtitles.json") - def init_languages(self): - def canon_warn(name, use_code, not_use_code): - print( - f"WARNING: Non-unique language canonical name '{name}'." - f" Mapping to '{use_code}' instead of '{not_use_code}'." - ) - - def alias_info(name, new_code, kind, old_code, use_code, not_use_code): - if self.verbose: - print( - f"Language alias '{name}' for code '{new_code}'" - f" is already a{kind} for {old_code}." - f" Mapping to '{use_code}' instead of '{not_use_code}'." - ) - - self.set_attr_from_json("LANGUAGES_BY_CODE", "languages.json") - - self.LANGUAGES_BY_NAME = {} - - # add canonical names first to avoid overwriting them - canonical_names = {} - for lang_code, lang_names in self.LANGUAGES_BY_CODE.items(): - canonical_name = lang_names[0] - if canonical_name in canonical_names: - lang_code0 = canonical_names[canonical_name] - if len(lang_code) < len(lang_code0): - canon_warn(canonical_name, lang_code, lang_code0) - canonical_names[canonical_name] = lang_code - self.LANGUAGES_BY_NAME[canonical_name] = lang_code - else: - canon_warn(canonical_name, lang_code0, lang_code) - else: - canonical_names[canonical_name] = lang_code - self.LANGUAGES_BY_NAME[canonical_name] = lang_code - - # add other names - for lang_code, lang_names in self.LANGUAGES_BY_CODE.items(): - for lang_name in lang_names[1:]: - if lang_name in canonical_names: - lang_code0 = canonical_names[lang_name] - alias_info( - lang_name, - lang_code, - " canonical name", - lang_code0, - lang_code0, - lang_code, - ) - continue - if lang_name in self.LANGUAGES_BY_NAME: - lang_code0 = self.LANGUAGES_BY_NAME[lang_name] - if len(lang_code) < len(lang_code0): - alias_info( - lang_name, - lang_code, - "n alias", - lang_code0, - lang_code, - lang_code0, - ) - self.LANGUAGES_BY_NAME[lang_name] = lang_code - else: - alias_info( - lang_name, - lang_code, - "n alias", - lang_code0, - lang_code0, - lang_code, - ) - else: - self.LANGUAGES_BY_NAME[lang_name] = lang_code - def load_edition_settings(self): file_path = self.data_folder / "config.json" if file_path.exists(): diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py index da3268d3..9c7e247f 100644 --- a/src/wiktextract/extractor/de/example.py +++ b/src/wiktextract/extractor/de/example.py @@ -3,7 +3,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index 3caa7252..ad9183f3 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -4,7 +4,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/linkage.py b/src/wiktextract/extractor/de/linkage.py index eb4425e7..10de977e 100644 --- a/src/wiktextract/extractor/de/linkage.py +++ b/src/wiktextract/extractor/de/linkage.py @@ -3,7 +3,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.utils import split_senseids from wiktextract.page import clean_node from 
wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index d4353e48..8b276de2 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -3,9 +3,9 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.datautils import append_base_data from wiktextract.wxr_context import WiktextractContext @@ -263,13 +263,12 @@ def parse_page( # German name of the language of the section. if subtitle_template.template_name == "Sprache": lang_name = subtitle_template.template_parameters.get(1) - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name) - if not lang_code: + lang_code = name_to_code(lang_name, "de") + if lang_code == "": wxr.wtp.warning( f"Unknown language: {lang_name}", sortid="extractor/de/page/parse_page/76", ) - continue if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index e55adb13..2fb63e6d 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -1,9 +1,9 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -94,7 +94,7 @@ def process_lautschrift_template( lang_code = template_parameters.get("spr") if lang_code: - language = wxr.wtp.LANGUAGES_BY_CODE[lang_code] + language = code_to_name(lang_code, "de") add_sound_data_without_appending_to_existing_properties( sound_data, { diff --git a/src/wiktextract/extractor/de/translation.py b/src/wiktextract/extractor/de/translation.py index e39cb079..77e598e1 100644 --- a/src/wiktextract/extractor/de/translation.py +++ b/src/wiktextract/extractor/de/translation.py @@ -2,9 +2,9 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode - from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -101,12 +101,10 @@ def process_translation_list( lang_code = node.template_parameters.get(1) translation_data["code"] = lang_code - languages = wxr.wtp.LANGUAGES_BY_CODE.get(lang_code) - if languages: - translation_data["lang"] = languages[0] - else: + translation_data["lang"] = code_to_name(lang_code, "de") + if translation_data["lang"] == "": wxr.wtp.debug( f"Unknown language code: {lang_code}", sortid="extractor/de/translation/process_translation_list/70", ) if node.template_name[-1] == "?": diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index d2327f89..8c3d63c1 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -2,37 +2,47 @@ # # Copyright (c) 2018-2022 Tatu Ylonen.
See file LICENSE and https://ylonen.org -import re -import sys import copy import html import logging - +import re +import sys from collections import defaultdict from functools import partial from typing import Dict, List, Optional, Set, Union -from wikitextprocessor import WikiNode, NodeKind +from mediawiki_langcodes import get_all_names, name_to_code +from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.core import TemplateArgs -from wiktextract.wxr_context import WiktextractContext -from wiktextract.parts_of_speech import PARTS_OF_SPEECH -from wiktextract.linkages import parse_linkage_item_text -from wiktextract.translations import parse_translation_item_text from wiktextract.clean import clean_template_args -from wiktextract.datautils import data_append, data_extend, ns_title_prefix_tuple -from wiktextract.tags import valid_tags -from wiktextract.page import ( - clean_node, recursively_extract, LEVEL_KINDS, is_panel_template +from wiktextract.datautils import ( + data_append, + data_extend, + ns_title_prefix_tuple, ) - from wiktextract.form_descriptions import ( - decode_tags, parse_word_head, parse_sense_qualifier, - distw, parse_alt_or_inflection_of, classify_desc) -from wiktextract.inflection import parse_inflection_section, TableContext + classify_desc, + decode_tags, + distw, + parse_alt_or_inflection_of, + parse_sense_qualifier, + parse_word_head, +) +from wiktextract.inflection import TableContext, parse_inflection_section +from wiktextract.linkages import parse_linkage_item_text +from wiktextract.page import ( + LEVEL_KINDS, + clean_node, + is_panel_template, + recursively_extract, +) +from wiktextract.parts_of_speech import PARTS_OF_SPEECH +from wiktextract.tags import valid_tags +from wiktextract.translations import parse_translation_item_text +from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby, parse_ruby from ..share import strip_nodes - from .unsupported_titles import unsupported_title_map # Matches head tag @@ -532,7 +542,7 @@ def init_head_tag_re(wxr): r"^(head|Han char|arabic-noun|arabic-noun-form|" r"hangul-symbol|syllable-hangul)$|" + r"^(latin|" + - "|".join(wxr.wtp.LANGUAGES_BY_CODE) + r")-(" + + "|".join(lang_name for _, lang_name in get_all_names("en")) + r")-(" + "|".join([ "abbr", "adj", @@ -3356,7 +3366,6 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: """Fix subtitle hierarchy to be strict Language -> Etymology -> Part-of-Speech -> Translation/Linkage.""" - # Known language names are in languages_by_name # Known lowercase PoS names are in part_of_speech_map # Known lowercase linkage section names are in linkage_map @@ -3381,7 +3390,7 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: .format(title, left, right), sortid="page/2904") lc = title.lower() - if title in wxr.config.LANGUAGES_BY_NAME: + if name_to_code(title, "en") != "": if level > 2: wxr.wtp.debug("subtitle has language name {} at level {}" .format(title, level), @@ -3491,16 +3500,18 @@ def multitrans_post_fn(name, ht, text): # Some pages have links at top level, e.g., "trees" in Wiktionary continue if langnode.kind != NodeKind.LEVEL2: - wxr.wtp.debug("unexpected top-level node: {}".format(langnode), - sortid="page/3014") - continue - lang = clean_node(wxr, None, - langnode.sarg if langnode.sarg else langnode.largs) - if lang not in wxr.config.LANGUAGES_BY_NAME: - wxr.wtp.debug("unrecognized language name at top-level {!r}" - .format(lang), sortid="page/3019") + wxr.wtp.debug( + f"unexpected 
top-level node: {langnode}", sortid="page/3014" ) continue lang = clean_node( wxr, None, langnode.sarg if langnode.sarg else langnode.largs ) lang_code = name_to_code(lang, "en") if lang_code == "": wxr.wtp.debug( f"unrecognized language name: {lang}", sortid="page/3019" ) if ( wxr.config.capture_language_codes and lang_code not in wxr.config.capture_language_codes diff --git a/src/wiktextract/extractor/en/thesaurus.py b/src/wiktextract/extractor/en/thesaurus.py index db9d222b..bb3b28dc 100644 --- a/src/wiktextract/extractor/en/thesaurus.py +++ b/src/wiktextract/extractor/en/thesaurus.py @@ -7,8 +7,8 @@ import re from typing import List, Optional +from mediawiki_langcodes import code_to_name, name_to_code from wikitextprocessor import NodeKind, Page, WikiNode - from wiktextract.datautils import ns_title_prefix_tuple from wiktextract.form_descriptions import parse_sense_qualifier from wiktextract.page import LEVEL_KINDS, clean_node @@ -98,7 +98,7 @@ def extract_thesaurus_page( # {{ws header|lang=xx}} m = re.search(r"(?s)\{\{ws header\|[^}]*lang=([^}|]*)", text) if m: - lang = wxr.config.LANGUAGES_BY_CODE.get(m.group(1), [None])[0] + lang = code_to_name(m.group(1), "en") def recurse(contents) -> Optional[List[ThesaurusTerm]]: nonlocal lang @@ -197,7 +197,7 @@ def qual_fn(m): w1 = w1.removesuffix(" [⇒ thesaurus]") if w1: - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang) - if lang_code is None: + lang_code = name_to_code(lang, "en") + if lang_code == "": logging.debug( f"Linkage language {lang} not recognized" @@ -230,7 +230,7 @@ def qual_fn(m): subtitle = wxr.wtp.node_to_text( contents.sarg if contents.sarg else contents.largs ) - if subtitle in wxr.config.LANGUAGES_BY_NAME: + if name_to_code(subtitle, "en") != "": lang = subtitle pos = None sense = None diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py index f7c37d9b..95d84b7b 100644 --- a/src/wiktextract/extractor/zh/page.py +++ b/src/wiktextract/extractor/zh/page.py @@ -4,6 +4,7 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wiktextract.datautils import append_base_data from wiktextract.page import LEVEL_KINDS, clean_node @@ -212,12 +213,12 @@ def parse_page( for level2_node in tree.find_child(NodeKind.LEVEL2): categories_and_links = defaultdict(list) lang_name = clean_node(wxr, categories_and_links, level2_node.largs) - if lang_name not in wxr.config.LANGUAGES_BY_NAME: + if name_to_code(lang_name, "zh") == "": wxr.wtp.warning( f"Unrecognized language name: {lang_name}", sortid="extractor/zh/page/parse_page/509", ) - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name) + lang_code = name_to_code(lang_name, "zh") if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes diff --git a/src/wiktextract/extractor/zh/thesaurus.py b/src/wiktextract/extractor/zh/thesaurus.py index 230364e9..8692f3c2 100644 --- a/src/wiktextract/extractor/zh/thesaurus.py +++ b/src/wiktextract/extractor/zh/thesaurus.py @@ -2,6 +2,7 @@ import re from typing import List, Optional, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, Page, WikiNode from ...page import clean_node @@ -161,7 +162,7 @@ def recursive_parse( if node.kind == NodeKind.LEVEL2: lang_name = clean_node(wxr, None, node.largs) - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name) - if lang_code is None: + lang_code = name_to_code(lang_name, "zh") + if lang_code == "": logging.warning( f"Unrecognized language: {lang_name} in page Thesaurus:{entry}" diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 25959644..4b461dab 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -2,6 +2,7 @@ from collections import defaultdict from typing import Dict, List, Optional, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wiktextract.datautils import find_similar_gloss from wiktextract.page import LEVEL_KINDS, clean_node @@ -100,7 +101,7 @@ def process_translation_list_item( words_text = words_text.strip() if len(words_text) == 0: return - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_text) + lang_code = name_to_code(lang_text, "zh") # split words by `,` or `;` that are not inside `()` for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text): diff --git a/src/wiktextract/lang_specific_configs.py b/src/wiktextract/lang_specific_configs.py index 2f091d0e..2b2c51bb 100644 --- a/src/wiktextract/lang_specific_configs.py +++ b/src/wiktextract/lang_specific_configs.py @@ -3,7 +3,7 @@ # parsing. import re -# from wiktextract.datautils import languages_by_name + from wiktextract.tags import valid_tags, tag_categories from wiktextract.parts_of_speech import PARTS_OF_SPEECH diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index d7d43c4d..648666da 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -7,8 +7,8 @@ from copy import copy from typing import Callable, Dict, List, Optional, Tuple, Union +from mediawiki_langcodes import get_all_names, name_to_code from wikitextprocessor import NodeKind, WikiNode - from wiktextract.wxr_context import WiktextractContext from .clean import clean_value @@ -248,7 +248,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: r"^(" + wxr.wtp.NAMESPACE_DATA.get("Rhymes", {}).get("name", "") + ":)?(" - + "|".join(re.escape(x) for x in wxr.config.LANGUAGES_BY_NAME) + + "|".join(re.escape(x) for _, x in get_all_names("en")) + ")[ /]?"
) # Remove category links that start with a language name from entries for @@ -261,7 +261,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: m = re.match(starts_lang_re, cat) if m: catlang = m.group(2) - catlang_code = wxr.config.LANGUAGES_BY_NAME.get(catlang) + catlang_code = name_to_code(catlang, "en") if catlang_code != lang_code and not ( catlang_code == "en" and data.get("lang_code") == "mul" ): diff --git a/src/wiktextract/thesaurus.py b/src/wiktextract/thesaurus.py index 919d40dd..8df54130 100644 --- a/src/wiktextract/thesaurus.py +++ b/src/wiktextract/thesaurus.py @@ -14,6 +14,7 @@ from pathlib import Path from typing import List, Optional, Set, TextIO, Tuple +from mediawiki_langcodes import code_to_name from wikitextprocessor import Page from .import_utils import import_extractor_module @@ -281,7 +282,7 @@ def emit_words_in_thesaurus( entry = { "word": entry, - "lang": wxr.config.LANGUAGES_BY_CODE.get(lang_code)[0], + "lang": code_to_name(lang_code, "en"), "lang_code": lang_code, "pos": pos, "senses": [sense_dict] if sense_dict else [], diff --git a/src/wiktextract/translations.py b/src/wiktextract/translations.py index ef858e68..287d41c6 100644 --- a/src/wiktextract/translations.py +++ b/src/wiktextract/translations.py @@ -4,6 +4,8 @@ import re import copy + +from mediawiki_langcodes import code_to_name, name_to_code from wiktextract.wxr_context import WiktextractContext from wikitextprocessor import MAGIC_FIRST, MAGIC_LAST @@ -329,12 +331,12 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, # name from the higher level, and some append a language variant # name to a broader language name) extra_langcodes = set() - if lang and lang in wxr.config.LANGUAGES_BY_NAME: - extra_langcodes.add(wxr.config.LANGUAGES_BY_NAME[lang]) + if lang and name_to_code(lang, "en") != "": + lang_code = name_to_code(lang, "en") + extra_langcodes.add(lang_code) # Canonicalize language name (we could have gotten it via # alias or other_names) - lang = wxr.config.LANGUAGES_BY_CODE[wxr.config.LANGUAGES_BY_NAME[lang]][0] - assert lang + lang = code_to_name(lang_code, "en") m = re.match(r"\*?\s*([-' \w][-'&, \w()]*)[::]\s*", item) tags = [] if m: @@ -359,7 +361,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, return None lang = sublang elif (lang_sublang and - any((captured_lang := lang_comb) in wxr.config.LANGUAGES_BY_NAME + any(name_to_code(captured_lang := lang_comb, "en") != "" # Python 3.8: catch the value of lang_comb with := for lang_comb in language_name_variations) ): @@ -377,7 +379,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, # separate language codes, so additional langcode # removal tricks may need to be played below. tags.extend(tr_second_tagmap[sublang].split()) - elif sublang in wxr.config.LANGUAGES_BY_NAME: + elif name_to_code(sublang, "en") != "": lang = sublang elif sublang[0].isupper() and classify_desc(sublang) == "tags": # Interpret it as a tag @@ -393,7 +395,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, elif lang is None: # No mathing language prefix. Try if it is missing colon.
parts = item.split() - if len(parts) > 1 and parts[0] in wxr.config.LANGUAGES_BY_NAME: + if len(parts) > 1 and name_to_code(parts[0], "en") != "": lang = parts[0] item = " ".join(parts[1:]) else: @@ -407,8 +409,8 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, # If we didn't get language code from the template, look it up # based on language name - if langcode is None and lang in wxr.config.LANGUAGES_BY_NAME: - langcode = wxr.config.LANGUAGES_BY_NAME[lang] + if langcode is None and name_to_code(lang, "en") != "": + langcode = name_to_code(lang, "en") # Remove () parts from the item. They seem to be # generated by {{t+|...}}. diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py index 3a41b3c8..279f75f5 100644 --- a/src/wiktextract/wiktionary.py +++ b/src/wiktextract/wiktionary.py @@ -225,7 +225,7 @@ def reprocess_wiktionary( last_time = estimate_progress( processed_pages, all_page_nums, start_time, last_time ) - if wxr.config.extract_thesaurus_pages: + if wxr.config.dump_file_lang_code == "en": emit_words_in_thesaurus(wxr, emitted, out_f, human_readable) logging.info("Reprocessing wiktionary complete") diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 527556f2..be24b828 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -138,12 +138,6 @@ def main(): default=False, help="Extract words for all languages", ) - parser.add_argument( - "--list-languages", - action="store_true", - default=False, - help="Print list of supported languages", - ) parser.add_argument( "--pages-dir", type=str, @@ -379,43 +373,6 @@ def main(): expand_tables=args.inflection_tables_file, ) - if args.language: - new_lang_codes = [] - for x in args.language: - if x not in conf1.LANGUAGES_BY_CODE: - if x in conf1.LANGUAGES_BY_NAME: - new_lang_codes.append(conf1.LANGUAGES_BY_NAME[x]) - else: - logging.error(f"Invalid language: {x}") - sys.exit(1) - else: - new_lang_codes.append(x) - conf1.capture_language_codes = new_lang_codes - - if args.language: - lang_names = [] - for x in args.language: - if x in conf1.LANGUAGES_BY_CODE: - lang_names.extend(conf1.LANGUAGES_BY_CODE[x]) - else: - lang_names.extend( - conf1.LANGUAGES_BY_CODE[conf1.LANGUAGES_BY_NAME[x]] - ) - - lang_names = [re.escape(x) for x in lang_names] - lang_names_re = r"==\s*(" - lang_names_re += "|".join(lang_names) - lang_names_re += r")" - lang_names_re = re.compile(lang_names_re) - - # If --list-languages has been specified, just print the list of supported - # languages - if args.list_languages: - print("Supported languages:") - for lang_name, lang_code in conf1.LANGUAGES_BY_NAME.items(): - print(f" {lang_name}: {lang_code}") - sys.exit(0) - if not args.path and not args.db_path: print( "The PATH argument for wiktionary dump file is normally mandatory." 
diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index 6786ab00..cc9419a8 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -60,8 +60,6 @@ def test_de_process_ipa(self): self.wxr.wtp.add_page("Vorlage:IPA", 10, "") self.wxr.wtp.add_page("Vorlage:Lautschrift", 10, "(Deutsch)") - self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch" - root = self.wxr.wtp.parse(case["input"]) sound_data = [defaultdict(list)] @@ -138,8 +136,6 @@ def test_de_process_hoerbeispiele(self): self.wxr.wtp.add_page("Vorlage:IPA", 10, "") self.wxr.wtp.add_page("Vorlage:Audio", 10, "") - self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch" - root = self.wxr.wtp.parse(case["input"]) sound_data = [defaultdict(list)] diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py index 2db61cd8..7ddf5418 100644 --- a/tests/test_de_translation.py +++ b/tests/test_de_translation.py @@ -18,12 +18,6 @@ def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") ) - self.wxr.wtp.LANGUAGES_BY_CODE["en"] = ["Englisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["hy"] = ["Armenisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["ru"] = ["Russisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["fr"] = ["Französisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["ar"] = ["Arabisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["la"] = ["Latein"] def tearDown(self) -> None: self.wxr.wtp.close_db_conn() diff --git a/tests/test_long.py b/tests/test_long.py index 83d356f4..4942b3cf 100644 --- a/tests/test_long.py +++ b/tests/test_long.py @@ -99,10 +99,7 @@ def test_long(self): sum(words.values()), sum(poses.values()) + num_redirects ) self.assertGreater(num_transl, 0) - thesaurus_data = [ - data - for data in search_thesaurus( - self.wxr.thesaurus_db_conn, "hieno", "fi", "adj" - ) - ] + thesaurus_data = list(search_thesaurus( + self.wxr.thesaurus_db_conn, "hieno", "fi", "adj" + )) self.assertEqual(len(thesaurus_data), 17) diff --git a/tests/test_translations.py b/tests/test_translations.py index 14c84946..0973e196 100644 --- a/tests/test_translations.py +++ b/tests/test_translations.py @@ -132,8 +132,8 @@ def test_tr7(self): def test_tr8(self): data = self.runtr("Mandarin: 是 (rrr)", lang="Chinese") self.assertEqual(data, {"translations": [ - {"word": "是", "roman": "rrr", "lang": "Mandarin Chinese", - "code": "cmn", + {"word": "是", "roman": "rrr", "lang": "Chinese Mandarin", + "code": "zh", }]}) def test_tr9(self): diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index a3ba267b..d7d63771 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -52,7 +52,7 @@ def test_normal(self, mock_get_page) -> None: "word": "këtu", }, { - "lang_code": None, + "lang_code": "", "lang_name": "西阿帕切語", "sense": "靠近說話者的地方", "word": "kú",
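
For reference, the mediawiki-langcodes behavior this patch relies on throughout: name_to_code(name, wiki_lang_code) resolves a language name against the given Wiktionary edition and returns an empty string (not None) for unknown names; code_to_name(code, wiki_lang_code) is the reverse lookup, likewise returning "" when the code is unknown; get_all_names(wiki_lang_code) yields (code, name) pairs for that edition. A minimal usage sketch under those assumptions; the example inputs and expected outputs are illustrative, inferred from the checks in this patch rather than verified against the package:

    from mediawiki_langcodes import code_to_name, get_all_names, name_to_code

    # Name -> code, resolved against one Wiktionary edition ("en", "de", "zh", ...).
    # Unknown names yield "" rather than None, hence the `== ""` checks above.
    print(name_to_code("Deutsch", "de"))         # expected: "de"
    print(name_to_code("not a language", "en"))  # expected: ""

    # Code -> localized language name; likewise "" when the code is unknown.
    print(code_to_name("de", "de"))              # expected: "Deutsch"

    # (code, name) pairs for an edition, as used above to build the head-tag
    # regex in extractor/en/page.py and the category regex in page.py.
    for code, name in list(get_all_names("en"))[:3]:
        print(code, name)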