Merge pull request #393 from xxyzz/icu

Remove `languages_by_code` `Wtp` class argument
tatuylonen · Nov 7, 2023 · a3665b8 · a3665b8
2 parents 69357b4 + 6edbb3f
commit a3665b8
Show file tree

Hide file tree

Showing 32 changed files with 122 additions and 89,071 deletions.
diff --git a/README.md b/README.md
@@ -408,8 +408,8 @@ The following command-line options can be used to control its operation:
 
 * --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout)
 * --all-languages: extract words for all available languages
-* --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted)
-* --list-languages: prints a list of supported language names
+* --language-code LANGUAGE_CODE: extracts the language with the given code (this option may be specified multiple times; defaults to the dump file's language code and `mul` (Translingual))
+* --language-name LANGUAGE_NAME: similar to `--language-code`, except that this option accepts a language name instead of a language code
 * --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added)
 * --all: causes all data to be captured for the selected languages
 * --translations: causes translations to be captured

diff --git a/json_schema/zh.json b/json_schema/zh.json
@@ -11,7 +11,7 @@
     },
     "lang_code": {
       "description": "Wiktionary language code",
-      "type": ["string", "null"]
+      "type": "string"
     },
     "word": {
       "description": "word string",
@@ -285,7 +285,7 @@
       "properties": {
         "lang_code": {
           "description": "Wiktionary language code of the translation term",
-          "type": ["string", "null"]
+          "type": "string"
         },
         "lang_name": {
           "description": "Translation language name",

diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
@@ -50,8 +50,6 @@ class WiktionaryConfig:
         "ZH_PRON_TAGS",
         "FR_FORM_TABLES",
         "DE_FORM_TABLES",
-        "LANGUAGES_BY_NAME",
-        "LANGUAGES_BY_CODE",
         "FORM_OF_TEMPLATES",
         "analyze_templates",
         "extract_thesaurus_pages",
@@ -60,7 +58,7 @@ class WiktionaryConfig:
     def __init__(
         self,
         dump_file_lang_code="en",
-        capture_language_codes=["en", "mul"],
+        capture_language_codes={"en", "mul"},
         capture_translations=True,
         capture_pronunciation=True,
         capture_linkages=True,
@@ -113,7 +111,6 @@ def __init__(
         self.redirects = {}
         self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
         self.init_subtitles()
-        self.init_languages()
         self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
         if dump_file_lang_code == "zh":
             self.set_attr_from_json(
@@ -161,79 +158,6 @@ def init_subtitles(self) -> None:
                 assert isinstance(v["tags"], (list, tuple))
         self.set_attr_from_json("OTHER_SUBTITLES", "other_subtitles.json")
 
-    def init_languages(self):
-        def canon_warn(name, use_code, not_use_code):
-            print(
-                f"WARNING: Non-unique language canonical name '{name}'."
-                f" Mapping to '{use_code}' instead of '{not_use_code}'."
-            )
-
-        def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
-            if self.verbose:
-                print(
-                    f"Language alias '{name}' for code '{new_code}'"
-                    f" is already a{kind} for {old_code}."
-                    f" Mapping to '{use_code}' instead of '{not_use_code}'."
-                )
-
-        self.set_attr_from_json("LANGUAGES_BY_CODE", "languages.json")
-
-        self.LANGUAGES_BY_NAME = {}
-
-        # add canonical names first to avoid overwriting them
-        canonical_names = {}
-        for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
-            canonical_name = lang_names[0]
-            if canonical_name in canonical_names:
-                lang_code0 = canonical_names[canonical_name]
-                if len(lang_code) < len(lang_code0):
-                    canon_warn(canonical_name, lang_code, lang_code0)
-                    canonical_names[canonical_name] = lang_code
-                    self.LANGUAGES_BY_NAME[canonical_name] = lang_code
-                else:
-                    canon_warn(canonical_name, lang_code0, lang_code)
-            else:
-                canonical_names[canonical_name] = lang_code
-                self.LANGUAGES_BY_NAME[canonical_name] = lang_code
-
-        # add other names
-        for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
-            for lang_name in lang_names[1:]:
-                if lang_name in canonical_names:
-                    lang_code0 = canonical_names[lang_name]
-                    alias_info(
-                        lang_name,
-                        lang_code,
-                        " canonical name",
-                        lang_code0,
-                        lang_code0,
-                        lang_code,
-                    )
-                    continue
-                if lang_name in self.LANGUAGES_BY_NAME:
-                    lang_code0 = self.LANGUAGES_BY_NAME[lang_name]
-                    if len(lang_code) < len(lang_code0):
-                        alias_info(
-                            lang_name,
-                            lang_code,
-                            "n alias",
-                            lang_code0,
-                            lang_code,
-                            lang_code0,
-                        )
-                        self.LANGUAGES_BY_NAME[lang_name] = lang_code
-                    else:
-                        alias_info(
-                            lang_name,
-                            lang_code,
-                            "n alias",
-                            lang_code0,
-                            lang_code0,
-                            lang_code,
-                        )
-                else:
-                    self.LANGUAGES_BY_NAME[lang_name] = lang_code
-
     def load_edition_settings(self):
         file_path = self.data_folder / "config.json"
         if file_path.exists():