From 52dccc1748d033ed836ccf78c5ba7886568d12c6 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 7 Nov 2023 10:34:57 +0800 Subject: [PATCH] Remove `LANGUAGES_BY_NAME` and `LANGUAGES_BY_CODE` use the mediawiki-langcodes package to convert language names and codes --- README.md | 1 - json_schema/zh.json | 4 +- src/wiktextract/config.py | 76 ------------------- src/wiktextract/extractor/de/example.py | 1 - src/wiktextract/extractor/de/gloss.py | 1 - src/wiktextract/extractor/de/linkage.py | 1 - src/wiktextract/extractor/de/page.py | 7 +- src/wiktextract/extractor/de/pronunciation.py | 4 +- src/wiktextract/extractor/de/translation.py | 10 +-- src/wiktextract/extractor/en/page.py | 69 ++++++++++------- src/wiktextract/extractor/en/thesaurus.py | 8 +- src/wiktextract/extractor/zh/page.py | 5 +- src/wiktextract/extractor/zh/thesaurus.py | 3 +- src/wiktextract/extractor/zh/translation.py | 3 +- src/wiktextract/lang_specific_configs.py | 2 +- src/wiktextract/page.py | 6 +- src/wiktextract/thesaurus.py | 3 +- src/wiktextract/translations.py | 20 ++--- src/wiktextract/wiktionary.py | 2 +- src/wiktextract/wiktwords.py | 43 ----------- tests/test_de_pronunciation.py | 4 - tests/test_de_translation.py | 6 -- tests/test_long.py | 9 +-- tests/test_translations.py | 4 +- tests/test_zh_translation.py | 2 +- 25 files changed, 86 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index ab978870..bb600b06 100644 --- a/README.md +++ b/README.md @@ -409,7 +409,6 @@ The following command-line options can be used to control its operation: * --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout) * --all-languages: extract words for all available languages * --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted) -* --list-languages: prints a list of supported language names * --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added) * --all: causes all data to be captured for the selected languages * --translations: causes translations to be captured diff --git a/json_schema/zh.json b/json_schema/zh.json index 1e4af6ae..9de1c86c 100644 --- a/json_schema/zh.json +++ b/json_schema/zh.json @@ -11,7 +11,7 @@ }, "lang_code": { "description": "Wiktionary language code", - "type": ["string", "null"] + "type": "string" }, "word": { "description": "word string", @@ -285,7 +285,7 @@ "properties": { "lang_code": { "description": "Wiktionary language code of the translation term", - "type": ["string", "null"] + "type": "string" }, "lang_name": { "description": "Translation language name", diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 9a24bf5b..41829c13 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -50,8 +50,6 @@ class WiktionaryConfig: "ZH_PRON_TAGS", "FR_FORM_TABLES", "DE_FORM_TABLES", - "LANGUAGES_BY_NAME", - "LANGUAGES_BY_CODE", "FORM_OF_TEMPLATES", "analyze_templates", "extract_thesaurus_pages", @@ -113,7 +111,6 @@ def __init__( self.redirects = {} self.data_folder = files("wiktextract") / "data" / dump_file_lang_code self.init_subtitles() - self.init_languages() self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json") if dump_file_lang_code == "zh": self.set_attr_from_json( @@ -161,79 +158,6 @@ def init_subtitles(self) -> None: assert isinstance(v["tags"], (list, tuple)) 
self.set_attr_from_json("OTHER_SUBTITLES", "other_subtitles.json") - def init_languages(self): - def canon_warn(name, use_code, not_use_code): - print( - f"WARNING: Non-unique language canonical name '{name}'." - f" Mapping to '{use_code}' instead of '{not_use_code}'." - ) - - def alias_info(name, new_code, kind, old_code, use_code, not_use_code): - if self.verbose: - print( - f"Language alias '{name}' for code '{new_code}'" - f" is already a{kind} for {old_code}." - f" Mapping to '{use_code}' instead of '{not_use_code}'." - ) - - self.set_attr_from_json("LANGUAGES_BY_CODE", "languages.json") - - self.LANGUAGES_BY_NAME = {} - - # add canonical names first to avoid overwriting them - canonical_names = {} - for lang_code, lang_names in self.LANGUAGES_BY_CODE.items(): - canonical_name = lang_names[0] - if canonical_name in canonical_names: - lang_code0 = canonical_names[canonical_name] - if len(lang_code) < len(lang_code0): - canon_warn(canonical_name, lang_code, lang_code0) - canonical_names[canonical_name] = lang_code - self.LANGUAGES_BY_NAME[canonical_name] = lang_code - else: - canon_warn(canonical_name, lang_code0, lang_code) - else: - canonical_names[canonical_name] = lang_code - self.LANGUAGES_BY_NAME[canonical_name] = lang_code - - # add other names - for lang_code, lang_names in self.LANGUAGES_BY_CODE.items(): - for lang_name in lang_names[1:]: - if lang_name in canonical_names: - lang_code0 = canonical_names[lang_name] - alias_info( - lang_name, - lang_code, - " canonical name", - lang_code0, - lang_code0, - lang_code, - ) - continue - if lang_name in self.LANGUAGES_BY_NAME: - lang_code0 = self.LANGUAGES_BY_NAME[lang_name] - if len(lang_code) < len(lang_code0): - alias_info( - lang_name, - lang_code, - "n alias", - lang_code0, - lang_code, - lang_code0, - ) - self.LANGUAGES_BY_NAME[lang_name] = lang_code - else: - alias_info( - lang_name, - lang_code, - "n alias", - lang_code0, - lang_code0, - lang_code, - ) - else: - self.LANGUAGES_BY_NAME[lang_name] = lang_code - def load_edition_settings(self): file_path = self.data_folder / "config.json" if file_path.exists(): diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py index da3268d3..9c7e247f 100644 --- a/src/wiktextract/extractor/de/example.py +++ b/src/wiktextract/extractor/de/example.py @@ -3,7 +3,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index 3caa7252..ad9183f3 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -4,7 +4,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/linkage.py b/src/wiktextract/extractor/de/linkage.py index eb4425e7..10de977e 100644 --- a/src/wiktextract/extractor/de/linkage.py +++ b/src/wiktextract/extractor/de/linkage.py @@ -3,7 +3,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.utils import split_senseids from wiktextract.page import clean_node from 
wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index d4353e48..8b276de2 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -3,9 +3,9 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.datautils import append_base_data from wiktextract.wxr_context import WiktextractContext @@ -263,13 +263,12 @@ def parse_page( # German name of the language of the section. if subtitle_template.template_name == "Sprache": lang_name = subtitle_template.template_parameters.get(1) - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name) - if not lang_code: + lang_code = name_to_code(lang_name, "de") + if lang_code == "": wxr.wtp.warning( f"Unknown language: {lang_name}", sortid="extractor/de/page/parse_page/76", ) - continue if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index e55adb13..2fb63e6d 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -1,9 +1,9 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -94,7 +94,7 @@ def process_lautschrift_template( lang_code = template_parameters.get("spr") if lang_code: - language = wxr.wtp.LANGUAGES_BY_CODE[lang_code] + language = code_to_name(lang_code, "de") add_sound_data_without_appending_to_existing_properties( sound_data, { diff --git a/src/wiktextract/extractor/de/translation.py b/src/wiktextract/extractor/de/translation.py index e39cb079..77e598e1 100644 --- a/src/wiktextract/extractor/de/translation.py +++ b/src/wiktextract/extractor/de/translation.py @@ -2,9 +2,9 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode - from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -101,12 +101,10 @@ def process_translation_list( lang_code = node.template_parameters.get(1) translation_data["code"] = lang_code - languages = wxr.wtp.LANGUAGES_BY_CODE.get(lang_code) - if languages: - translation_data["lang"] = languages[0] - else: + translation_data["lang"] = code_to_name(lang_code, "de") + if translation_data["lang"] == "": wxr.wtp.debug( f"Unknown language code: {lang_code}", sortid="extractor/de/translation/process_translation_list/70", ) if node.template_name[-1] == "?": diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index d2327f89..8c3d63c1 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -2,37 +2,47 @@ # # Copyright (c) 2018-2022 Tatu Ylonen.
See file LICENSE and https://ylonen.org -import re -import sys import copy import html import logging - +import re +import sys from collections import defaultdict from functools import partial from typing import Dict, List, Optional, Set, Union -from wikitextprocessor import WikiNode, NodeKind +from mediawiki_langcodes import get_all_names, name_to_code +from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.core import TemplateArgs -from wiktextract.wxr_context import WiktextractContext -from wiktextract.parts_of_speech import PARTS_OF_SPEECH -from wiktextract.linkages import parse_linkage_item_text -from wiktextract.translations import parse_translation_item_text from wiktextract.clean import clean_template_args -from wiktextract.datautils import data_append, data_extend, ns_title_prefix_tuple -from wiktextract.tags import valid_tags -from wiktextract.page import ( - clean_node, recursively_extract, LEVEL_KINDS, is_panel_template +from wiktextract.datautils import ( + data_append, + data_extend, + ns_title_prefix_tuple, ) - from wiktextract.form_descriptions import ( - decode_tags, parse_word_head, parse_sense_qualifier, - distw, parse_alt_or_inflection_of, classify_desc) -from wiktextract.inflection import parse_inflection_section, TableContext + classify_desc, + decode_tags, + distw, + parse_alt_or_inflection_of, + parse_sense_qualifier, + parse_word_head, +) +from wiktextract.inflection import TableContext, parse_inflection_section +from wiktextract.linkages import parse_linkage_item_text +from wiktextract.page import ( + LEVEL_KINDS, + clean_node, + is_panel_template, + recursively_extract, +) +from wiktextract.parts_of_speech import PARTS_OF_SPEECH +from wiktextract.tags import valid_tags +from wiktextract.translations import parse_translation_item_text +from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby, parse_ruby from ..share import strip_nodes - from .unsupported_titles import unsupported_title_map # Matches head tag @@ -532,7 +542,7 @@ def init_head_tag_re(wxr): r"^(head|Han char|arabic-noun|arabic-noun-form|" r"hangul-symbol|syllable-hangul)$|" + r"^(latin|" + - "|".join(wxr.wtp.LANGUAGES_BY_CODE) + r")-(" + + "|".join(lang_name for _, lang_name in get_all_names("en")) + r")-(" + "|".join([ "abbr", "adj", @@ -3356,7 +3366,6 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: """Fix subtitle hierarchy to be strict Language -> Etymology -> Part-of-Speech -> Translation/Linkage.""" - # Known language names are in languages_by_name # Known lowercase PoS names are in part_of_speech_map # Known lowercase linkage section names are in linkage_map @@ -3381,7 +3390,7 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: .format(title, left, right), sortid="page/2904") lc = title.lower() - if title in wxr.config.LANGUAGES_BY_NAME: + if name_to_code(title, "en") != "": if level > 2: wxr.wtp.debug("subtitle has language name {} at level {}" .format(title, level), @@ -3491,16 +3500,18 @@ def multitrans_post_fn(name, ht, text): # Some pages have links at top level, e.g., "trees" in Wiktionary continue if langnode.kind != NodeKind.LEVEL2: - wxr.wtp.debug("unexpected top-level node: {}".format(langnode), - sortid="page/3014") - continue - lang = clean_node(wxr, None, - langnode.sarg if langnode.sarg else langnode.largs) - if lang not in wxr.config.LANGUAGES_BY_NAME: - wxr.wtp.debug("unrecognized language name at top-level {!r}" - .format(lang), sortid="page/3019") + wxr.wtp.debug( + f"unexpected 
top-level node: {langnode}", sortid="page/3014" ) continue lang = clean_node( wxr, None, langnode.sarg if langnode.sarg else langnode.largs ) lang_code = name_to_code(lang, "en") if lang_code == "": wxr.wtp.debug( f"unrecognized language name: {lang}", sortid="page/3019" ) if ( wxr.config.capture_language_codes and lang_code not in wxr.config.capture_language_codes diff --git a/src/wiktextract/extractor/en/thesaurus.py b/src/wiktextract/extractor/en/thesaurus.py index db9d222b..bb3b28dc 100644 --- a/src/wiktextract/extractor/en/thesaurus.py +++ b/src/wiktextract/extractor/en/thesaurus.py @@ -7,8 +7,8 @@ import re from typing import List, Optional +from mediawiki_langcodes import code_to_name, name_to_code from wikitextprocessor import NodeKind, Page, WikiNode - from wiktextract.datautils import ns_title_prefix_tuple from wiktextract.form_descriptions import parse_sense_qualifier from wiktextract.page import LEVEL_KINDS, clean_node @@ -98,7 +98,7 @@ def extract_thesaurus_page( # {{ws header|lang=xx}} m = re.search(r"(?s)\{\{ws header\|[^}]*lang=([^}|]*)", text) if m: - lang = wxr.config.LANGUAGES_BY_CODE.get(m.group(1), [None])[0] + lang = code_to_name(m.group(1), "en") def recurse(contents) -> Optional[List[ThesaurusTerm]]: nonlocal lang @@ -197,7 +197,7 @@ def qual_fn(m): w1 = w1.removesuffix(" [⇒ thesaurus]") if w1: - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang) - if lang_code is None: + lang_code = name_to_code(lang, "en") + if lang_code == "": logging.debug( f"Linkage language {lang} not recognized" @@ -230,7 +230,7 @@ def qual_fn(m): subtitle = wxr.wtp.node_to_text( contents.sarg if contents.sarg else contents.largs ) - if subtitle in wxr.config.LANGUAGES_BY_NAME: + if name_to_code(subtitle, "en") != "": lang = subtitle pos = None sense = None diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py index f7c37d9b..95d84b7b 100644 --- a/src/wiktextract/extractor/zh/page.py +++ b/src/wiktextract/extractor/zh/page.py @@ -4,6 +4,7 @@ from collections import defaultdict from typing import Dict, List, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wiktextract.datautils import append_base_data from wiktextract.page import LEVEL_KINDS, clean_node @@ -212,12 +213,12 @@ def parse_page( for level2_node in tree.find_child(NodeKind.LEVEL2): categories_and_links = defaultdict(list) lang_name = clean_node(wxr, categories_and_links, level2_node.largs) - if lang_name not in wxr.config.LANGUAGES_BY_NAME: + if name_to_code(lang_name, "zh") == "": wxr.wtp.warning( f"Unrecognized language name: {lang_name}", sortid="extractor/zh/page/parse_page/509", ) - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name) + lang_code = name_to_code(lang_name, "zh") if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes diff --git a/src/wiktextract/extractor/zh/thesaurus.py b/src/wiktextract/extractor/zh/thesaurus.py index 230364e9..8692f3c2 100644 --- a/src/wiktextract/extractor/zh/thesaurus.py +++ b/src/wiktextract/extractor/zh/thesaurus.py @@ -2,6 +2,7 @@ import re from typing import List, Optional, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, Page, WikiNode from ...page import clean_node @@ -161,7 +162,7 @@ def recursive_parse( if node.kind == NodeKind.LEVEL2: lang_name = clean_node(wxr, None, node.largs) - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name) - if lang_code is None: + lang_code = name_to_code(lang_name, "zh") + if lang_code == "": logging.warning( f"Unrecognized language: {lang_name} in page Thesaurus:{entry}" diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 25959644..4b461dab 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -2,6 +2,7 @@ from collections import defaultdict from typing import Dict, List, Optional, Union +from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wiktextract.datautils import find_similar_gloss from wiktextract.page import LEVEL_KINDS, clean_node @@ -100,7 +101,7 @@ def process_translation_list_item( words_text = words_text.strip() if len(words_text) == 0: return - lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_text) + lang_code = name_to_code(lang_text, "zh") # split words by `,` or `;` that are not inside `()` for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text): diff --git a/src/wiktextract/lang_specific_configs.py b/src/wiktextract/lang_specific_configs.py index 2f091d0e..2b2c51bb 100644 --- a/src/wiktextract/lang_specific_configs.py +++ b/src/wiktextract/lang_specific_configs.py @@ -3,7 +3,7 @@ # parsing. import re -# from wiktextract.datautils import languages_by_name + from wiktextract.tags import valid_tags, tag_categories from wiktextract.parts_of_speech import PARTS_OF_SPEECH diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index d7d43c4d..648666da 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -7,8 +7,8 @@ from copy import copy from typing import Callable, Dict, List, Optional, Tuple, Union +from mediawiki_langcodes import get_all_names, name_to_code from wikitextprocessor import NodeKind, WikiNode - from wiktextract.wxr_context import WiktextractContext from .clean import clean_value @@ -248,7 +248,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: r"^(" + wxr.wtp.NAMESPACE_DATA.get("Rhymes", {}).get("name", "") + ":)?(" - + "|".join(re.escape(x) for x in wxr.config.LANGUAGES_BY_NAME) + + "|".join(re.escape(x) for _, x in get_all_names("en")) + ")[ /]?"
) # Remove category links that start with a language name from entries for @@ -261,7 +261,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: m = re.match(starts_lang_re, cat) if m: catlang = m.group(2) - catlang_code = wxr.config.LANGUAGES_BY_NAME.get(catlang) + catlang_code = name_to_code(catlang, "en") if catlang_code != lang_code and not ( catlang_code == "en" and data.get("lang_code") == "mul" ): diff --git a/src/wiktextract/thesaurus.py b/src/wiktextract/thesaurus.py index 919d40dd..8df54130 100644 --- a/src/wiktextract/thesaurus.py +++ b/src/wiktextract/thesaurus.py @@ -14,6 +14,7 @@ from pathlib import Path from typing import List, Optional, Set, TextIO, Tuple +from mediawiki_langcodes import code_to_name from wikitextprocessor import Page from .import_utils import import_extractor_module @@ -281,7 +282,7 @@ def emit_words_in_thesaurus( entry = { "word": entry, - "lang": wxr.config.LANGUAGES_BY_CODE.get(lang_code)[0], + "lang": code_to_name(lang_code, "en"), "lang_code": lang_code, "pos": pos, "senses": [sense_dict] if sense_dict else [], diff --git a/src/wiktextract/translations.py b/src/wiktextract/translations.py index ef858e68..287d41c6 100644 --- a/src/wiktextract/translations.py +++ b/src/wiktextract/translations.py @@ -4,6 +4,8 @@ import re import copy + +from mediawiki_langcodes import code_to_name, name_to_code from wiktextract.wxr_context import WiktextractContext from wikitextprocessor import MAGIC_FIRST, MAGIC_LAST @@ -329,12 +331,12 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, # name from the higher level, and some append a language variant # name to a broader language name) extra_langcodes = set() - if lang and lang in wxr.config.LANGUAGES_BY_NAME: - extra_langcodes.add(wxr.config.LANGUAGES_BY_NAME[lang]) + if lang and name_to_code(lang, "en") != "": + lang_code = name_to_code(lang, "en") + extra_langcodes.add(lang_code) # Canonicalize language name (we could have gotten it via # alias or other_names) - lang = wxr.config.LANGUAGES_BY_CODE[wxr.config.LANGUAGES_BY_NAME[lang]][0] - assert lang + lang = code_to_name(lang_code, "en") m = re.match(r"\*?\s*([-' \w][-'&, \w()]*)[::]\s*", item) tags = [] if m: @@ -359,7 +361,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, return None lang = sublang elif (lang_sublang and - any((captured_lang := lang_comb) in wxr.config.LANGUAGES_BY_NAME + any(name_to_code(captured_lang := lang_comb, "en") != "" # Python 3.8: catch the value of lang_comb with := for lang_comb in language_name_variations) ): @@ -377,7 +379,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, # separate language codes, so additional langcode # removal tricks may need to be played below. tags.extend(tr_second_tagmap[sublang].split()) - elif sublang in wxr.config.LANGUAGES_BY_NAME: + elif name_to_code(sublang, "en") != "": lang = sublang elif sublang[0].isupper() and classify_desc(sublang) == "tags": # Interpret it as a tag @@ -393,7 +395,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, elif lang is None: # No mathing language prefix. Try if it is missing colon.
parts = item.split() - if len(parts) > 1 and parts[0] in wxr.config.LANGUAGES_BY_NAME: + if len(parts) > 1 and name_to_code(parts[0], "en") != "": lang = parts[0] item = " ".join(parts[1:]) else: @@ -407,8 +409,8 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, # If we didn't get language code from the template, look it up # based on language name - if langcode is None and lang in wxr.config.LANGUAGES_BY_NAME: - langcode = wxr.config.LANGUAGES_BY_NAME[lang] + if langcode is None and name_to_code(lang, "en") != "": + langcode = name_to_code(lang, "en") # Remove () parts from the item. They seem to be # generated by {{t+|...}}. diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py index 3a41b3c8..279f75f5 100644 --- a/src/wiktextract/wiktionary.py +++ b/src/wiktextract/wiktionary.py @@ -225,7 +225,7 @@ def reprocess_wiktionary( last_time = estimate_progress( processed_pages, all_page_nums, start_time, last_time ) - if wxr.config.extract_thesaurus_pages: + if wxr.config.dump_file_lang_code == "en": emit_words_in_thesaurus(wxr, emitted, out_f, human_readable) logging.info("Reprocessing wiktionary complete") diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 527556f2..be24b828 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -138,12 +138,6 @@ def main(): default=False, help="Extract words for all languages", ) - parser.add_argument( - "--list-languages", - action="store_true", - default=False, - help="Print list of supported languages", - ) parser.add_argument( "--pages-dir", type=str, @@ -379,43 +373,6 @@ def main(): expand_tables=args.inflection_tables_file, ) - if args.language: - new_lang_codes = [] - for x in args.language: - if x not in conf1.LANGUAGES_BY_CODE: - if x in conf1.LANGUAGES_BY_NAME: - new_lang_codes.append(conf1.LANGUAGES_BY_NAME[x]) - else: - logging.error(f"Invalid language: {x}") - sys.exit(1) - else: - new_lang_codes.append(x) - conf1.capture_language_codes = new_lang_codes - - if args.language: - lang_names = [] - for x in args.language: - if x in conf1.LANGUAGES_BY_CODE: - lang_names.extend(conf1.LANGUAGES_BY_CODE[x]) - else: - lang_names.extend( - conf1.LANGUAGES_BY_CODE[conf1.LANGUAGES_BY_NAME[x]] - ) - - lang_names = [re.escape(x) for x in lang_names] - lang_names_re = r"==\s*(" - lang_names_re += "|".join(lang_names) - lang_names_re += r")" - lang_names_re = re.compile(lang_names_re) - - # If --list-languages has been specified, just print the list of supported - # languages - if args.list_languages: - print("Supported languages:") - for lang_name, lang_code in conf1.LANGUAGES_BY_NAME.items(): - print(f" {lang_name}: {lang_code}") - sys.exit(0) - if not args.path and not args.db_path: print( "The PATH argument for wiktionary dump file is normally mandatory." 
diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index 6786ab00..cc9419a8 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -60,8 +60,6 @@ def test_de_process_ipa(self): self.wxr.wtp.add_page("Vorlage:IPA", 10, "") self.wxr.wtp.add_page("Vorlage:Lautschrift", 10, "(Deutsch)") - self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch" - root = self.wxr.wtp.parse(case["input"]) sound_data = [defaultdict(list)] @@ -138,8 +136,6 @@ def test_de_process_hoerbeispiele(self): self.wxr.wtp.add_page("Vorlage:IPA", 10, "") self.wxr.wtp.add_page("Vorlage:Audio", 10, "") - self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch" - root = self.wxr.wtp.parse(case["input"]) sound_data = [defaultdict(list)] diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py index 2db61cd8..7ddf5418 100644 --- a/tests/test_de_translation.py +++ b/tests/test_de_translation.py @@ -18,12 +18,6 @@ def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") ) - self.wxr.wtp.LANGUAGES_BY_CODE["en"] = ["Englisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["hy"] = ["Armenisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["ru"] = ["Russisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["fr"] = ["Französisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["ar"] = ["Arabisch"] - self.wxr.wtp.LANGUAGES_BY_CODE["la"] = ["Latein"] def tearDown(self) -> None: self.wxr.wtp.close_db_conn() diff --git a/tests/test_long.py b/tests/test_long.py index 83d356f4..4942b3cf 100644 --- a/tests/test_long.py +++ b/tests/test_long.py @@ -99,10 +99,7 @@ def test_long(self): sum(words.values()), sum(poses.values()) + num_redirects ) self.assertGreater(num_transl, 0) - thesaurus_data = [ - data - for data in search_thesaurus( - self.wxr.thesaurus_db_conn, "hieno", "fi", "adj" - ) - ] + thesaurus_data = list(search_thesaurus( + self.wxr.thesaurus_db_conn, "hieno", "fi", "adj" + )) self.assertEqual(len(thesaurus_data), 17) diff --git a/tests/test_translations.py b/tests/test_translations.py index 14c84946..0973e196 100644 --- a/tests/test_translations.py +++ b/tests/test_translations.py @@ -132,8 +132,8 @@ def test_tr7(self): def test_tr8(self): data = self.runtr("Mandarin: 是 (rrr)", lang="Chinese") self.assertEqual(data, {"translations": [ - {"word": "是", "roman": "rrr", "lang": "Mandarin Chinese", - "code": "cmn", + {"word": "是", "roman": "rrr", "lang": "Chinese Mandarin", + "code": "zh", }]}) def test_tr9(self): diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index a3ba267b..d7d63771 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -52,7 +52,7 @@ def test_normal(self, mock_get_page) -> None: "word": "këtu", }, { - "lang_code": None, + "lang_code": "", "lang_name": "西阿帕切語", "sense": "靠近說話者的地方", "word": "kú",
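
For reference, the mediawiki-langcodes behavior this patch relies on throughout: name_to_code(name, wiki_lang_code) resolves a language name against the given Wiktionary edition and returns an empty string (not None) for unknown names; code_to_name(code, wiki_lang_code) is the reverse lookup, likewise returning "" when the code is unknown; get_all_names(wiki_lang_code) yields (code, name) pairs for that edition. A minimal usage sketch under those assumptions; the example inputs and expected outputs are illustrative, inferred from the checks in this patch rather than verified against the package:

    from mediawiki_langcodes import code_to_name, get_all_names, name_to_code

    # Name -> code, resolved against one Wiktionary edition ("en", "de", "zh", ...).
    # Unknown names yield "" rather than None, hence the `== ""` checks above.
    print(name_to_code("Deutsch", "de"))         # expected: "de"
    print(name_to_code("not a language", "en"))  # expected: ""

    # Code -> localized language name; likewise "" when the code is unknown.
    print(code_to_name("de", "de"))              # expected: "Deutsch"

    # (code, name) pairs for an edition, as used above to build the head-tag
    # regex in extractor/en/page.py and the category regex in page.py.
    for code, name in list(get_all_names("en"))[:3]:
        print(code, name)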