Skip to content

Commit

Permalink
Remove LANGUAGES_BY_NAME and LANGUAGES_BY_CODE
Browse files Browse the repository at this point in the history
Use the mediawiki-langcodes package to convert between language names and
language codes.
  • Loading branch information
xxyzz committed Nov 7, 2023
1 parent 46f5213 commit 52dccc1
Show file tree
Hide file tree
Showing 25 changed files with 86 additions and 208 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,6 @@ The following command-line options can be used to control its operation:
* --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout)
* --all-languages: extract words for all available languages
* --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted)
* --list-languages: prints a list of supported language names
* --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added)
* --all: causes all data to be captured for the selected languages
* --translations: causes translations to be captured
Expand Down
4 changes: 2 additions & 2 deletions json_schema/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
"lang_code": {
"description": "Wiktionary language code",
"type": ["string", "null"]
"type": "string"
},
"word": {
"description": "word string",
Expand Down Expand Up @@ -285,7 +285,7 @@
"properties": {
"lang_code": {
"description": "Wiktionary language code of the translation term",
"type": ["string", "null"]
"type": "string"
},
"lang_name": {
"description": "Translation language name",
Expand Down
76 changes: 0 additions & 76 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ class WiktionaryConfig:
"ZH_PRON_TAGS",
"FR_FORM_TABLES",
"DE_FORM_TABLES",
"LANGUAGES_BY_NAME",
"LANGUAGES_BY_CODE",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
Expand Down Expand Up @@ -113,7 +111,6 @@ def __init__(
self.redirects = {}
self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
self.init_subtitles()
self.init_languages()
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
if dump_file_lang_code == "zh":
self.set_attr_from_json(
Expand Down Expand Up @@ -161,79 +158,6 @@ def init_subtitles(self) -> None:
assert isinstance(v["tags"], (list, tuple))
self.set_attr_from_json("OTHER_SUBTITLES", "other_subtitles.json")

def init_languages(self) -> None:
    """Load the language-code table and build the inverse name -> code map.

    Reads ``languages.json`` into ``self.LANGUAGES_BY_CODE`` (presumably a
    mapping of language code to a list of names whose first entry is the
    canonical name — TODO confirm against languages.json) and derives
    ``self.LANGUAGES_BY_NAME``.  Canonical names take priority over
    aliases; when two codes claim the same name, the shorter code wins.
    """

    def canon_warn(name, use_code, not_use_code):
        # Unconditional warning: two language codes share one canonical name.
        print(
            f"WARNING: Non-unique language canonical name '{name}'."
            f" Mapping to '{use_code}' instead of '{not_use_code}'."
        )

    def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
        # Verbose-only note about an alias colliding with an existing entry.
        # ``kind`` is " canonical name" or "n alias" so the message reads
        # "...is already a canonical name..." / "...is already an alias...".
        if self.verbose:
            print(
                f"Language alias '{name}' for code '{new_code}'"
                f" is already a{kind} for {old_code}."
                f" Mapping to '{use_code}' instead of '{not_use_code}'."
            )

    self.set_attr_from_json("LANGUAGES_BY_CODE", "languages.json")

    self.LANGUAGES_BY_NAME = {}

    # Pass 1: add canonical names first so aliases cannot overwrite them.
    canonical_names = {}
    for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
        canonical_name = lang_names[0]
        if canonical_name in canonical_names:
            lang_code0 = canonical_names[canonical_name]
            if len(lang_code) < len(lang_code0):
                # Collision: the shorter code wins; replace and warn.
                canon_warn(canonical_name, lang_code, lang_code0)
                canonical_names[canonical_name] = lang_code
                self.LANGUAGES_BY_NAME[canonical_name] = lang_code
            else:
                # Keep the previously stored (shorter or equal) code.
                canon_warn(canonical_name, lang_code0, lang_code)
        else:
            canonical_names[canonical_name] = lang_code
            self.LANGUAGES_BY_NAME[canonical_name] = lang_code

    # Pass 2: add the remaining (non-canonical) names as aliases.
    for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
        for lang_name in lang_names[1:]:
            if lang_name in canonical_names:
                # Never let an alias shadow a canonical name.
                lang_code0 = canonical_names[lang_name]
                alias_info(
                    lang_name,
                    lang_code,
                    " canonical name",
                    lang_code0,
                    lang_code0,
                    lang_code,
                )
                continue
            if lang_name in self.LANGUAGES_BY_NAME:
                # Alias vs. alias collision: again the shorter code wins.
                lang_code0 = self.LANGUAGES_BY_NAME[lang_name]
                if len(lang_code) < len(lang_code0):
                    alias_info(
                        lang_name,
                        lang_code,
                        "n alias",
                        lang_code0,
                        lang_code,
                        lang_code0,
                    )
                    self.LANGUAGES_BY_NAME[lang_name] = lang_code
                else:
                    alias_info(
                        lang_name,
                        lang_code,
                        "n alias",
                        lang_code0,
                        lang_code0,
                        lang_code,
                    )
            else:
                self.LANGUAGES_BY_NAME[lang_name] = lang_code

def load_edition_settings(self):
file_path = self.data_folder / "config.json"
if file_path.exists():
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/de/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/de/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import split_senseids
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down
7 changes: 3 additions & 4 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.datautils import append_base_data
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -263,13 +263,12 @@ def parse_page(
# German name of the language of the section.
if subtitle_template.template_name == "Sprache":
lang_name = subtitle_template.template_parameters.get(1)
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
if not lang_code:
lang_code = name_to_code(lang_name, "de")
if lang_code == "":
wxr.wtp.warning(
f"Unknown language: {lang_name}",
sortid="extractor/de/page/parse_page/76",
)
continue
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import code_to_name
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down Expand Up @@ -94,7 +94,7 @@ def process_lautschrift_template(

lang_code = template_parameters.get("spr")
if lang_code:
language = wxr.wtp.LANGUAGES_BY_CODE[lang_code]
language = code_to_name(lang_code, "de")
add_sound_data_without_appending_to_existing_properties(
sound_data,
{
Expand Down
10 changes: 4 additions & 6 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import code_to_name
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -101,12 +101,10 @@ def process_translation_list(

lang_code = node.template_parameters.get(1)
translation_data["code"] = lang_code
languages = wxr.wtp.LANGUAGES_BY_CODE.get(lang_code)
if languages:
translation_data["lang"] = languages[0]
else:
translation_data["lang"] = code_to_name(lang_code, "de")
if translation_data["lang"] == "":
wxr.wtp.debug(
f"Unknown language code: {lang_code}",
f"Unknown language code: {translation_data['lang']}",
sortid="extractor/de/translation/process_translation_list/70",
)
if node.template_name[-1] == "?":
Expand Down
69 changes: 40 additions & 29 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,47 @@
#
# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org

import re
import sys
import copy
import html
import logging

import re
import sys
from collections import defaultdict
from functools import partial
from typing import Dict, List, Optional, Set, Union

from wikitextprocessor import WikiNode, NodeKind
from mediawiki_langcodes import get_all_names, name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.core import TemplateArgs
from wiktextract.wxr_context import WiktextractContext
from wiktextract.parts_of_speech import PARTS_OF_SPEECH
from wiktextract.linkages import parse_linkage_item_text
from wiktextract.translations import parse_translation_item_text
from wiktextract.clean import clean_template_args
from wiktextract.datautils import data_append, data_extend, ns_title_prefix_tuple
from wiktextract.tags import valid_tags
from wiktextract.page import (
clean_node, recursively_extract, LEVEL_KINDS, is_panel_template
from wiktextract.datautils import (
data_append,
data_extend,
ns_title_prefix_tuple,
)

from wiktextract.form_descriptions import (
decode_tags, parse_word_head, parse_sense_qualifier,
distw, parse_alt_or_inflection_of, classify_desc)
from wiktextract.inflection import parse_inflection_section, TableContext
classify_desc,
decode_tags,
distw,
parse_alt_or_inflection_of,
parse_sense_qualifier,
parse_word_head,
)
from wiktextract.inflection import TableContext, parse_inflection_section
from wiktextract.linkages import parse_linkage_item_text
from wiktextract.page import (
LEVEL_KINDS,
clean_node,
is_panel_template,
recursively_extract,
)
from wiktextract.parts_of_speech import PARTS_OF_SPEECH
from wiktextract.tags import valid_tags
from wiktextract.translations import parse_translation_item_text
from wiktextract.wxr_context import WiktextractContext

from ..ruby import extract_ruby, parse_ruby
from ..share import strip_nodes

from .unsupported_titles import unsupported_title_map

# Matches head tag
Expand Down Expand Up @@ -532,7 +542,7 @@ def init_head_tag_re(wxr):
r"^(head|Han char|arabic-noun|arabic-noun-form|"
r"hangul-symbol|syllable-hangul)$|" +
r"^(latin|" +
"|".join(wxr.wtp.LANGUAGES_BY_CODE) + r")-(" +
"|".join(lang_name for _, lang_name in get_all_names("en")) + r")-(" +
"|".join([
"abbr",
"adj",
Expand Down Expand Up @@ -3356,7 +3366,6 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
"""Fix subtitle hierarchy to be strict Language -> Etymology ->
Part-of-Speech -> Translation/Linkage."""

# Known language names are in languages_by_name
# Known lowercase PoS names are in part_of_speech_map
# Known lowercase linkage section names are in linkage_map

Expand All @@ -3381,7 +3390,7 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
.format(title, left, right),
sortid="page/2904")
lc = title.lower()
if title in wxr.config.LANGUAGES_BY_NAME:
if name_to_code(title, "en") != "":
if level > 2:
wxr.wtp.debug("subtitle has language name {} at level {}"
.format(title, level),
Expand Down Expand Up @@ -3491,16 +3500,18 @@ def multitrans_post_fn(name, ht, text):
# Some pages have links at top level, e.g., "trees" in Wiktionary
continue
if langnode.kind != NodeKind.LEVEL2:
wxr.wtp.debug("unexpected top-level node: {}".format(langnode),
sortid="page/3014")
continue
lang = clean_node(wxr, None,
langnode.sarg if langnode.sarg else langnode.largs)
if lang not in wxr.config.LANGUAGES_BY_NAME:
wxr.wtp.debug("unrecognized language name at top-level {!r}"
.format(lang), sortid="page/3019")
wxr.wtp.debug(
f"unexpected top-level node: {langnode}", sortid="page/3014"
)
continue
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang)
lang = clean_node(
wxr, None, langnode.sarg if langnode.sarg else langnode.largs
)
lang_code = name_to_code(lang, "en")
if lang_code == "":
wxr.wtp.debug(
f"unrecognized language name: {lang}", sortid="page/3019"
)
if (
wxr.config.capture_language_codes
and lang_code not in wxr.config.capture_language_codes
Expand Down
8 changes: 4 additions & 4 deletions src/wiktextract/extractor/en/thesaurus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import re
from typing import List, Optional

from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import NodeKind, Page, WikiNode

from wiktextract.datautils import ns_title_prefix_tuple
from wiktextract.form_descriptions import parse_sense_qualifier
from wiktextract.page import LEVEL_KINDS, clean_node
Expand Down Expand Up @@ -98,7 +98,7 @@ def extract_thesaurus_page(
# {{ws header|lang=xx}}
m = re.search(r"(?s)\{\{ws header\|[^}]*lang=([^}|]*)", text)
if m:
lang = wxr.config.LANGUAGES_BY_CODE.get(m.group(1), [None])[0]
lang = code_to_name(m.group(1), "en")

def recurse(contents) -> Optional[List[ThesaurusTerm]]:
nonlocal lang
Expand Down Expand Up @@ -197,7 +197,7 @@ def qual_fn(m):
w1 = w1.removesuffix(" [⇒ thesaurus]")

if w1:
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang)
lang_code = name_to_code(lang, "en")
if lang_code is None:
logging.debug(
f"Linkage language {lang} not recognized"
Expand Down Expand Up @@ -230,7 +230,7 @@ def qual_fn(m):
subtitle = wxr.wtp.node_to_text(
contents.sarg if contents.sarg else contents.largs
)
if subtitle in wxr.config.LANGUAGES_BY_NAME:
if name_to_code(subtitle, "en") != "":
lang = subtitle
pos = None
sense = None
Expand Down
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections import defaultdict
from typing import Dict, List, Union

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wiktextract.datautils import append_base_data
from wiktextract.page import LEVEL_KINDS, clean_node
Expand Down Expand Up @@ -212,12 +213,12 @@ def parse_page(
for level2_node in tree.find_child(NodeKind.LEVEL2):
categories_and_links = defaultdict(list)
lang_name = clean_node(wxr, categories_and_links, level2_node.largs)
if lang_name not in wxr.config.LANGUAGES_BY_NAME:
if name_to_code(lang_name, "zh") == "":
wxr.wtp.warning(
f"Unrecognized language name: {lang_name}",
sortid="extractor/zh/page/parse_page/509",
)
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
lang_code = name_to_code(lang_name, "zh")
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
Expand Down
Loading

0 comments on commit 52dccc1

Please sign in to comment.