Skip to content

Commit

Permalink
Merge pull request #393 from xxyzz/icu
Browse files Browse the repository at this point in the history
Remove `languages_by_code` `Wtp` class argument
  • Loading branch information
xxyzz authored Nov 7, 2023
2 parents 69357b4 + 6edbb3f commit a3665b8
Show file tree
Hide file tree
Showing 32 changed files with 122 additions and 89,071 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -408,8 +408,8 @@ The following command-line options can be used to control its operation:

* --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout)
* --all-languages: extract words for all available languages
* --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted)
* --list-languages: prints a list of supported language names
* --language-code LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; defaults to the dump file's language code and `mul` (Translingual))
* --language-name LANGUAGE_NAME: similar to `--language-code`, except that this option accepts a language name
* --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added)
* --all: causes all data to be captured for the selected languages
* --translations: causes translations to be captured
Expand Down
4 changes: 2 additions & 2 deletions json_schema/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
"lang_code": {
"description": "Wiktionary language code",
"type": ["string", "null"]
"type": "string"
},
"word": {
"description": "word string",
Expand Down Expand Up @@ -285,7 +285,7 @@
"properties": {
"lang_code": {
"description": "Wiktionary language code of the translation term",
"type": ["string", "null"]
"type": "string"
},
"lang_name": {
"description": "Translation language name",
Expand Down
78 changes: 1 addition & 77 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ class WiktionaryConfig:
"ZH_PRON_TAGS",
"FR_FORM_TABLES",
"DE_FORM_TABLES",
"LANGUAGES_BY_NAME",
"LANGUAGES_BY_CODE",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
Expand All @@ -60,7 +58,7 @@ class WiktionaryConfig:
def __init__(
self,
dump_file_lang_code="en",
capture_language_codes=["en", "mul"],
capture_language_codes={"en", "mul"},
capture_translations=True,
capture_pronunciation=True,
capture_linkages=True,
Expand Down Expand Up @@ -113,7 +111,6 @@ def __init__(
self.redirects = {}
self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
self.init_subtitles()
self.init_languages()
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
if dump_file_lang_code == "zh":
self.set_attr_from_json(
Expand Down Expand Up @@ -161,79 +158,6 @@ def init_subtitles(self) -> None:
assert isinstance(v["tags"], (list, tuple))
self.set_attr_from_json("OTHER_SUBTITLES", "other_subtitles.json")

def init_languages(self):
    """Build ``self.LANGUAGES_BY_NAME``, a language-name -> code index.

    ``self.set_attr_from_json`` is asked to fill ``self.LANGUAGES_BY_CODE``
    from ``languages.json``; that attribute is then iterated as a mapping
    of language code to a sequence of names whose first element is the
    canonical name and whose remaining elements are aliases.

    Conflict resolution, in priority order:

    * A canonical name always beats an alias of the same spelling.
    * Between two canonical names, or between two aliases, the strictly
      shorter language code wins; on equal length the code seen first
      keeps the name.

    Canonical-name conflicts are always printed; alias conflicts are
    printed only when ``self.verbose`` is true.

    Codes whose name list is empty contribute nothing and are skipped
    instead of raising ``IndexError``.
    """

    def canon_warn(name, use_code, not_use_code):
        print(
            f"WARNING: Non-unique language canonical name '{name}'."
            f" Mapping to '{use_code}' instead of '{not_use_code}'."
        )

    def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
        if self.verbose:
            print(
                f"Language alias '{name}' for code '{new_code}'"
                f" is already a{kind} for {old_code}."
                f" Mapping to '{use_code}' instead of '{not_use_code}'."
            )

    self.set_attr_from_json("LANGUAGES_BY_CODE", "languages.json")

    self.LANGUAGES_BY_NAME = {}

    # Pass 1: register canonical names first so that no alias can ever
    # overwrite them in pass 2.
    canonical_names = {}
    for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
        if not lang_names:
            # No canonical name to register; avoid IndexError on [0].
            continue
        canonical_name = lang_names[0]
        if canonical_name not in canonical_names:
            canonical_names[canonical_name] = lang_code
            self.LANGUAGES_BY_NAME[canonical_name] = lang_code
            continue

        lang_code0 = canonical_names[canonical_name]
        if len(lang_code) < len(lang_code0):
            # The shorter code takes over the canonical name.
            canon_warn(canonical_name, lang_code, lang_code0)
            canonical_names[canonical_name] = lang_code
            self.LANGUAGES_BY_NAME[canonical_name] = lang_code
        else:
            canon_warn(canonical_name, lang_code0, lang_code)

    # Pass 2: register aliases (every name after the canonical one).
    for lang_code, lang_names in self.LANGUAGES_BY_CODE.items():
        for lang_name in lang_names[1:]:
            if lang_name in canonical_names:
                # Alias collides with some language's canonical name:
                # the canonical mapping is kept untouched.
                lang_code0 = canonical_names[lang_name]
                alias_info(
                    lang_name,
                    lang_code,
                    " canonical name",
                    lang_code0,
                    lang_code0,
                    lang_code,
                )
                continue

            if lang_name not in self.LANGUAGES_BY_NAME:
                self.LANGUAGES_BY_NAME[lang_name] = lang_code
                continue

            # Alias collides with a previously registered alias.
            lang_code0 = self.LANGUAGES_BY_NAME[lang_name]
            if len(lang_code) < len(lang_code0):
                alias_info(
                    lang_name,
                    lang_code,
                    "n alias",
                    lang_code0,
                    lang_code,
                    lang_code0,
                )
                self.LANGUAGES_BY_NAME[lang_name] = lang_code
            else:
                alias_info(
                    lang_name,
                    lang_code,
                    "n alias",
                    lang_code0,
                    lang_code0,
                    lang_code,
                )

def load_edition_settings(self):
file_path = self.data_folder / "config.json"
if file_path.exists():
Expand Down
Loading

0 comments on commit a3665b8

Please sign in to comment.