From 063d2f77f8991ad30760564cf39c4f02ac91f38e Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 20 Oct 2023 09:54:55 +0800 Subject: [PATCH] Append default override page JSON file to the "--override" option Chinese and German Wiktionary extractors require some pages to be overridden. --- overrides/.gitignore | 0 .../wiktextract/data/overrides}/de.json | 0 .../wiktextract/data/overrides}/zh.json | 0 src/wiktextract/wiktwords.py | 22 ++++++++++++++++--- 4 files changed, 19 insertions(+), 3 deletions(-) delete mode 100644 overrides/.gitignore rename {overrides => src/wiktextract/data/overrides}/de.json (100%) rename {overrides => src/wiktextract/data/overrides}/zh.json (100%) diff --git a/overrides/.gitignore b/overrides/.gitignore deleted file mode 100644 index e69de29b..00000000 diff --git a/overrides/de.json b/src/wiktextract/data/overrides/de.json similarity index 100% rename from overrides/de.json rename to src/wiktextract/data/overrides/de.json diff --git a/overrides/zh.json b/src/wiktextract/data/overrides/zh.json similarity index 100% rename from overrides/zh.json rename to src/wiktextract/data/overrides/zh.json diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 936a51fe..de4c9a36 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -19,6 +19,11 @@ from pathlib import Path from typing import TextIO +if sys.version_info < (3, 10): + from importlib_resources import files +else: + from importlib.resources import files + from wikitextprocessor import Wtp from wikitextprocessor.dumpparser import analyze_and_overwrite_pages @@ -246,8 +251,7 @@ def main(): "--override", type=str, action="append", - help="Override module(s) by one in file or files in directory " - "(for debugging)", + help="Path of JSON file contains override page data", ) parser.add_argument( "--use-thesaurus", @@ -439,8 +443,20 @@ def main(): pr = cProfile.Profile() pr.enable() + skip_extract_dump = wxr.wtp.saved_page_nums() > 0 + 
default_override_json_path = ( + files("wiktextract") + / "data" + / "overrides" + / f"{args.dump_file_language_code}.json" + ) + if default_override_json_path.exists() and not skip_extract_dump: + if args.override is None: + args.override = [default_override_json_path] + elif default_override_json_path not in args.override: + args.override.append(default_override_json_path) + try: - skip_extract_dump = wxr.wtp.saved_page_nums() > 0 if args.path is not None: namespace_ids = { wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")