diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 1680902aa..0f75767d1 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -50,6 +50,8 @@ class WiktionaryConfig: "ZH_PRON_TAGS", "analyze_templates", "extract_thesaurus_pages", + "save_ns_names", + "extract_ns_names", ) def __init__( @@ -111,6 +113,20 @@ def __init__( self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json") self.analyze_templates = True # find templates that need pre-expand self.extract_thesaurus_pages = True + # these namespace pages will be copied from the XML dump file and + # saved to a SQLite db file + self.save_ns_names = [ + "Main", + "Category", # do we use this? + "Appendix", + "Project", + "Thesaurus", + "Module", + "Template", + "Reconstruction", + ] + # these are extracted namespaces + self.extract_ns_names = ["Main", "Reconstruction"] self.load_edition_settings() def merge_return(self, ret: CollatedErrorReturnData): diff --git a/src/wiktextract/data/fr/config.json b/src/wiktextract/data/fr/config.json index 91a7ba446..eb4e717c6 100644 --- a/src/wiktextract/data/fr/config.json +++ b/src/wiktextract/data/fr/config.json @@ -1,4 +1,6 @@ { "analyze_templates": false, - "extract_thesaurus_pages": false + "extract_thesaurus_pages": false, + "save_ns_names": ["Main", "Template", "Module", "Conjugaison"], + "extract_ns_names": ["Main"] } diff --git a/src/wiktextract/extractor/fr/linkage.py b/src/wiktextract/extractor/fr/linkage.py index 77eb516bb..55e8e390d 100644 --- a/src/wiktextract/extractor/fr/linkage.py +++ b/src/wiktextract/extractor/fr/linkage.py @@ -70,7 +70,7 @@ def process_linkage_list( sense_index_text = template_or_list_node.template_parameters.get( 2, "0" ) - if sense_index_text.isdigit(): + if isinstance(sense_index_text, str) and sense_index_text.isdigit(): sense_index = int(sense_index_text) continue # sense could also be in ";" description list diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index 11221d04a..cdb65980f 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ b/src/wiktextract/extractor/fr/translation.py @@ -80,9 +80,9 @@ def process_translation_templates( sense_parameter = template_node.template_parameters.get(1, "") sense_text = clean_node(wxr, None, sense_parameter) base_translation_data.sense = sense_text - base_translation_data.sense_index = int( - template_node.template_parameters.get(2, "0") - ) + sense_index_str = template_node.template_parameters.get(2, "0") + if isinstance(sense_index_str, str) and sense_index_str.isdigit(): + base_translation_data.sense_index = int(sense_index_str) elif template_node.template_name == "T": # Translation language: https://fr.wiktionary.org/wiki/Modèle:T diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py index 6b6e93138..6e3c68789 100644 --- a/src/wiktextract/wiktionary.py +++ b/src/wiktextract/wiktionary.py @@ -184,7 +184,7 @@ def reprocess_wiktionary( process_ns_ids = list( { wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0) - for ns in ["Main", "Reconstruction"] + for ns in wxr.config.extract_ns_names } ) start_time = time.time() diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 6a896d178..ef1702b49 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -45,18 +45,6 @@ from wiktextract.wiktionary import write_json_data from wiktextract.wxr_context import WiktextractContext -# Pages within these namespaces are captured. -RECOGNIZED_NAMESPACE_NAMES = [ - "Main", - "Category", - "Appendix", - "Project", - "Thesaurus", - "Module", - "Template", - "Reconstruction", -] - def process_single_page( path_or_title: str, @@ -438,8 +426,8 @@ def main(): try: if args.path is not None: namespace_ids = { - wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id") - for name in RECOGNIZED_NAMESPACE_NAMES + wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id", 0) + for name in wxr.config.save_ns_names } # Parse the normal full Wiktionary data dump parse_wiktionary(