Skip to content

Commit

Permalink
Merge pull request #443 from xxyzz/ns
Browse files Browse the repository at this point in the history
Add two new edition configuration options
  • Loading branch information
xxyzz authored Dec 29, 2023
2 parents 649088f + 2e978d5 commit e0524ae
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 20 deletions.
16 changes: 16 additions & 0 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class WiktionaryConfig:
"ZH_PRON_TAGS",
"analyze_templates",
"extract_thesaurus_pages",
"save_ns_names",
"extract_ns_names",
)

def __init__(
Expand Down Expand Up @@ -111,6 +113,20 @@ def __init__(
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
# pages in these namespaces will be copied from the XML dump file and
# saved to a SQLite db file
self.save_ns_names = [
"Main",
"Category", # do we use this?
"Appendix",
"Project",
"Thesaurus",
"Module",
"Template",
"Reconstruction",
]
# pages in these namespaces are the ones processed for data extraction
self.extract_ns_names = ["Main", "Reconstruction"]
self.load_edition_settings()

def merge_return(self, ret: CollatedErrorReturnData):
Expand Down
4 changes: 3 additions & 1 deletion src/wiktextract/data/fr/config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
"extract_thesaurus_pages": false,
"save_ns_names": ["Main", "Template", "Module", "Conjugaison"],
"extract_ns_names": ["Main"]
}
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def process_linkage_list(
sense_index_text = template_or_list_node.template_parameters.get(
2, "0"
)
if sense_index_text.isdigit():
if isinstance(sense_index_text, str) and sense_index_text.isdigit():
sense_index = int(sense_index_text)
continue
# sense could also be in ";" description list
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ def process_translation_templates(
sense_parameter = template_node.template_parameters.get(1, "")
sense_text = clean_node(wxr, None, sense_parameter)
base_translation_data.sense = sense_text
base_translation_data.sense_index = int(
template_node.template_parameters.get(2, "0")
)
sense_index_str = template_node.template_parameters.get(2, "0")
if isinstance(sense_index_str, str) and sense_index_str.isdigit():
base_translation_data.sense_index = int(sense_index_str)

elif template_node.template_name == "T":
# Translation language: https://fr.wiktionary.org/wiki/Modèle:T
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def reprocess_wiktionary(
process_ns_ids = list(
{
wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)
for ns in ["Main", "Reconstruction"]
for ns in wxr.config.extract_ns_names
}
)
start_time = time.time()
Expand Down
16 changes: 2 additions & 14 deletions src/wiktextract/wiktwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,6 @@
from wiktextract.wiktionary import write_json_data
from wiktextract.wxr_context import WiktextractContext

# Pages within these namespaces are captured.
RECOGNIZED_NAMESPACE_NAMES = [
"Main",
"Category",
"Appendix",
"Project",
"Thesaurus",
"Module",
"Template",
"Reconstruction",
]


def process_single_page(
path_or_title: str,
Expand Down Expand Up @@ -438,8 +426,8 @@ def main():
try:
if args.path is not None:
namespace_ids = {
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
for name in RECOGNIZED_NAMESPACE_NAMES
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id", 0)
for name in wxr.config.save_ns_names
}
# Parse the normal full Wiktionary data dump
parse_wiktionary(
Expand Down

0 comments on commit e0524ae

Please sign in to comment.