From 69357b4be41eaeca4b8ac24ef591ed5f45758e21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Thu, 2 Nov 2023 08:13:28 +0200 Subject: [PATCH] Add "original_title" to data if applicable Some words have titles that cannot be easily handled by Wikimedia's page engine (or whatever is an appropriate term here...), like "C#". These words have special (bespoke?) articles with URLs containing "Unsupported titles/" and a URL-friendly string instead, which causes problems with searching for those original articles or generating URLs "back" to Wiktionary. If the "word" field is different from the actual article title (in our case, the first line of the raw wiktextract debug page, starting with "TITLE: "), then add a new field "original_title" containing the original title. Debug messages are printed out in two flavors: titles starting with "Unsupported titles/" need to be handled separately by adding them to unsupported_titles.py; words that differ from the original title for any other reason are suspicious and get a generic debug message. 
--- src/wiktextract/extractor/en/page.py | 14 +++++++++++++- .../{ => extractor/en}/unsupported_titles.py | 0 tests/test_page.py | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) rename src/wiktextract/{ => extractor/en}/unsupported_titles.py (100%) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 37e07709..d2327f89 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -19,7 +19,6 @@ from wiktextract.linkages import parse_linkage_item_text from wiktextract.translations import parse_translation_item_text from wiktextract.clean import clean_template_args -from wiktextract.unsupported_titles import unsupported_title_map from wiktextract.datautils import data_append, data_extend, ns_title_prefix_tuple from wiktextract.tags import valid_tags from wiktextract.page import ( @@ -34,6 +33,7 @@ from ..ruby import extract_ruby, parse_ruby from ..share import strip_nodes +from .unsupported_titles import unsupported_title_map # Matches head tag head_tag_re = None @@ -3572,6 +3572,18 @@ def multitrans_post_fn(name, ht, text): data["topics"] = list(new_topics) # Copy list! 
ret.extend(lang_datas) + for x in ret: + if x["word"] != word: + if word.startswith("Unsupported titles/"): + wxr.wtp.debug(f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", + sortid="20231101/3578page.py" + ) + else: + wxr.wtp.debug(f"DIFFERENT ORIGINAL TITLE: '{word}' " + f"-> '{x['word']}'", + sortid="20231101/3582page.py" + ) + x["original_title"] = word return ret diff --git a/src/wiktextract/unsupported_titles.py b/src/wiktextract/extractor/en/unsupported_titles.py similarity index 100% rename from src/wiktextract/unsupported_titles.py rename to src/wiktextract/extractor/en/unsupported_titles.py diff --git a/tests/test_page.py b/tests/test_page.py index 97bc5457..029ee297 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -122,6 +122,7 @@ def test_page3(self): ], "lang": "Swedish", "lang_code": "sv", + "original_title": "Unsupported titles/C sharp", "pos": "noun", "senses": [ {