From 69357b4be41eaeca4b8ac24ef591ed5f45758e21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Thu, 2 Nov 2023 08:13:28 +0200
Subject: [PATCH] Add "original_title" to data if applicable

Some words have titles that cannot be easily handled by
Wikimedia's page engine (or whatever is an appropriate term
here...), like "C#". These words have special (bespoke?)
articles with urls containing "Unsupported titles/" and
a url-friendly string instead, which causes problems with
searching for those original articles or generating
urls "back" to Wiktionary.

If the "word" field is different from the actual article
title (first line of wiktextract extract raw debug page
starting with "TITLE: " in our case), then add a new
field "original_title" containing the original title.

Debug messages are printed out in two flavors:

Titles starting with "Unsupported titles/" need to be handled
separately by adding them to unsupported_titles.py

Words that differ from the original titles are
otherwise suspicious and get a generic debug message.
---
 src/wiktextract/extractor/en/page.py               | 14 +++++++++++++-
 .../{ => extractor/en}/unsupported_titles.py       |  0
 tests/test_page.py                                 |  1 +
 3 files changed, 14 insertions(+), 1 deletion(-)
 rename src/wiktextract/{ => extractor/en}/unsupported_titles.py (100%)

diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
index 37e07709..d2327f89 100644
--- a/src/wiktextract/extractor/en/page.py
+++ b/src/wiktextract/extractor/en/page.py
@@ -19,7 +19,6 @@
 from wiktextract.linkages import parse_linkage_item_text
 from wiktextract.translations import parse_translation_item_text
 from wiktextract.clean import clean_template_args
-from wiktextract.unsupported_titles import unsupported_title_map
 from wiktextract.datautils import data_append, data_extend, ns_title_prefix_tuple
 from wiktextract.tags import valid_tags
 from wiktextract.page import (
@@ -34,6 +33,7 @@
 from ..ruby import extract_ruby, parse_ruby
 from ..share import strip_nodes
 
+from .unsupported_titles import unsupported_title_map
 
 # Matches head tag
 head_tag_re = None
@@ -3572,6 +3572,18 @@ def multitrans_post_fn(name, ht, text):
                     data["topics"] = list(new_topics)  # Copy list!
         ret.extend(lang_datas)
 
+    for x in ret:
+        if x["word"] != word:
+            if word.startswith("Unsupported titles/"):
+                wxr.wtp.debug(f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
+                                sortid="20231101/3578page.py"
+                            )
+            else:
+                wxr.wtp.debug(f"DIFFERENT ORIGINAL TITLE: '{word}' "
+                              f"-> '{x['word']}'",
+                              sortid="20231101/3582page.py"
+                             )
+            x["original_title"] = word
     return ret
 
 
diff --git a/src/wiktextract/unsupported_titles.py b/src/wiktextract/extractor/en/unsupported_titles.py
similarity index 100%
rename from src/wiktextract/unsupported_titles.py
rename to src/wiktextract/extractor/en/unsupported_titles.py
diff --git a/tests/test_page.py b/tests/test_page.py
index 97bc5457..029ee297 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -122,6 +122,7 @@ def test_page3(self):
                     ],
                     "lang": "Swedish",
                     "lang_code": "sv",
+                    "original_title": "Unsupported titles/C sharp",
                     "pos": "noun",
                     "senses": [
                         {