Add "original_title" to data if applicable

Some words have titles that cannot be easily handled by MediaWiki's page-title handling (the wiki engine behind Wiktionary), like "C#". These words have special (bespoke) articles whose URLs contain "Unsupported titles/" followed by a URL-friendly string instead, which causes problems when searching for those original articles or generating URLs "back" to Wiktionary. If the "word" field is different from the actual article title (in our case, the first line of the wiktextract raw debug page, starting with "TITLE: "), then add a new field "original_title" containing the original title. Debug messages are printed out in two flavors: titles starting with "Unsupported titles/" need to be handled separately by adding them to unsupported_titles.py; words that otherwise differ from the original titles are suspicious and get a generic debug message.
tatuylonen · Nov 3, 2023 · 69357b4 · 69357b4
1 parent 595ff1d
commit 69357b4
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 1 deletion.
diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
@@ -19,7 +19,6 @@
 from wiktextract.linkages import parse_linkage_item_text
 from wiktextract.translations import parse_translation_item_text
 from wiktextract.clean import clean_template_args
-from wiktextract.unsupported_titles import unsupported_title_map
 from wiktextract.datautils import data_append, data_extend, ns_title_prefix_tuple
 from wiktextract.tags import valid_tags
 from wiktextract.page import (
@@ -34,6 +33,7 @@
 from ..ruby import extract_ruby, parse_ruby
 from ..share import strip_nodes
 
+from .unsupported_titles import unsupported_title_map
 
 # Matches head tag
 head_tag_re = None
@@ -3572,6 +3572,18 @@ def multitrans_post_fn(name, ht, text):
                     data["topics"] = list(new_topics)  # Copy list!
         ret.extend(lang_datas)
 
+    for x in ret:
+        if x["word"] != word:
+            if word.startswith("Unsupported titles/"):
+                wxr.wtp.debug(f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
+                                sortid="20231101/3578page.py"
+                            )
+            else:
+                wxr.wtp.debug(f"DIFFERENT ORIGINAL TITLE: '{word}' "
+                              f"-> '{x['word']}'",
+                              sortid="20231101/3582page.py"
+                             )
+            x["original_title"] = word
     return ret
 
 

diff --git a/src/wiktextract/unsupported_titles.py → ...xtract/extractor/en/unsupported_titles.py b/src/wiktextract/unsupported_titles.py → ...xtract/extractor/en/unsupported_titles.py
diff --git a/tests/test_page.py b/tests/test_page.py
@@ -122,6 +122,7 @@ def test_page3(self):
                     ],
                     "lang": "Swedish",
                     "lang_code": "sv",
+                    "original_title": "Unsupported titles/C sharp",
                     "pos": "noun",
                     "senses": [
                         {