Skip to content

Commit

Permalink
Wiktextract typing project start
Browse files Browse the repository at this point in the history
Set mypy to use Python 3.9 as baseline in pyproject.toml.

Add typing to easy files.

Found out that --statistics wasn't actually being used,
so it was commented out in the previous commit, to be revisited later.
  • Loading branch information
kristian-clausal committed Dec 21, 2023
1 parent 7c83565 commit ba6871a
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 74 deletions.
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,11 @@ select = [
"I", # isort
"W", # pycodestyle warning
]

[tool.mypy]
mypy_path = "typestubs"
python_version = "3.9"  # must be a TOML string: a bare float loses trailing zeros (3.10 -> 3.1)

[[tool.mypy.overrides]]
module = "importlib_resources.*"
ignore_missing_imports = true
50 changes: 40 additions & 10 deletions src/wiktextract/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
#
# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org

from wikitextprocessor.core import NamespaceDataEntry
from typing import (
Any,
Optional,
TypedDict,
Union,
)
from wiktextract.wxr_context import WiktextractContext
from .page import clean_node

Expand Down Expand Up @@ -65,16 +72,39 @@
return export
"""

def extract_categories(wxr: WiktextractContext):
class CategoryEntry(TypedDict, total=False):
    """One node of the extracted category tree.

    All keys are optional (``total=False``) because entries are filled in
    incrementally as the raw category dump is parsed.
    """

    name: str
    desc: str
    clean_desc: str
    children: list[str]
    sort: list[str]

class CategoryReturn(TypedDict, total=False):
    """Top-level result of extract_categories(): the root category names
    plus a mapping from lower-cased category name to its entry.
    """

    roots: list[str]
    nodes: dict[str, CategoryEntry]

def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
"""Extracts the category tree from Wiktionary."""
module_ns = wxr.wtp.NAMESPACE_DATA.get("Module", {})
module_ns: Optional[NamespaceDataEntry] = wxr.wtp.NAMESPACE_DATA.get(
"Module", None)
assert module_ns is not None
module_ns_local_name = module_ns.get("name")
module_ns_id = module_ns.get("id")
wxr.wtp.add_page(f"{module_ns_local_name}:wiktextract cat tree",
module_ns_id, LUA_CODE, model="Scribunto")
wxr.wtp.start_page("Wiktextract category tree extraction")
rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")
ht = {}
ht: dict[str, CategoryEntry] = {}
for line in rawdata.split("\n"):
if not line:
continue
Expand All @@ -97,7 +127,7 @@ def extract_categories(wxr: WiktextractContext):
parent_name_lc = parent_name.lower()
parent_sort = parts[i + 1]
if parent_name_lc not in ht:
p = {"name": parent_name}
p: CategoryEntry = {"name": parent_name}
ht[parent_name_lc] = p
else:
p = ht[parent_name_lc]
Expand All @@ -109,10 +139,10 @@ def extract_categories(wxr: WiktextractContext):
p["sort"] = []
p["sort"].append(parent_sort)

seen = set()
is_child = set()
seen: set[str] = set()
is_child: set[str] = set()

def recurse(name):
def recurse(name: str) -> None:
if name in seen:
return
seen.add(name)
Expand All @@ -125,8 +155,8 @@ def recurse(name):
for child in v.get("children", ()):
is_child.add(child.lower())

notseen = set(x.lower() for x in ht.keys()) - seen - is_child
notseen = list(ht[x]["name"] for x in sorted(notseen))
notseen_set = set(x.lower() for x in ht.keys()) - seen - is_child
notseen = list(ht[x]["name"] for x in sorted(notseen_set))
#if notseen:
# print("NOT SEEN:", "; ".join(notseen))

Expand All @@ -137,7 +167,7 @@ def recurse(name):

roots = ["Fundamental"]
roots.extend(notseen)
ret = {"roots": roots, "nodes": ht}
ret: CategoryReturn = {"roots": roots, "nodes": ht}
# import json
# print(json.dumps(ret, sort_keys=True, indent=2))
return ret
100 changes: 61 additions & 39 deletions src/wiktextract/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,20 @@
import re
import html
import unicodedata
from typing import (
Callable,
Optional,
Union
)
from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST
from wikitextprocessor.core import NamespaceDataEntry
from .wxr_context import WiktextractContext

######################################################################
# Cleaning values into plain text.
######################################################################

superscript_ht = {
superscript_ht: dict[str, str] = {
"0": "⁰",
"1": "¹",
"2": "²",
Expand Down Expand Up @@ -91,7 +97,7 @@
"∞": "\u2002᪲" # This is a KLUDGE
}

subscript_ht = {
subscript_ht: dict[str, str] = {
"0": "₀",
"1": "₁",
"2": "₂",
Expand Down Expand Up @@ -131,7 +137,7 @@
"χ": "ᵪ",
}

def to_superscript(text):
def to_superscript(text: str) -> str:
"Converts text to superscript."
if not text:
return ""
Expand All @@ -141,7 +147,7 @@ def to_superscript(text):
return "^" + text
return "^({})".format(text)

def to_subscript(text):
def to_subscript(text: str) -> str:
"""Converts text to subscript."""
if not text:
return ""
Expand All @@ -151,14 +157,14 @@ def to_subscript(text):
return "_" + text
return "_({})".format(text)

def to_chem(text):
def to_chem(text: str) -> str:
    """Converts text to a chemical formula: each digit is rendered as a
    subscript, every other character passes through unchanged."""
    converted = [to_subscript(ch) if ch.isdigit() else ch for ch in text]
    return "".join(converted)

# Mapping from Latex names to Unicode characters/strings. This is the
# default mapping (some cases are handled specially in the code).
math_map = {
math_map: dict[str, str] = {
# XXX should probably change greek characters to non-slanted ones?
"AC": "∿",
"APLcomment": "⍝",
Expand Down Expand Up @@ -912,7 +918,7 @@ def to_chem(text):
"mathrm": "",
}

mathcal_map = {
mathcal_map: dict[str, str] = {
"A": "𝒜",
"B": "ℬ",
"C": "𝒞",
Expand Down Expand Up @@ -967,7 +973,7 @@ def to_chem(text):
"z": "𝓏",
}

mathfrak_map = {
mathfrak_map: dict[str, str]= {
"A": "𝔄",
"B": "𝔅",
"C": "ℭ",
Expand All @@ -994,7 +1000,7 @@ def to_chem(text):
"Z": "ℨ",
}

mathbb_map = {
mathbb_map: dict[str, str] = {
"A": "𝔸",
"B": "𝔹",
"C": "ℂ",
Expand Down Expand Up @@ -1064,38 +1070,43 @@ def to_chem(text):
"9": "𝟡",
}

def mathcal_fn(text):
def mathcal_fn(text: str) -> str:
    """Translate each character to its mathcal (script) variant via
    mathcal_map; characters without a mapping are kept as-is."""
    out: list[str] = []
    for ch in text:
        out.append(mathcal_map.get(ch, ch))
    return "".join(out)

def mathfrak_fn(text):
def mathfrak_fn(text: str) -> str:
    """Translate each character to its mathfrak (Fraktur) variant via
    mathfrak_map; unmapped characters are left unchanged."""
    return "".join(map(lambda ch: mathfrak_map.get(ch, ch), text))

def mathbb_fn(text):
def mathbb_fn(text: str) -> str:
    """Translate each character to its mathbb (blackboard-bold) variant
    via mathbb_map; unmapped characters are left unchanged."""
    mapped = (mathbb_map.get(ch, ch) for ch in text)
    return "".join(mapped)

def to_math(text):
def to_math(text: str) -> str:
"""Converts a mathematical formula to ASCII."""
# print("to_math: {!r}".format(text))
magic_vec = []
magic_vec: list[str] = []

def expand(text):
def expand(text: str) -> str:
while True:
orig = text
# formatting with {:c} converts input into character
text = re.sub(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
text)
if text == orig:
break
return text

def recurse(text):
def math_magic(text, left, right, fn):
regexp = r"{}([^{}{}]+){}".format(
def recurse(text: str) -> str:
def math_magic(text: str,
left: str,
right: str,
fn: Callable[[str], str]
) -> str:
regexp_str = r"{}([^{}{}]+){}".format(
re.escape(left), re.escape(left),
re.escape(right), re.escape(right))
regexp = re.compile(regexp)
regexp = re.compile(regexp_str)

def repl(m):
def repl(m: re.Match) -> str:
magic = chr(MAGIC_FIRST + len(magic_vec))
t = fn(m.group(1)).strip()
magic_vec.append(t)
Expand All @@ -1108,8 +1119,8 @@ def repl(m):
break
return text

def expand_group(v):
fn = None
def expand_group(v: str) -> str:
fn: Optional[Callable[[str], str]] = None
if re.match(r"\\mathcal\b", v):
fn = mathcal_fn
v = v[8:].strip()
Expand Down Expand Up @@ -1181,7 +1192,7 @@ def expand_group(v):
v = expand(v)
return v

parts = []
parts: list[str] = []
while True:
orig = text
text = math_magic(text, "{", "}", recurse)
Expand Down Expand Up @@ -1223,7 +1234,7 @@ def expand_group(v):
return text


def bold_follows(parts, i):
def bold_follows(parts: list[str], i: int) -> bool:
"""Checks if there is a bold (''') in parts after parts[i]. We allow
intervening italics ('')."""
parts = parts[i + 1:]
Expand All @@ -1235,7 +1246,7 @@ def bold_follows(parts, i):
return False


def remove_italic_and_bold(text):
def remove_italic_and_bold(text: str) -> str:
"""Based on token_iter in wikitextprocessor"""
assert isinstance(text, str)
lines = re.split(r"(\n+)", text) # Lines and separators
Expand Down Expand Up @@ -1300,51 +1311,56 @@ def remove_italic_and_bold(text):
new_text_parts = new_text_parts[:-1] # remove last \n
return "".join(new_text_parts)

def clean_value(wxr, title, no_strip=False, no_html_strip=False):
def clean_value(wxr: WiktextractContext,
title: str,
no_strip=False,
no_html_strip=False
) -> str:
"""Cleans a title or value into a normal string. This should basically
remove any Wikimedia formatting from it: HTML tags, templates, links,
emphasis, etc. This will also merge multiple whitespaces into one
normal space and will remove any surrounding whitespace."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(title, str)

def repl_1(m):
def repl_1(m: re.Match) -> str:
return clean_value(wxr, m.group(1), no_strip=True)
def repl_exturl(m):

def repl_exturl(m: re.Match) -> str:
args = re.split(r"\s+", m.group(1))
i = 0
while i < len(args) - 1:
if not re.match(r"(https?|mailto)://", args[i]):
break
i += 1
return " ".join(args[i:])
def repl_link(m):
def repl_link(m: re.Match) -> str:
if m.group(2) and m.group(2).lower() in ("file", "image"):
return ""
v = m.group(3).split("|")
return clean_value(wxr, v[0], no_strip=True)
def repl_link_bars(m):
def repl_link_bars(m: re.Match) -> str:
lnk = m.group(1)
if re.match(r"(?si)(File|Image)\s*:", lnk):
return ""
return clean_value(wxr, m.group(4) or m.group(2) or "",
no_strip=True)

def repl_1_sup(m):
def repl_1_sup(m: re.Match) -> str:
return to_superscript(clean_value(wxr, m.group(1)))

def repl_1_sub(m):
def repl_1_sub(m: re.Match) -> str:
return to_subscript(clean_value(wxr, m.group(1)))

def repl_1_chem(m):
def repl_1_chem(m: re.Match) -> str:
return to_chem(clean_value(wxr, m.group(1)))

def repl_1_math(m):
def repl_1_math(m: re.Match) -> str:
v = to_math(m.group(1))
# print("to_math:", ascii(v))
return v

def repl_1_syntaxhighlight(m):
def repl_1_syntaxhighlight(m: re.Match) -> str:
# Content is preformatted
return "\n" + m.group(1).strip() + "\n"

Expand Down Expand Up @@ -1423,9 +1439,12 @@ def repl_1_syntaxhighlight(m):
title = re.sub(r"\[//[^]\s]+\s+edit\s*\]", "", title)
# Replace links by their text

category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {})
category_ns_names = {category_ns_data.get("name")} | set(
category_ns_data.get("aliases")
category_ns_data: Optional[NamespaceDataEntry]
# XXX "Category" -> config variable for portability
category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", None)
assert category_ns_data is not None
category_ns_names = {category_ns_data["name"]} | set(
category_ns_data["aliases"]
)
category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
while True:
Expand Down Expand Up @@ -1489,7 +1508,10 @@ def repl_1_syntaxhighlight(m):
return title


def clean_template_args(wxr, ht, no_strip=False):
def clean_template_args(wxr: WiktextractContext,
ht: dict[Union[int, str], str], # XXX -> "TemplateArgs"
no_strip=False
) -> dict[str, str]:
"""Cleans all values in a template argument dictionary and returns the
cleaned dictionary."""
assert isinstance(wxr, WiktextractContext)
Expand Down
Loading

0 comments on commit ba6871a

Please sign in to comment.