Skip to content

Commit

Permalink
Wiktextract typing project start
Browse files Browse the repository at this point in the history
Set mypy to use Python 3.9 as baseline in pyproject.toml.

Add typing to easy files.

Found out that --statistics wasn't actually being used,
so it was commented out in the previous commit, to be revisited later.
  • Loading branch information
kristian-clausal committed Dec 21, 2023
1 parent 7c83565 commit ba6871a
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 74 deletions.
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,11 @@ select = [
"I", # isort
"W", # pycodestyle warning
]

[tool.mypy]
mypy_path = "typestubs"
python_version = "3.9"  # must be a TOML string: a bare float loses trailing zeros (3.10 -> 3.1)

[[tool.mypy.overrides]]
module = "importlib_resources.*"
ignore_missing_imports = true
50 changes: 40 additions & 10 deletions src/wiktextract/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
#
# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org

from wikitextprocessor.core import NamespaceDataEntry
from typing import (
Any,
Optional,
TypedDict,
Union,
)
from wiktextract.wxr_context import WiktextractContext
from .page import clean_node

Expand Down Expand Up @@ -65,16 +72,39 @@
return export
"""

def extract_categories(wxr: WiktextractContext):
class CategoryEntry(TypedDict, total=False):
    """One node of the extracted category tree.

    All keys are optional (``total=False``) because entries are filled in
    incrementally as the raw category dump is parsed.
    """

    name: str
    desc: str
    clean_desc: str
    children: list[str]
    sort: list[str]

class CategoryReturn(TypedDict, total=False):
    """Top-level result of extract_categories(): the root category names
    plus a mapping from lower-cased category name to its entry.
    """

    roots: list[str]
    nodes: dict[str, CategoryEntry]

def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
"""Extracts the category tree from Wiktionary."""
module_ns = wxr.wtp.NAMESPACE_DATA.get("Module", {})
module_ns: Optional[NamespaceDataEntry] = wxr.wtp.NAMESPACE_DATA.get(
"Module", None)
assert module_ns is not None
module_ns_local_name = module_ns.get("name")
module_ns_id = module_ns.get("id")
wxr.wtp.add_page(f"{module_ns_local_name}:wiktextract cat tree",
module_ns_id, LUA_CODE, model="Scribunto")
wxr.wtp.start_page("Wiktextract category tree extraction")
rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")
ht = {}
ht: dict[str, CategoryEntry] = {}
for line in rawdata.split("\n"):
if not line:
continue
Expand All @@ -97,7 +127,7 @@ def extract_categories(wxr: WiktextractContext):
parent_name_lc = parent_name.lower()
parent_sort = parts[i + 1]
if parent_name_lc not in ht:
p = {"name": parent_name}
p: CategoryEntry = {"name": parent_name}
ht[parent_name_lc] = p
else:
p = ht[parent_name_lc]
Expand All @@ -109,10 +139,10 @@ def extract_categories(wxr: WiktextractContext):
p["sort"] = []
p["sort"].append(parent_sort)

seen = set()
is_child = set()
seen: set[str] = set()
is_child: set[str] = set()

def recurse(name):
def recurse(name: str) -> None:
if name in seen:
return
seen.add(name)
Expand All @@ -125,8 +155,8 @@ def recurse(name):
for child in v.get("children", ()):
is_child.add(child.lower())

notseen = set(x.lower() for x in ht.keys()) - seen - is_child
notseen = list(ht[x]["name"] for x in sorted(notseen))
notseen_set = set(x.lower() for x in ht.keys()) - seen - is_child
notseen = list(ht[x]["name"] for x in sorted(notseen_set))
#if notseen:
# print("NOT SEEN:", "; ".join(notseen))

Expand All @@ -137,7 +167,7 @@ def recurse(name):

roots = ["Fundamental"]
roots.extend(notseen)
ret = {"roots": roots, "nodes": ht}
ret: CategoryReturn = {"roots": roots, "nodes": ht}
# import json
# print(json.dumps(ret, sort_keys=True, indent=2))
return ret
100 changes: 61 additions & 39 deletions src/wiktextract/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,20 @@
import re
import html
import unicodedata
from typing import (
Callable,
Optional,
Union
)
from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST
from wikitextprocessor.core import NamespaceDataEntry
from .wxr_context import WiktextractContext

######################################################################
# Cleaning values into plain text.
######################################################################

superscript_ht = {
superscript_ht: dict[str, str] = {
"0": "⁰",
"1": "¹",
"2": "²",
Expand Down Expand Up @@ -91,7 +97,7 @@
"∞": "\u2002᪲" # This is a KLUDGE
}

subscript_ht = {
subscript_ht: dict[str, str] = {
"0": "₀",
"1": "₁",
"2": "₂",
Expand Down Expand Up @@ -131,7 +137,7 @@
"χ": "ᵪ",
}

def to_superscript(text):
def to_superscript(text: str) -> str:
"Converts text to superscript."
if not text:
return ""
Expand All @@ -141,7 +147,7 @@ def to_superscript(text):
return "^" + text
return "^({})".format(text)

def to_subscript(text):
def to_subscript(text: str) -> str:
"""Converts text to subscript."""
if not text:
return ""
Expand All @@ -151,14 +157,14 @@ def to_subscript(text):
return "_" + text
return "_({})".format(text)

def to_chem(text):
def to_chem(text: str) -> str:
    """Converts text to a chemical formula: each digit is rendered as a
    subscript, every other character passes through unchanged."""
    converted = [to_subscript(ch) if ch.isdigit() else ch for ch in text]
    return "".join(converted)

# Mapping from Latex names to Unicode characters/strings. This is the
# default mapping (some cases are handled specially in the code).
math_map = {
math_map: dict[str, str] = {
# XXX should probably change greek characters to non-slanted ones?
"AC": "∿",
"APLcomment": "⍝",
Expand Down Expand Up @@ -912,7 +918,7 @@ def to_chem(text):
"mathrm": "",
}

mathcal_map = {
mathcal_map: dict[str, str] = {
"A": "𝒜",
"B": "ℬ",
"C": "𝒞",
Expand Down Expand Up @@ -967,7 +973,7 @@ def to_chem(text):
"z": "𝓏",
}

mathfrak_map = {
mathfrak_map: dict[str, str]= {
"A": "𝔄",
"B": "𝔅",
"C": "ℭ",
Expand All @@ -994,7 +1000,7 @@ def to_chem(text):
"Z": "ℨ",
}

mathbb_map = {
mathbb_map: dict[str, str] = {
"A": "𝔸",
"B": "𝔹",
"C": "ℂ",
Expand Down Expand Up @@ -1064,38 +1070,43 @@ def to_chem(text):
"9": "𝟡",
}

def mathcal_fn(text):
def mathcal_fn(text: str) -> str:
    """Translate each character to its mathcal (script) variant via
    mathcal_map; characters without a mapping are kept as-is."""
    out: list[str] = []
    for ch in text:
        out.append(mathcal_map.get(ch, ch))
    return "".join(out)

def mathfrak_fn(text):
def mathfrak_fn(text: str) -> str:
    """Translate each character to its mathfrak (Fraktur) variant via
    mathfrak_map; unmapped characters are left unchanged."""
    return "".join(map(lambda ch: mathfrak_map.get(ch, ch), text))

def mathbb_fn(text):
def mathbb_fn(text: str) -> str:
    """Translate each character to its mathbb (blackboard-bold) variant
    via mathbb_map; unmapped characters are left unchanged."""
    mapped = (mathbb_map.get(ch, ch) for ch in text)
    return "".join(mapped)

def to_math(text):
def to_math(text: str) -> str:
"""Converts a mathematical formula to ASCII."""
# print("to_math: {!r}".format(text))
magic_vec = []
magic_vec: list[str] = []

def expand(text):
def expand(text: str) -> str:
while True:
orig = text
# formatting with {:c} converts input into character
text = re.sub(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
text)
if text == orig:
break
return text

def recurse(text):
def math_magic(text, left, right, fn):
regexp = r"{}([^{}{}]+){}".format(
def recurse(text: str) -> str:
def math_magic(text: str,
left: str,
right: str,
fn: Callable[[str], str]
) -> str:
regexp_str = r"{}([^{}{}]+){}".format(
re.escape(left), re.escape(left),
re.escape(right), re.escape(right))
regexp = re.compile(regexp)
regexp = re.compile(regexp_str)

def repl(m):
def repl(m: re.Match) -> str:
magic = chr(MAGIC_FIRST + len(magic_vec))
t = fn(m.group(1)).strip()
magic_vec.append(t)
Expand All @@ -1108,8 +1119,8 @@ def repl(m):
break
return text

def expand_group(v):
fn = None
def expand_group(v: str) -> str:
fn: Optional[Callable[[str], str]] = None
if re.match(r"\\mathcal\b", v):
fn = mathcal_fn
v = v[8:].strip()
Expand Down Expand Up @@ -1181,7 +1192,7 @@ def expand_group(v):
v = expand(v)
return v

parts = []
parts: list[str] = []
while True:
orig = text
text = math_magic(text, "{", "}", recurse)
Expand Down Expand Up @@ -1223,7 +1234,7 @@ def expand_group(v):
return text


def bold_follows(parts, i):
def bold_follows(parts: list[str], i: int) -> bool:
"""Checks if there is a bold (''') in parts after parts[i]. We allow
intervening italics ('')."""
parts = parts[i + 1:]
Expand All @@ -1235,7 +1246,7 @@ def bold_follows(parts, i):
return False


def remove_italic_and_bold(text):
def remove_italic_and_bold(text: str) -> str:
"""Based on token_iter in wikitextprocessor"""
assert isinstance(text, str)
lines = re.split(r"(\n+)", text) # Lines and separators
Expand Down Expand Up @@ -1300,51 +1311,56 @@ def remove_italic_and_bold(text):
new_text_parts = new_text_parts[:-1] # remove last \n
return "".join(new_text_parts)

def clean_value(wxr, title, no_strip=False, no_html_strip=False):
def clean_value(wxr: WiktextractContext,
title: str,
no_strip=False,
no_html_strip=False
) -> str:
"""Cleans a title or value into a normal string. This should basically
remove any Wikimedia formatting from it: HTML tags, templates, links,
emphasis, etc. This will also merge multiple whitespaces into one
normal space and will remove any surrounding whitespace."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(title, str)

def repl_1(m):
def repl_1(m: re.Match) -> str:
return clean_value(wxr, m.group(1), no_strip=True)
def repl_exturl(m):

def repl_exturl(m: re.Match) -> str:
args = re.split(r"\s+", m.group(1))
i = 0
while i < len(args) - 1:
if not re.match(r"(https?|mailto)://", args[i]):
break
i += 1
return " ".join(args[i:])
def repl_link(m):
def repl_link(m: re.Match) -> str:
if m.group(2) and m.group(2).lower() in ("file", "image"):
return ""
v = m.group(3).split("|")
return clean_value(wxr, v[0], no_strip=True)
def repl_link_bars(m):
def repl_link_bars(m: re.Match) -> str:
lnk = m.group(1)
if re.match(r"(?si)(File|Image)\s*:", lnk):
return ""
return clean_value(wxr, m.group(4) or m.group(2) or "",
no_strip=True)

def repl_1_sup(m):
def repl_1_sup(m: re.Match) -> str:
return to_superscript(clean_value(wxr, m.group(1)))

def repl_1_sub(m):
def repl_1_sub(m: re.Match) -> str:
return to_subscript(clean_value(wxr, m.group(1)))

def repl_1_chem(m):
def repl_1_chem(m: re.Match) -> str:
return to_chem(clean_value(wxr, m.group(1)))

def repl_1_math(m):
def repl_1_math(m: re.Match) -> str:
v = to_math(m.group(1))
# print("to_math:", ascii(v))
return v

def repl_1_syntaxhighlight(m):
def repl_1_syntaxhighlight(m: re.Match) -> str:
# Content is preformatted
return "\n" + m.group(1).strip() + "\n"

Expand Down Expand Up @@ -1423,9 +1439,12 @@ def repl_1_syntaxhighlight(m):
title = re.sub(r"\[//[^]\s]+\s+edit\s*\]", "", title)
# Replace links by their text

category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {})
category_ns_names = {category_ns_data.get("name")} | set(
category_ns_data.get("aliases")
category_ns_data: Optional[NamespaceDataEntry]
# XXX "Category" -> config variable for portability
category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", None)
assert category_ns_data is not None
category_ns_names = {category_ns_data["name"]} | set(
category_ns_data["aliases"]
)
category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
while True:
Expand Down Expand Up @@ -1489,7 +1508,10 @@ def repl_1_syntaxhighlight(m):
return title


def clean_template_args(wxr, ht, no_strip=False):
def clean_template_args(wxr: WiktextractContext,
ht: dict[Union[int, str], str], # XXX -> "TemplateArgs"
no_strip=False
) -> dict[str, str]:
"""Cleans all values in a template argument dictionary and returns the
cleaned dictionary."""
assert isinstance(wxr, WiktextractContext)
Expand Down
Loading

0 comments on commit ba6871a

Please sign in to comment.