Synchronize with CPython 3.7.14 C-API usage, backport #58

Merged 18 commits on Oct 28, 2022
140 changes: 84 additions & 56 deletions makeunicodedata.py
@@ -26,13 +26,14 @@
# written by Fredrik Lundh ([email protected])
#

import dataclasses
import os
import sys
import zipfile

from collections import namedtuple
from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"
@@ -76,7 +77,8 @@
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

@@ -135,6 +137,14 @@ def maketables(trace=0):

def makeunicodedata(unicode, trace):

# the default value of east_asian_width is "N", for unassigned code points
# not mentioned in EastAsianWidth.txt
# in addition there are some reserved but unassigned code points in CJK
# ranges that are classified as "W". code points in private use areas
# have a width of "A". both of these have entries in
# EastAsianWidth.txt
# see https://unicode.org/reports/tr11/#Unassigned
assert EASTASIANWIDTH_NAMES[0] == "N"
dummy = (0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
@@ -160,15 +170,25 @@ def makeunicodedata(unicode, trace):
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
)
# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
elif unicode.widths[char] is not None:
# an unassigned but reserved character, with a known
# east_asian_width
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
item = (0, 0, 0, 0, eastasianwidth, 0)
else:
continue

# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
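The elif branch above encodes UAX #11's defaults for code points that lack a UnicodeData.txt record. A hedged sketch of those rules in isolation (ranges abridged from TR11 §Unassigned for illustration; the script itself takes them from EastAsianWidth.txt):

    def tr11_default_width(cp: int) -> str:
        if 0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF:
            return "W"   # reserved code points in CJK ideograph ranges
        if 0xE000 <= cp <= 0xF8FF or cp >= 0xF0000:
            return "A"   # private use areas
        return "N"       # all other unassigned code points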


# 2) decomposition data

decomp_data_cache = {}
decomp_data = [0]
decomp_prefix = [""]
decomp_index = [0] * len(unicode.chars)
@@ -207,12 +227,15 @@ def makeunicodedata(unicode, trace):
comp_first[l] = 1
comp_last[r] = 1
comp_pairs.append((l,r,char))
try:
i = decomp_data.index(decomp)
except ValueError:
key = tuple(decomp)
i = decomp_data_cache.get(key, -1)
if i == -1:
i = len(decomp_data)
decomp_data.extend(decomp)
decomp_size = decomp_size + len(decomp) * 2
decomp_data_cache[key] = i
else:
assert decomp_data[i:i+len(decomp)] == decomp
else:
i = 0
decomp_index[char] = i
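The new decomp_data_cache replaces a linear rescan of the growing flat array with a dict lookup keyed on the decomposition tuple. The same interning pattern in isolation (a sketch with hypothetical names):

    flat_data, offset_of = [0], {}

    def intern(seq):
        # store each unique sequence once; return its offset into flat_data
        key = tuple(seq)
        i = offset_of.get(key, -1)
        if i == -1:
            i = len(flat_data)
            flat_data.extend(seq)
            offset_of[key] = i
        return i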
@@ -270,6 +293,7 @@ def makeunicodedata(unicode, trace):
fprint()
fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
fprint("/* a list of unique database records */")
# NOTE: static qualification added by unicodedata2
fprint("static const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
@@ -295,18 +319,21 @@
# the support code moved into unicodedatabase.c

fprint("/* string literals */")
# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_CategoryNames[] = {")
for name in CATEGORY_NAMES:
fprint(" \"%s\"," % name)
fprint(" NULL")
fprint("};")

# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_BidirectionalNames[] = {")
for name in BIDIRECTIONAL_NAMES:
fprint(" \"%s\"," % name)
fprint(" NULL")
fprint("};")

# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_EastAsianWidthNames[] = {")
for name in EASTASIANWIDTH_NAMES:
fprint(" \"%s\"," % name)
@@ -515,6 +542,7 @@ def makeunicodetype(unicode, trace):
fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
fprint()
fprint("/* a list of unique character type descriptors */")
# NOTE: static qualification added by unicodedata2
fprint("static const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
@@ -523,6 +551,7 @@ def makeunicodetype(unicode, trace):

fprint("/* extended case mappings */")
fprint()
# NOTE: static qualification added by unicodedata2
fprint("static const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
for c in extra_casing:
fprint(" %d," % c)
@@ -820,9 +849,9 @@ def merge_old_version(version, new, old):
continue
# check characters that differ
if old.table[i] != new.table[i]:
for k, field_name in enumerate(UcdRecord._fields):
value = getattr(old.table[i], field_name)
new_value = getattr(new.table[i], field_name)
for k, field in enumerate(dataclasses.fields(UcdRecord)):
value = getattr(old.table[i], field.name)
new_value = getattr(new.table[i], field.name)
if value != new_value:
if k == 1 and i in PUA_15:
# the name is not set in the old.table, but in the
@@ -892,9 +921,9 @@ def open_data(template, version):
import urllib.request
if version == '3.2.0':
# irregular url structure
url = ('http://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
else:
url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
os.makedirs(DATA_DIR, exist_ok=True)
urllib.request.urlretrieve(url, filename=local)
if local.endswith('.txt'):
@@ -904,7 +933,7 @@
return open(local, 'rb')
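As a worked example of the URL construction above (the version and template pairing is assumed for illustration):

    template = 'UnicodeData%s.txt'
    url = ('https://www.unicode.org/Public/%s/ucd/' + template) % ('14.0.0', '')
    # -> 'https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt'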


def expand_range(char_range):
def expand_range(char_range: str) -> Iterator[int]:
'''
Parses ranges of code points, as described in UAX #44:
https://www.unicode.org/reports/tr44/#Code_Point_Ranges
@@ -927,67 +956,65 @@ class UcdFile:
own separate format.
'''

def __init__(self, template, version):
def __init__(self, template: str, version: str) -> None:
self.template = template
self.version = version

def records(self):
def records(self) -> Iterator[List[str]]:
with open_data(self.template, self.version) as file:
for line in file:
line = line.split('#', 1)[0].strip()
if not line:
continue
yield [field.strip() for field in line.split(';')]

def __iter__(self):
def __iter__(self) -> Iterator[List[str]]:
return self.records()

def expanded(self):
def expanded(self) -> Iterator[Tuple[int, List[str]]]:
for record in self.records():
char_range, rest = record[0], record[1:]
for char in expand_range(char_range):
yield char, rest
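Put together, a hedged usage sketch of UcdFile (the file template and version here are assumptions, not values from this diff):

    for char, (width,) in UcdFile('EastAsianWidth%s.txt', '3.2.0').expanded():
        print(hex(char), width)   # char is an int code point, width e.g. 'W'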


class UcdRecord(namedtuple('UcdRecord', [
@dataclasses.dataclass
class UcdRecord:
# 15 fields from UnicodeData.txt. See:
# https://www.unicode.org/reports/tr44/#UnicodeData.txt
'codepoint',
'name',
'general_category',
'canonical_combining_class',
'bidi_class',
'decomposition_type',
'decomposition_mapping',
'numeric_type',
'numeric_value',
'bidi_mirrored',
'unicode_1_name', # obsolete
'iso_comment', # obsolete
'simple_uppercase_mapping',
'simple_lowercase_mapping',
'simple_titlecase_mapping',
codepoint: str
name: str
general_category: str
canonical_combining_class: str
bidi_class: str
decomposition_type: str
decomposition_mapping: str
numeric_type: str
numeric_value: str
bidi_mirrored: str
unicode_1_name: str # obsolete
iso_comment: str # obsolete
simple_uppercase_mapping: str
simple_lowercase_mapping: str
simple_titlecase_mapping: str

# https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
'east_asian_width',
east_asian_width: Optional[str]

# Binary properties, as a set of those that are true.
# Taken from multiple files:
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
# https://www.unicode.org/reports/tr44/#LineBreak.txt
'binary_properties',
binary_properties: Set[str]

# The Quick_Check properties related to normalization:
# https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
# We store them as a bitmask.
'quick_check',
])):
quick_check: int


@classmethod
def from_row(cls, row):
return cls(
*row, east_asian_width=None, binary_properties=set(), quick_check=0
)
def from_row(row: List[str]) -> UcdRecord:
return UcdRecord(*row, None, set(), 0)
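The practical effect of trading the namedtuple for a dataclass shows up at every later write: records are now mutated in place. A sketch (the empty strings pad out the 15 UnicodeData.txt columns):

    rec = from_row(['0041', 'LATIN CAPITAL LETTER A'] + [''] * 13)
    rec.east_asian_width = 'Na'   # dataclass: plain assignment
    # namedtuple before: rec = rec._replace(east_asian_width='Na')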


# --------------------------------------------------------------------
@@ -1004,7 +1031,7 @@ def __init__(self, version, cjk_check=True):
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = UcdRecord.from_row(s)
table[char] = from_row(s)

cjk_ranges_found = []

@@ -1017,16 +1044,16 @@ def __init__(self, version, cjk_check=True):
s = table[i]
if s:
if s.name[-6:] == "First>":
s = table[i] = s._replace(name="")
field = tuple(s)[:15]
s.name = ""
field = dataclasses.astuple(s)[:15]
elif s.name[-5:] == "Last>":
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s.codepoint))
table[i] = s._replace(name="")
s.name = ""
field = None
elif field:
table[i] = UcdRecord.from_row(('%X' % i,) + field[1:])
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

@@ -1047,7 +1074,7 @@ def __init__(self, version, cjk_check=True):
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index] = self.table[pua_index]._replace(name=name)
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)

@@ -1066,7 +1093,7 @@ def __init__(self, version, cjk_check=True):
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index] = self.table[pua_index]._replace(name=name)
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

@@ -1081,7 +1108,8 @@ def __init__(self, version, cjk_check=True):

for i in range(0, 0x110000):
if table[i] is not None:
table[i] = table[i]._replace(east_asian_width=widths[i])
table[i].east_asian_width = widths[i]
self.widths = widths

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if table[char]:
@@ -1115,7 +1143,7 @@ def __init__(self, version, cjk_check=True):
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i] = table[i]._replace(quick_check=quickchecks[i])
table[i].quick_check = quickchecks[i]

with open_data(UNIHAN, version) as file:
zip = zipfile.ZipFile(file)
@@ -1134,7 +1162,7 @@ def __init__(self, version, cjk_check=True):
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i] = table[i]._replace(numeric_value=value)
table[i].numeric_value = value
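An illustrative Unihan_NumericValues.txt record of the kind being parsed here (tab-separated; quoted from Unihan data but treat it as an example):

    code, field, value = 'U+4E07\tkPrimaryNumeric\t10000'.split('\t')
    i = int(code[2:], 16)   # 0x4E07, 万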

sc = self.special_casing = {}
for data in UcdFile(SPECIAL_CASING, version):