Merge pull request #62 from SnoopJ/feature/Unicode-15.1

Add support for Unicode 15.1
fonttools · Sep 21, 2023 · d924b53 · d924b53
2 parents a7ef92c + 461c283
commit d924b53
Show file tree

Hide file tree

Showing 8 changed files with 18,888 additions and 18,495 deletions.
diff --git a/makeunicodedata.py b/makeunicodedata.py
@@ -44,7 +44,7 @@
 #   * Doc/library/stdtypes.rst, and
 #   * Doc/library/unicodedata.rst
 #   * Doc/reference/lexical_analysis.rst (two occurrences)
-UNIDATA_VERSION = "15.0.0"
+UNIDATA_VERSION = "15.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -101,15 +101,16 @@
 
 # these ranges need to match unicodedata.c:is_unified_ideograph
 cjk_ranges = [
-    ('3400', '4DBF'),
-    ('4E00', '9FFF'),
-    ('20000', '2A6DF'),
-    ('2A700', '2B739'),
-    ('2B740', '2B81D'),
-    ('2B820', '2CEA1'),
-    ('2CEB0', '2EBE0'),
-    ('30000', '3134A'),
-    ('31350', '323AF'),
+    ('3400', '4DBF'),    # CJK Ideograph Extension A
+    ('4E00', '9FFF'),    # CJK Ideograph
+    ('20000', '2A6DF'),  # CJK Ideograph Extension B
+    ('2A700', '2B739'),  # CJK Ideograph Extension C
+    ('2B740', '2B81D'),  # CJK Ideograph Extension D
+    ('2B820', '2CEA1'),  # CJK Ideograph Extension E
+    ('2CEB0', '2EBE0'),  # CJK Ideograph Extension F
+    ('2EBF0', '2EE5D'),  # CJK Ideograph Extension I
+    ('30000', '3134A'),  # CJK Ideograph Extension G
+    ('31350', '323AF'),  # CJK Ideograph Extension H
 ]
 
 
@@ -1111,11 +1112,15 @@ def __init__(self, version, cjk_check=True):
                 table[i].east_asian_width = widths[i]
         self.widths = widths
 
-        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
+        for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
+            if propinfo:
+                # this is not a binary property, ignore it
+                continue
+
             if table[char]:
                 # Some properties (e.g. Default_Ignorable_Code_Point)
                 # apply to unassigned code points; ignore them
-                table[char].binary_properties.add(p)
+                table[char].binary_properties.add(propname)
 
         for char_range, value in UcdFile(LINE_BREAK, version):
             if value not in MANDATORY_LINE_BREAKS:

diff --git a/setup.py b/setup.py
@@ -26,7 +26,7 @@
 
 setup(
     name="unicodedata2",
-    version="15.0.0",
+    version="15.1.0",
     description="Unicodedata backport updated to the latest Unicode version.",
     long_description=long_description,
     long_description_content_type="text/markdown",

diff --git a/tests/test_unicodedata2.py b/tests/test_unicodedata2.py
@@ -14,41 +14,6 @@
 encoding = 'utf-8'
 errors = 'surrogatepass'
 
-MAX_UNICODE_UCS4 = 0x10FFFF
-
-if sys.maxunicode < MAX_UNICODE_UCS4:
-    # workarounds for Python "narrow" builds with UCS2-only support.
-
-    _narrow_unichr = chr
-
-    def chr(i):
-        """
-        Return the unicode character whose Unicode code is the integer 'i'.
-        The valid range is 0 to 0x10FFFF inclusive.
-        >>> _narrow_unichr(0xFFFF + 1)
-        Traceback (most recent call last):
-          File "<stdin>", line 1, in ?
-        ValueError: unichr() arg not in range(0x10000) (narrow Python build)
-        >>> chr(0xFFFF + 1) == u'\U00010000'
-        True
-        >>> chr(1114111) == u'\U0010FFFF'
-        True
-        >>> chr(0x10FFFF + 1)
-        Traceback (most recent call last):
-          File "<stdin>", line 1, in ?
-        ValueError: chr() arg not in range(0x110000)
-        """
-        try:
-            return _narrow_unichr(i)
-        except ValueError:
-            try:
-                padded_hex_str = hex(i)[2:].zfill(8)
-                escape_str = "\\U" + padded_hex_str
-                return escape_str.decode("unicode-escape")
-            except UnicodeDecodeError:
-                raise ValueError('chr() arg not in range(0x110000)')
-
-
 ### Run tests
 
 # NOTE: UnicodeMethodsTest upstream tests methods on `str` objects, and
@@ -68,13 +33,15 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = 'ef638fce5e02dcaa0ad14dd5034314e65f726c62'
+    expectedchecksum = '232affd2a50ec4bd69d2482aa0291385cbdefaba'
 
     def test_function_checksum(self):
+        import unicodedata2
+
         data = []
         h = hashlib.sha1()
 
-        for i in range(0x10000):
+        for i in range(sys.maxunicode + 1):
             char = chr(i)
             data = [
                 # Properties
@@ -86,11 +53,20 @@ def test_function_checksum(self):
                 self.db.decomposition(char),
                 str(self.db.mirrored(char)),
                 str(self.db.combining(char)),
+                unicodedata2.east_asian_width(char),
+                self.db.name(char, ""),
             ]
             h.update(''.join(data).encode("ascii"))
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)
 
+    def test_name_inverse_lookup(self):
+        for i in range(sys.maxunicode + 1):
+            char = chr(i)
+            looked_name = self.db.name(char, None)
+            if looked_name:
+                self.assertEqual(self.db.lookup(looked_name), char)
+
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)
         self.assertEqual(self.db.digit('9'), 9)

diff --git a/tox.ini b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py{37,38,39,310,311}, pypy{37,38,39}
+envlist = py{37,38,39,310,311,312}, pypy{37,38,39}
 skip_missing_interpreters = true
 
 [testenv]

diff --git a/unicodedata2/unicodedata.c b/unicodedata2/unicodedata.c
@@ -941,6 +941,7 @@ is_unified_ideograph(Py_UCS4 code)
         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
         (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
+        (0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
         (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
         (0x31350 <= code && code <= 0x323AF);   /* CJK Ideograph Extension H */
 }