Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Unicode 15.1 #62

Merged
merged 13 commits into from
Sep 21, 2023
29 changes: 17 additions & 12 deletions makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
# * Doc/library/stdtypes.rst,
# * Doc/library/unicodedata.rst, and
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "15.0.0"
UNIDATA_VERSION = "15.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Expand Down Expand Up @@ -101,15 +101,16 @@

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DBF'),
('4E00', '9FFF'),
('20000', '2A6DF'),
('2A700', '2B739'),
('2B740', '2B81D'),
('2B820', '2CEA1'),
('2CEB0', '2EBE0'),
('30000', '3134A'),
('31350', '323AF'),
('3400', '4DBF'), # CJK Ideograph Extension A
('4E00', '9FFF'), # CJK Ideograph
('20000', '2A6DF'), # CJK Ideograph Extension B
('2A700', '2B739'), # CJK Ideograph Extension C
('2B740', '2B81D'), # CJK Ideograph Extension D
('2B820', '2CEA1'), # CJK Ideograph Extension E
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
('30000', '3134A'), # CJK Ideograph Extension G
('31350', '323AF'), # CJK Ideograph Extension H
]


Expand Down Expand Up @@ -1111,11 +1112,15 @@ def __init__(self, version, cjk_check=True):
table[i].east_asian_width = widths[i]
self.widths = widths

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if propinfo:
# this is not a binary property, ignore it
continue

if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(p)
table[char].binary_properties.add(propname)

for char_range, value in UcdFile(LINE_BREAK, version):
if value not in MANDATORY_LINE_BREAKS:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

setup(
name="unicodedata2",
version="15.0.0",
version="15.1.0",
description="Unicodedata backport updated to the latest Unicode version.",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
50 changes: 13 additions & 37 deletions tests/test_unicodedata2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,41 +14,6 @@
encoding = 'utf-8'
errors = 'surrogatepass'

MAX_UNICODE_UCS4 = 0x10FFFF

if sys.maxunicode < MAX_UNICODE_UCS4:
# workarounds for Python "narrow" builds with UCS2-only support.

_narrow_unichr = chr

def chr(i):
"""
Return the unicode character whose Unicode code is the integer 'i'.
The valid range is 0 to 0x10FFFF inclusive.
>>> _narrow_unichr(0xFFFF + 1)
Traceback (most recent call last):
File "<stdin>", line 1, in ?
ValueError: unichr() arg not in range(0x10000) (narrow Python build)
>>> chr(0xFFFF + 1) == u'\U00010000'
True
>>> chr(1114111) == u'\U0010FFFF'
True
>>> chr(0x10FFFF + 1)
Traceback (most recent call last):
File "<stdin>", line 1, in ?
ValueError: chr() arg not in range(0x110000)
"""
try:
return _narrow_unichr(i)
except ValueError:
try:
padded_hex_str = hex(i)[2:].zfill(8)
escape_str = "\\U" + padded_hex_str
return escape_str.decode("unicode-escape")
except UnicodeDecodeError:
raise ValueError('chr() arg not in range(0x110000)')


### Run tests

# NOTE: UnicodeMethodsTest upstream tests methods on `str` objects, and
Expand All @@ -68,13 +33,15 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = 'ef638fce5e02dcaa0ad14dd5034314e65f726c62'
expectedchecksum = '232affd2a50ec4bd69d2482aa0291385cbdefaba'

def test_function_checksum(self):
import unicodedata2

data = []
h = hashlib.sha1()

for i in range(0x10000):
for i in range(sys.maxunicode + 1):
char = chr(i)
data = [
# Properties
Expand All @@ -86,11 +53,20 @@ def test_function_checksum(self):
self.db.decomposition(char),
str(self.db.mirrored(char)),
str(self.db.combining(char)),
unicodedata2.east_asian_width(char),
self.db.name(char, ""),
]
h.update(''.join(data).encode("ascii"))
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)

def test_name_inverse_lookup(self):
    # Round-trip check: for every code point that has a name in the
    # database, looking that name up must give back the same character.
    for codepoint in range(sys.maxunicode + 1):
        char = chr(codepoint)
        looked_name = self.db.name(char, None)
        if not looked_name:
            # No name for this code point (the None default was
            # returned), so there is nothing to look up.
            continue
        self.assertEqual(self.db.lookup(looked_name), char)

def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
self.assertEqual(self.db.digit('9'), 9)
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tox]
envlist = py{37,38,39,310,311}, pypy{37,38,39}
envlist = py{37,38,39,310,311,312}, pypy{37,38,39}
skip_missing_interpreters = true

[testenv]
Expand Down
1 change: 1 addition & 0 deletions unicodedata2/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,7 @@ is_unified_ideograph(Py_UCS4 code)
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
(0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
}
Expand Down
Loading