Skip to content

Commit

Permalink
Merge pull request #62 from SnoopJ/feature/Unicode-15.1
Browse files Browse the repository at this point in the history
Add support for Unicode 15.1
  • Loading branch information
SnoopJ committed Sep 21, 2023
2 parents a7ef92c + 461c283 commit d924b53
Show file tree
Hide file tree
Showing 8 changed files with 18,888 additions and 18,495 deletions.
29 changes: 17 additions & 12 deletions makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "15.0.0"
UNIDATA_VERSION = "15.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Expand Down Expand Up @@ -101,15 +101,16 @@

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DBF'),
('4E00', '9FFF'),
('20000', '2A6DF'),
('2A700', '2B739'),
('2B740', '2B81D'),
('2B820', '2CEA1'),
('2CEB0', '2EBE0'),
('30000', '3134A'),
('31350', '323AF'),
('3400', '4DBF'), # CJK Ideograph Extension A
('4E00', '9FFF'), # CJK Ideograph
('20000', '2A6DF'), # CJK Ideograph Extension B
('2A700', '2B739'), # CJK Ideograph Extension C
('2B740', '2B81D'), # CJK Ideograph Extension D
('2B820', '2CEA1'), # CJK Ideograph Extension E
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
('30000', '3134A'), # CJK Ideograph Extension G
('31350', '323AF'), # CJK Ideograph Extension H
]


Expand Down Expand Up @@ -1111,11 +1112,15 @@ def __init__(self, version, cjk_check=True):
table[i].east_asian_width = widths[i]
self.widths = widths

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if propinfo:
# this is not a binary property, ignore it
continue

if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(p)
table[char].binary_properties.add(propname)

for char_range, value in UcdFile(LINE_BREAK, version):
if value not in MANDATORY_LINE_BREAKS:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

setup(
name="unicodedata2",
version="15.0.0",
version="15.1.0",
description="Unicodedata backport updated to the latest Unicode version.",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
50 changes: 13 additions & 37 deletions tests/test_unicodedata2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,41 +14,6 @@
encoding = 'utf-8'
errors = 'surrogatepass'

MAX_UNICODE_UCS4 = 0x10FFFF

if sys.maxunicode < MAX_UNICODE_UCS4:
# workarounds for Python "narrow" builds with UCS2-only support.

_narrow_unichr = chr

def chr(i):
"""
Return the unicode character whose Unicode code is the integer 'i'.
The valid range is 0 to 0x10FFFF inclusive.
>>> _narrow_unichr(0xFFFF + 1)
Traceback (most recent call last):
File "<stdin>", line 1, in ?
ValueError: unichr() arg not in range(0x10000) (narrow Python build)
>>> chr(0xFFFF + 1) == u'\U00010000'
True
>>> chr(1114111) == u'\U0010FFFF'
True
>>> chr(0x10FFFF + 1)
Traceback (most recent call last):
File "<stdin>", line 1, in ?
ValueError: chr() arg not in range(0x110000)
"""
try:
return _narrow_unichr(i)
except ValueError:
try:
padded_hex_str = hex(i)[2:].zfill(8)
escape_str = "\\U" + padded_hex_str
return escape_str.decode("unicode-escape")
except UnicodeDecodeError:
raise ValueError('chr() arg not in range(0x110000)')


### Run tests

# NOTE: UnicodeMethodsTest upstream tests methods on `str` objects, and
Expand All @@ -68,13 +33,15 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = 'ef638fce5e02dcaa0ad14dd5034314e65f726c62'
expectedchecksum = '232affd2a50ec4bd69d2482aa0291385cbdefaba'

def test_function_checksum(self):
import unicodedata2

data = []
h = hashlib.sha1()

for i in range(0x10000):
for i in range(sys.maxunicode + 1):
char = chr(i)
data = [
# Properties
Expand All @@ -86,11 +53,20 @@ def test_function_checksum(self):
self.db.decomposition(char),
str(self.db.mirrored(char)),
str(self.db.combining(char)),
unicodedata2.east_asian_width(char),
self.db.name(char, ""),
]
h.update(''.join(data).encode("ascii"))
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)

def test_name_inverse_lookup(self):
# Round-trip check over every code point in range(sys.maxunicode + 1):
# whenever the database returns a name for a character, looking that
# name up again must yield the same character.
for i in range(sys.maxunicode + 1):
char = chr(i)
# Passing None as the default makes name() return None instead of
# raising for code points without a name; falsy results are skipped.
looked_name = self.db.name(char, None)
if looked_name:
self.assertEqual(self.db.lookup(looked_name), char)

def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
self.assertEqual(self.db.digit('9'), 9)
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tox]
envlist = py{37,38,39,310,311}, pypy{37,38,39}
envlist = py{37,38,39,310,311,312}, pypy{37,38,39}
skip_missing_interpreters = true

[testenv]
Expand Down
1 change: 1 addition & 0 deletions unicodedata2/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,7 @@ is_unified_ideograph(Py_UCS4 code)
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
(0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
}
Expand Down
Loading

0 comments on commit d924b53

Please sign in to comment.