Merge pull request #58 from SnoopJ/feature/sync-API-with-upstream
Synchronize with CPython 3.7.14 C-API usage, backport
anthrotype authored Oct 28, 2022
2 parents 3057a7e + 75cdcb9 commit ed442aa
Showing 7 changed files with 2,424 additions and 2,419 deletions.
makeunicodedata.py: 140 changes (84 additions, 56 deletions)
@@ -26,13 +26,14 @@
# written by Fredrik Lundh ([email protected])
#

import dataclasses
import os
import sys
import zipfile

from collections import namedtuple
from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"
@@ -76,7 +77,8 @@
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

@@ -135,6 +137,14 @@ def maketables(trace=0):

def makeunicodedata(unicode, trace):

# the default value of east_asian_width is "N", for unassigned code points
# not mentioned in EastAsianWidth.txt
# in addition there are some reserved but unassigned code points in CJK
# ranges that are classified as "W". code points in private use areas
# have a width of "A". both of these have entries in
# EastAsianWidth.txt
# see https://unicode.org/reports/tr11/#Unassigned
assert EASTASIANWIDTH_NAMES[0] == "N"
dummy = (0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
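
Why the reordering matters: unassigned code points resolve to record 0, the all-zero dummy above, so each field's zero value must decode to its default. Field 4 of a record is the East_Asian_Width index (see the item tuple in the next hunk), hence the new assert that "N", the default width, now sits at index 0. A minimal sketch of the invariant:

EASTASIANWIDTH_NAMES = ["N", "H", "W", "Na", "A", "F"]
dummy = (0, 0, 0, 0, 0, 0)   # record 0: category, combining, bidi,
                             # mirrored, east_asian_width, quick_check
assert EASTASIANWIDTH_NAMES[dummy[4]] == "N"   # zero decodes to the default
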
@@ -160,15 +170,25 @@ def makeunicodedata(unicode, trace):
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
)
# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
elif unicode.widths[char] is not None:
# an unassigned but reserved character, with a known
# east_asian_width
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
item = (0, 0, 0, 0, eastasianwidth, 0)
else:
continue

# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
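
The new elif branch gives unassigned-but-reserved code points (CJK ranges, private use areas) a record carrying only their East_Asian_Width, and the interning step is now shared by both branches. A minimal sketch of that interning pattern (variable names hypothetical):

table = []
cache = {}   # record tuple -> index into table

def intern(item):
    i = cache.get(item)
    if i is None:
        cache[item] = i = len(table)
        table.append(item)
    return i

assert intern((0, 0, 0, 0, 5, 0)) == 0   # first occurrence is appended
assert intern((0, 0, 0, 0, 5, 0)) == 0   # duplicates reuse the index
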


# 2) decomposition data

decomp_data_cache = {}
decomp_data = [0]
decomp_prefix = [""]
decomp_index = [0] * len(unicode.chars)
@@ -207,12 +227,15 @@ def makeunicodedata(unicode, trace):
comp_first[l] = 1
comp_last[r] = 1
comp_pairs.append((l,r,char))
try:
i = decomp_data.index(decomp)
except ValueError:
key = tuple(decomp)
i = decomp_data_cache.get(key, -1)
if i == -1:
i = len(decomp_data)
decomp_data.extend(decomp)
decomp_size = decomp_size + len(decomp) * 2
decomp_data_cache[key] = i
else:
assert decomp_data[i:i+len(decomp)] == decomp
else:
i = 0
decomp_index[char] = i
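
The old decomp_data.index(decomp) rescanned the whole flattened array for every character, which is quadratic as the array grows; the new dict keyed on tuple(decomp) makes each lookup O(1). A self-contained sketch of the cached version:

decomp_data = [0]        # flattened decompositions, offset 0 reserved
decomp_data_cache = {}   # tuple(decomp) -> offset into decomp_data

def add_decomp(decomp):
    key = tuple(decomp)
    i = decomp_data_cache.get(key, -1)
    if i == -1:
        i = len(decomp_data)
        decomp_data.extend(decomp)
        decomp_data_cache[key] = i
    return i

first = add_decomp([2, 0x41, 0x300])          # appended at offset 1
assert add_decomp([2, 0x41, 0x300]) == first  # repeat is a cache hit
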
@@ -270,6 +293,7 @@ def makeunicodedata(unicode, trace):
fprint()
fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
fprint("/* a list of unique database records */")
# NOTE: static qualification added by unicodedata2
fprint("static const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
@@ -295,18 +319,21 @@
# the support code moved into unicodedatabase.c

fprint("/* string literals */")
# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_CategoryNames[] = {")
for name in CATEGORY_NAMES:
fprint(" \"%s\"," % name)
fprint(" NULL")
fprint("};")

# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_BidirectionalNames[] = {")
for name in BIDIRECTIONAL_NAMES:
fprint(" \"%s\"," % name)
fprint(" NULL")
fprint("};")

# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_EastAsianWidthNames[] = {")
for name in EASTASIANWIDTH_NAMES:
fprint(" \"%s\"," % name)
@@ -515,6 +542,7 @@ def makeunicodetype(unicode, trace):
fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
fprint()
fprint("/* a list of unique character type descriptors */")
# NOTE: static qualification added by unicodedata2
fprint("static const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
@@ -523,6 +551,7 @@ def makeunicodetype(unicode, trace):

fprint("/* extended case mappings */")
fprint()
# NOTE: static qualification added by unicodedata2
fprint("static const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
for c in extra_casing:
fprint(" %d," % c)
@@ -820,9 +849,9 @@ def merge_old_version(version, new, old):
continue
# check characters that differ
if old.table[i] != new.table[i]:
for k, field_name in enumerate(UcdRecord._fields):
value = getattr(old.table[i], field_name)
new_value = getattr(new.table[i], field_name)
for k, field in enumerate(dataclasses.fields(UcdRecord)):
value = getattr(old.table[i], field.name)
new_value = getattr(new.table[i], field.name)
if value != new_value:
if k == 1 and i in PUA_15:
# the name is not set in the old.table, but in the
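
UcdRecord is now a dataclass (see the hunk further down), so dataclasses.fields() replaces the namedtuple's _fields here; note that it yields Field objects, which is why .name is needed above. A minimal illustration:

import dataclasses

@dataclasses.dataclass
class Point:
    x: int
    y: int

assert [f.name for f in dataclasses.fields(Point)] == ["x", "y"]
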
@@ -892,9 +921,9 @@ def open_data(template, version):
import urllib.request
if version == '3.2.0':
# irregular url structure
url = ('http://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
else:
url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
os.makedirs(DATA_DIR, exist_ok=True)
urllib.request.urlretrieve(url, filename=local)
if local.endswith('.txt'):
@@ -904,7 +933,7 @@
return open(local, 'rb')

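
For reference, the URL templates switched to https in the previous hunk expand like this (shown for the UnicodeData file's template, which the module defines as a constant):

template = 'UnicodeData%s.txt'
version = '3.2.0'
url = ('https://www.unicode.org/Public/3.2-Update/' + template) % ('-' + version,)
assert url == 'https://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt'
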

def expand_range(char_range):
def expand_range(char_range: str) -> Iterator[int]:
'''
Parses ranges of code points, as described in UAX #44:
https://www.unicode.org/reports/tr44/#Code_Point_Ranges
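
The function body is collapsed in this view; a sketch consistent with the UAX #44 convention, where "XXXX..YYYY" denotes an inclusive hex range and a bare value a single code point:

from typing import Iterator

def expand_range_sketch(char_range: str) -> Iterator[int]:
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    yield from range(first, last + 1)

assert list(expand_range_sketch('0030..0032')) == [0x30, 0x31, 0x32]
assert list(expand_range_sketch('0041')) == [0x41]
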
@@ -927,67 +956,65 @@ class UcdFile:
own separate format.
'''

def __init__(self, template, version):
def __init__(self, template: str, version: str) -> None:
self.template = template
self.version = version

def records(self):
def records(self) -> Iterator[List[str]]:
with open_data(self.template, self.version) as file:
for line in file:
line = line.split('#', 1)[0].strip()
if not line:
continue
yield [field.strip() for field in line.split(';')]

def __iter__(self):
def __iter__(self) -> Iterator[List[str]]:
return self.records()

def expanded(self):
def expanded(self) -> Iterator[Tuple[int, List[str]]]:
for record in self.records():
char_range, rest = record[0], record[1:]
for char in expand_range(char_range):
yield char, rest

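
UcdFile.records() yields one list of fields per semicolon-separated line, and expanded() additionally flattens ranged first columns into per-code-point entries. Hypothetical usage (EASTASIAN_WIDTH is assumed to be one of the module's filename-template constants, and the version string is illustrative):

for char, (width,) in UcdFile(EASTASIAN_WIDTH, '15.0.0').expanded():
    assert isinstance(char, int)    # a single code point
    assert isinstance(width, str)   # e.g. 'W', 'Na', 'A'
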

class UcdRecord(namedtuple('UcdRecord', [
@dataclasses.dataclass
class UcdRecord:
# 15 fields from UnicodeData.txt . See:
# https://www.unicode.org/reports/tr44/#UnicodeData.txt
'codepoint',
'name',
'general_category',
'canonical_combining_class',
'bidi_class',
'decomposition_type',
'decomposition_mapping',
'numeric_type',
'numeric_value',
'bidi_mirrored',
'unicode_1_name', # obsolete
'iso_comment', # obsolete
'simple_uppercase_mapping',
'simple_lowercase_mapping',
'simple_titlecase_mapping',
codepoint: str
name: str
general_category: str
canonical_combining_class: str
bidi_class: str
decomposition_type: str
decomposition_mapping: str
numeric_type: str
numeric_value: str
bidi_mirrored: str
unicode_1_name: str # obsolete
iso_comment: str # obsolete
simple_uppercase_mapping: str
simple_lowercase_mapping: str
simple_titlecase_mapping: str

# https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
'east_asian_width',
east_asian_width: Optional[str]

# Binary properties, as a set of those that are true.
# Taken from multiple files:
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
# https://www.unicode.org/reports/tr44/#LineBreak.txt
'binary_properties',
binary_properties: Set[str]

# The Quick_Check properties related to normalization:
# https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
# We store them as a bitmask.
'quick_check',
])):
quick_check: int


@classmethod
def from_row(cls, row):
return cls(
*row, east_asian_width=None, binary_properties=set(), quick_check=0
)
def from_row(row: List[str]) -> UcdRecord:
return UcdRecord(*row, None, set(), 0)

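
from_row pads the fifteen UnicodeData.txt columns with defaults for the three derived fields. Feeding it the (real) UnicodeData.txt row for U+0041:

row = '0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;'.split(';')
rec = from_row(row)
assert rec.general_category == 'Lu'
assert rec.simple_lowercase_mapping == '0061'
assert rec.east_asian_width is None and rec.quick_check == 0
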

# --------------------------------------------------------------------
@@ -1004,7 +1031,7 @@ def __init__(self, version, cjk_check=True):
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = UcdRecord.from_row(s)
table[char] = from_row(s)

cjk_ranges_found = []

@@ -1017,16 +1044,16 @@ def __init__(self, version, cjk_check=True):
s = table[i]
if s:
if s.name[-6:] == "First>":
s = table[i] = s._replace(name="")
field = tuple(s)[:15]
s.name = ""
field = dataclasses.astuple(s)[:15]
elif s.name[-5:] == "Last>":
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s.codepoint))
table[i] = s._replace(name="")
s.name = ""
field = None
elif field:
table[i] = UcdRecord.from_row(('%X' % i,) + field[1:])
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

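
The _replace calls disappear throughout this file because a mutable dataclass can be updated in place where a namedtuple had to be copied; a side-by-side sketch:

import dataclasses
from collections import namedtuple

NTRecord = namedtuple('NTRecord', ['codepoint', 'name'])

@dataclasses.dataclass
class DCRecord:
    codepoint: str
    name: str

nt = NTRecord('3400', '<CJK Ideograph Extension A, First>')
nt = nt._replace(name='')   # namedtuple: allocate a modified copy
dc = DCRecord('3400', '<CJK Ideograph Extension A, First>')
dc.name = ''                # dataclass: mutate the existing record
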
@@ -1047,7 +1074,7 @@ def __init__(self, version, cjk_check=True):
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index] = self.table[pua_index]._replace(name=name)
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)

@@ -1066,7 +1093,7 @@ def __init__(self, version, cjk_check=True):
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index] = self.table[pua_index]._replace(name=name)
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

@@ -1081,7 +1108,8 @@ def __init__(self, version, cjk_check=True):

for i in range(0, 0x110000):
if table[i] is not None:
table[i] = table[i]._replace(east_asian_width=widths[i])
table[i].east_asian_width = widths[i]
self.widths = widths

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if table[char]:
@@ -1115,7 +1143,7 @@ def __init__(self, version, cjk_check=True):
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i] = table[i]._replace(quick_check=quickchecks[i])
table[i].quick_check = quickchecks[i]
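
Each code point's Quick_Check results for the four normalization forms are OR-ed into one integer. The actual bit layout is defined in the elided part of this hunk; purely as an illustration of the bitmask idea (these bit assignments are made up, not the module's real encoding):

NFD_NO, NFKD_NO, NFC_NO, NFKC_NO = 1, 2, 4, 8   # hypothetical bits

quick_check = 0
quick_check |= NFC_NO               # Quick_Check=No for NFC
assert quick_check & NFC_NO         # one AND tests a single form
assert not (quick_check & NFKD_NO)
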

with open_data(UNIHAN, version) as file:
zip = zipfile.ZipFile(file)
Expand All @@ -1134,7 +1162,7 @@ def __init__(self, version, cjk_check=True):
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i] = table[i]._replace(numeric_value=value)
table[i].numeric_value = value
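
The code[2:] above strips the "U+" prefix that Unihan files use for code points:

code = 'U+4E00'
i = int(code[2:], 16)
assert i == 0x4E00   # 19968
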

sc = self.special_casing = {}
for data in UcdFile(SPECIAL_CASING, version):