Synchronize with CPython 3.7.14 C-API usage, backport #58

Merged 18 commits on Oct 28, 2022
140 changes: 84 additions & 56 deletions makeunicodedata.py
@@ -26,13 +26,14 @@
# written by Fredrik Lundh ([email protected])
#

import dataclasses
import os
import sys
import zipfile

from collections import namedtuple
from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"
@@ -76,7 +77,8 @@
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

@@ -135,6 +137,14 @@ def maketables(trace=0):

def makeunicodedata(unicode, trace):

# the default value of east_asian_width is "N", for unassigned code points
# not mentioned in EastAsianWidth.txt
# in addition there are some reserved but unassigned code points in CJK
# ranges that are classified as "W". code points in private use areas
# have a width of "A". both of these have entries in
# EastAsianWidth.txt
# see https://unicode.org/reports/tr11/#Unassigned
assert EASTASIANWIDTH_NAMES[0] == "N"
dummy = (0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
@@ -160,15 +170,25 @@ def makeunicodedata(unicode, trace):
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
)
# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
elif unicode.widths[char] is not None:
# an unassigned but reserved character, with a known
# east_asian_width
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
item = (0, 0, 0, 0, eastasianwidth, 0)
else:
continue

# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
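The elif branch above encodes UAX #11's defaults for code points that lack a UnicodeData.txt record. A hedged sketch of those rules in isolation (ranges abridged from TR11 §Unassigned for illustration; the script itself takes them from EastAsianWidth.txt):

    def tr11_default_width(cp: int) -> str:
        if 0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF:
            return "W"   # reserved code points in CJK ideograph ranges
        if 0xE000 <= cp <= 0xF8FF or cp >= 0xF0000:
            return "A"   # private use areas
        return "N"       # all other unassigned code points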


# 2) decomposition data

decomp_data_cache = {}
decomp_data = [0]
decomp_prefix = [""]
decomp_index = [0] * len(unicode.chars)
@@ -207,12 +227,15 @@ def makeunicodedata(unicode, trace):
comp_first[l] = 1
comp_last[r] = 1
comp_pairs.append((l,r,char))
try:
i = decomp_data.index(decomp)
except ValueError:
key = tuple(decomp)
i = decomp_data_cache.get(key, -1)
if i == -1:
i = len(decomp_data)
decomp_data.extend(decomp)
decomp_size = decomp_size + len(decomp) * 2
decomp_data_cache[key] = i
else:
assert decomp_data[i:i+len(decomp)] == decomp
else:
i = 0
decomp_index[char] = i
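The new decomp_data_cache replaces a linear rescan of the growing flat array with a dict lookup keyed on the decomposition tuple. The same interning pattern in isolation (a sketch with hypothetical names):

    flat_data, offset_of = [0], {}

    def intern(seq):
        # store each unique sequence once; return its offset into flat_data
        key = tuple(seq)
        i = offset_of.get(key, -1)
        if i == -1:
            i = len(flat_data)
            flat_data.extend(seq)
            offset_of[key] = i
        return i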
@@ -270,6 +293,7 @@ def makeunicodedata(unicode, trace):
fprint()
fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
fprint("/* a list of unique database records */")
# NOTE: static qualification added by unicodedata2
fprint("static const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
@@ -295,18 +319,21 @@
# the support code moved into unicodedatabase.c

fprint("/* string literals */")
# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_CategoryNames[] = {")
for name in CATEGORY_NAMES:
fprint(" \"%s\"," % name)
fprint(" NULL")
fprint("};")

# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_BidirectionalNames[] = {")
for name in BIDIRECTIONAL_NAMES:
fprint(" \"%s\"," % name)
fprint(" NULL")
fprint("};")

# NOTE: static qualification added by unicodedata2
fprint("static const char *_PyUnicode_EastAsianWidthNames[] = {")
for name in EASTASIANWIDTH_NAMES:
fprint(" \"%s\"," % name)
@@ -515,6 +542,7 @@ def makeunicodetype(unicode, trace):
fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
fprint()
fprint("/* a list of unique character type descriptors */")
# NOTE: static qualification added by unicodedata2
fprint("static const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d}," % item)
@@ -523,6 +551,7 @@ def makeunicodetype(unicode, trace):

fprint("/* extended case mappings */")
fprint()
# NOTE: static qualification added by unicodedata2
fprint("static const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
for c in extra_casing:
fprint(" %d," % c)
@@ -820,9 +849,9 @@ def merge_old_version(version, new, old):
continue
# check characters that differ
if old.table[i] != new.table[i]:
for k, field_name in enumerate(UcdRecord._fields):
value = getattr(old.table[i], field_name)
new_value = getattr(new.table[i], field_name)
for k, field in enumerate(dataclasses.fields(UcdRecord)):
value = getattr(old.table[i], field.name)
new_value = getattr(new.table[i], field.name)
if value != new_value:
if k == 1 and i in PUA_15:
# the name is not set in the old.table, but in the
@@ -892,9 +921,9 @@ def open_data(template, version):
import urllib.request
if version == '3.2.0':
# irregular url structure
url = ('http://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
else:
url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
os.makedirs(DATA_DIR, exist_ok=True)
urllib.request.urlretrieve(url, filename=local)
if local.endswith('.txt'):
@@ -904,7 +933,7 @@
return open(local, 'rb')
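As a worked example of the URL construction above (the version and template pairing is assumed for illustration):

    template = 'UnicodeData%s.txt'
    url = ('https://www.unicode.org/Public/%s/ucd/' + template) % ('14.0.0', '')
    # -> 'https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt'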


def expand_range(char_range):
def expand_range(char_range: str) -> Iterator[int]:
'''
Parses ranges of code points, as described in UAX #44:
https://www.unicode.org/reports/tr44/#Code_Point_Ranges
@@ -927,67 +956,65 @@ class UcdFile:
own separate format.
'''

def __init__(self, template, version):
def __init__(self, template: str, version: str) -> None:
self.template = template
self.version = version

def records(self):
def records(self) -> Iterator[List[str]]:
with open_data(self.template, self.version) as file:
for line in file:
line = line.split('#', 1)[0].strip()
if not line:
continue
yield [field.strip() for field in line.split(';')]

def __iter__(self):
def __iter__(self) -> Iterator[List[str]]:
return self.records()

def expanded(self):
def expanded(self) -> Iterator[Tuple[int, List[str]]]:
for record in self.records():
char_range, rest = record[0], record[1:]
for char in expand_range(char_range):
yield char, rest
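Put together, a hedged usage sketch of UcdFile (the file template and version here are assumptions, not values from this diff):

    for char, (width,) in UcdFile('EastAsianWidth%s.txt', '3.2.0').expanded():
        print(hex(char), width)   # char is an int code point, width e.g. 'W'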


class UcdRecord(namedtuple('UcdRecord', [
@dataclasses.dataclass
class UcdRecord:
# 15 fields from UnicodeData.txt. See:
# https://www.unicode.org/reports/tr44/#UnicodeData.txt
'codepoint',
'name',
'general_category',
'canonical_combining_class',
'bidi_class',
'decomposition_type',
'decomposition_mapping',
'numeric_type',
'numeric_value',
'bidi_mirrored',
'unicode_1_name', # obsolete
'iso_comment', # obsolete
'simple_uppercase_mapping',
'simple_lowercase_mapping',
'simple_titlecase_mapping',
codepoint: str
name: str
general_category: str
canonical_combining_class: str
bidi_class: str
decomposition_type: str
decomposition_mapping: str
numeric_type: str
numeric_value: str
bidi_mirrored: str
unicode_1_name: str # obsolete
iso_comment: str # obsolete
simple_uppercase_mapping: str
simple_lowercase_mapping: str
simple_titlecase_mapping: str

# https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
'east_asian_width',
east_asian_width: Optional[str]

# Binary properties, as a set of those that are true.
# Taken from multiple files:
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
# https://www.unicode.org/reports/tr44/#LineBreak.txt
'binary_properties',
binary_properties: Set[str]

# The Quick_Check properties related to normalization:
# https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
# We store them as a bitmask.
'quick_check',
])):
quick_check: int


@classmethod
def from_row(cls, row):
return cls(
*row, east_asian_width=None, binary_properties=set(), quick_check=0
)
def from_row(row: List[str]) -> UcdRecord:
return UcdRecord(*row, None, set(), 0)
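The practical effect of trading the namedtuple for a dataclass shows up at every later write: records are now mutated in place. A sketch (the empty strings pad out the 15 UnicodeData.txt columns):

    rec = from_row(['0041', 'LATIN CAPITAL LETTER A'] + [''] * 13)
    rec.east_asian_width = 'Na'   # dataclass: plain assignment
    # namedtuple before: rec = rec._replace(east_asian_width='Na')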


# --------------------------------------------------------------------
@@ -1004,7 +1031,7 @@ def __init__(self, version, cjk_check=True):
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = UcdRecord.from_row(s)
table[char] = from_row(s)

cjk_ranges_found = []

@@ -1017,16 +1044,16 @@ def __init__(self, version, cjk_check=True):
s = table[i]
if s:
if s.name[-6:] == "First>":
s = table[i] = s._replace(name="")
field = tuple(s)[:15]
s.name = ""
field = dataclasses.astuple(s)[:15]
elif s.name[-5:] == "Last>":
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s.codepoint))
table[i] = s._replace(name="")
s.name = ""
field = None
elif field:
table[i] = UcdRecord.from_row(('%X' % i,) + field[1:])
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

@@ -1047,7 +1074,7 @@ def __init__(self, version, cjk_check=True):
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index] = self.table[pua_index]._replace(name=name)
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)

@@ -1066,7 +1093,7 @@ def __init__(self, version, cjk_check=True):
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index] = self.table[pua_index]._replace(name=name)
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

@@ -1081,7 +1108,8 @@ def __init__(self, version, cjk_check=True):

for i in range(0, 0x110000):
if table[i] is not None:
table[i] = table[i]._replace(east_asian_width=widths[i])
table[i].east_asian_width = widths[i]
self.widths = widths

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if table[char]:
@@ -1115,7 +1143,7 @@ def __init__(self, version, cjk_check=True):
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i] = table[i]._replace(quick_check=quickchecks[i])
table[i].quick_check = quickchecks[i]

with open_data(UNIHAN, version) as file:
zip = zipfile.ZipFile(file)
@@ -1134,7 +1162,7 @@ def __init__(self, version, cjk_check=True):
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i] = table[i]._replace(numeric_value=value)
table[i].numeric_value = value
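An illustrative Unihan_NumericValues.txt record of the kind being parsed here (tab-separated; quoted from Unihan data but treat it as an example):

    code, field, value = 'U+4E07\tkPrimaryNumeric\t10000'.split('\t')
    i = int(code[2:], 16)   # 0x4E07, 万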

sc = self.special_casing = {}
for data in UcdFile(SPECIAL_CASING, version):