Merge pull request #58 from SnoopJ/feature/sync-API-with-upstream
Synchronize with CPython 3.7.14 C-API usage, backport
Showing 7 changed files with 2,424 additions and 2,419 deletions.
@@ -26,13 +26,14 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #

+import dataclasses
 import os
 import sys
 import zipfile

-from collections import namedtuple
 from functools import partial
 from textwrap import dedent
+from typing import Iterator, List, Optional, Set, Tuple

 SCRIPT = sys.argv[0]
 VERSION = "3.3"
@@ -76,7 +77,8 @@
     "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
     "ON", "LRI", "RLI", "FSI", "PDI" ]

-EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
+# "N" needs to be the first entry, see the comment in makeunicodedata
+EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

 MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

@@ -135,6 +137,14 @@ def maketables(trace=0):

 def makeunicodedata(unicode, trace):

+    # the default value of east_asian_width is "N", for unassigned code points
+    # not mentioned in EastAsianWidth.txt
+    # in addition there are some reserved but unassigned code points in CJK
+    # ranges that are classified as "W". code points in private use areas
+    # have a width of "A". both of these have entries in
+    # EastAsianWidth.txt
+    # see https://unicode.org/reports/tr11/#Unassigned
+    assert EASTASIANWIDTH_NAMES[0] == "N"
     dummy = (0, 0, 0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
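
The assert ties the reordered EASTASIANWIDTH_NAMES to the record layout: unassigned code points share the all-zero dummy record, so the 0 stored in its east_asian_width slot (field index 4 of the item tuples built below) must decode to the default width "N". A standalone sketch of that invariant:

# Standalone sketch: the dummy record used for unassigned code points is all
# zeros, so whatever name sits at index 0 becomes their east_asian_width.
EASTASIANWIDTH_NAMES = ["N", "H", "W", "Na", "A", "F"]

dummy = (0, 0, 0, 0, 0, 0)   # (category, combining, bidi, mirrored, width, quickcheck)
width_index = dummy[4]       # the east_asian_width slot
assert EASTASIANWIDTH_NAMES[width_index] == "N"   # default per UAX #11
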
@@ -160,15 +170,25 @@ def makeunicodedata(unicode, trace):
                 category, combining, bidirectional, mirrored, eastasianwidth,
                 normalizationquickcheck
                 )
-            # add entry to index and item tables
-            i = cache.get(item)
-            if i is None:
-                cache[item] = i = len(table)
-                table.append(item)
-            index[char] = i
+        elif unicode.widths[char] is not None:
+            # an unassigned but reserved character, with a known
+            # east_asian_width
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
+            item = (0, 0, 0, 0, eastasianwidth, 0)
+        else:
+            continue
+
+        # add entry to index and item tables
+        i = cache.get(item)
+        if i is None:
+            cache[item] = i = len(table)
+            table.append(item)
+        index[char] = i

     # 2) decomposition data

+    decomp_data_cache = {}
     decomp_data = [0]
     decomp_prefix = [""]
     decomp_index = [0] * len(unicode.chars)
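
The reshaped loop now splits into three cases: assigned characters get a full record, reserved-but-unassigned characters with a known width get a width-only record, and everything else keeps the dummy entry at index 0. A condensed, self-contained paraphrase (item_for is a hypothetical helper, not in the script):

EASTASIANWIDTH_NAMES = ["N", "H", "W", "Na", "A", "F"]

def item_for(record, width):
    # record: six-field tuple for an assigned char, else None
    # width: east_asian_width from EastAsianWidth.txt, else None
    if record is not None:      # assigned: full record
        return record
    elif width is not None:     # reserved but unassigned: width only
        return (0, 0, 0, 0, EASTASIANWIDTH_NAMES.index(width), 0)
    else:
        return None             # stays on the dummy record at index 0

assert item_for(None, "W") == (0, 0, 0, 0, 2, 0)   # e.g. a reserved CJK slot
assert item_for(None, None) is None
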
@@ -207,12 +227,15 @@ def makeunicodedata(unicode, trace):
                     comp_first[l] = 1
                     comp_last[r] = 1
                     comp_pairs.append((l,r,char))
-                try:
-                    i = decomp_data.index(decomp)
-                except ValueError:
+                key = tuple(decomp)
+                i = decomp_data_cache.get(key, -1)
+                if i == -1:
                     i = len(decomp_data)
                     decomp_data.extend(decomp)
                     decomp_size = decomp_size + len(decomp) * 2
+                    decomp_data_cache[key] = i
+                else:
+                    assert decomp_data[i:i+len(decomp)] == decomp
             else:
                 i = 0
             decomp_index[char] = i
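
This swaps a linear decomp_data.index(decomp) scan (plus a ValueError on every miss) for a dict keyed by the tuple of the decomposition, turning each lookup from O(len(decomp_data)) into an average O(1). The pattern in isolation:

decomp_data = [0]          # flat array of all decompositions, as in the script
decomp_data_cache = {}     # tuple(decomp) -> offset into decomp_data

def intern_decomp(decomp):
    key = tuple(decomp)                    # lists aren't hashable; tuples are
    i = decomp_data_cache.get(key, -1)
    if i == -1:
        i = len(decomp_data)
        decomp_data.extend(decomp)
        decomp_data_cache[key] = i
    return i

first = intern_decomp([65, 768])           # stored once...
assert intern_decomp([65, 768]) == first   # ...and found again without a scan
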
@@ -270,6 +293,7 @@ def makeunicodedata(unicode, trace):
     fprint()
     fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
     fprint("/* a list of unique database records */")
+    # NOTE: static qualification added by unicodedata2
     fprint("static const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
     for item in table:
         fprint("    {%d, %d, %d, %d, %d, %d}," % item)
@@ -295,18 +319,21 @@ def makeunicodedata(unicode, trace):
     # the support code moved into unicodedatabase.c

     fprint("/* string literals */")
+    # NOTE: static qualification added by unicodedata2
     fprint("static const char *_PyUnicode_CategoryNames[] = {")
     for name in CATEGORY_NAMES:
         fprint("    \"%s\"," % name)
     fprint("    NULL")
     fprint("};")

+    # NOTE: static qualification added by unicodedata2
     fprint("static const char *_PyUnicode_BidirectionalNames[] = {")
     for name in BIDIRECTIONAL_NAMES:
         fprint("    \"%s\"," % name)
     fprint("    NULL")
     fprint("};")

+    # NOTE: static qualification added by unicodedata2
     fprint("static const char *_PyUnicode_EastAsianWidthNames[] = {")
     for name in EASTASIANWIDTH_NAMES:
         fprint("    \"%s\"," % name)
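
For context on these calls: fprint in this script is print bound to the output file with functools.partial (hence the partial import kept in the first hunk), so each call emits one line of the generated C header. A minimal standalone equivalent, writing to an in-memory buffer:

import io
from functools import partial

fp = io.StringIO()               # stand-in for the generated .h file
fprint = partial(print, file=fp)

fprint("static const char *_PyUnicode_CategoryNames[] = {")
fprint("    NULL")
fprint("};")
assert fp.getvalue().startswith("static const char")
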
@@ -515,6 +542,7 @@ def makeunicodetype(unicode, trace):
     fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
     fprint()
     fprint("/* a list of unique character type descriptors */")
+    # NOTE: static qualification added by unicodedata2
     fprint("static const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
     for item in table:
         fprint("    {%d, %d, %d, %d, %d, %d}," % item)
@@ -523,6 +551,7 @@ def makeunicodetype(unicode, trace):

     fprint("/* extended case mappings */")
     fprint()
+    # NOTE: static qualification added by unicodedata2
     fprint("static const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
     for c in extra_casing:
         fprint("    %d," % c)
@@ -820,9 +849,9 @@ def merge_old_version(version, new, old):
             continue
         # check characters that differ
         if old.table[i] != new.table[i]:
-            for k, field_name in enumerate(UcdRecord._fields):
-                value = getattr(old.table[i], field_name)
-                new_value = getattr(new.table[i], field_name)
+            for k, field in enumerate(dataclasses.fields(UcdRecord)):
+                value = getattr(old.table[i], field.name)
+                new_value = getattr(new.table[i], field.name)
                 if value != new_value:
                     if k == 1 and i in PUA_15:
                         # the name is not set in the old.table, but in the
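
dataclasses.fields() returns Field objects in declaration order, so enumerating field.name visits exactly what the namedtuple's _fields tuple did, and the k == 1 check (the name field) keeps working. A quick equivalence check with a two-field stand-in:

import dataclasses
from collections import namedtuple

NT = namedtuple("NT", ["codepoint", "name"])   # old-style record

@dataclasses.dataclass
class DC:                                      # new-style record
    codepoint: str
    name: str

assert [f.name for f in dataclasses.fields(DC)] == list(NT._fields)
assert [f.name for f in dataclasses.fields(DC)].index("name") == 1
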
@@ -892,9 +921,9 @@ def open_data(template, version):
         import urllib.request
         if version == '3.2.0':
             # irregular url structure
-            url = ('http://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
+            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
         else:
-            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
         os.makedirs(DATA_DIR, exist_ok=True)
         urllib.request.urlretrieve(url, filename=local)
     if local.endswith('.txt'):
@@ -904,7 +933,7 @@ def open_data(template, version):
         return open(local, 'rb')


-def expand_range(char_range):
+def expand_range(char_range: str) -> Iterator[int]:
     '''
     Parses ranges of code points, as described in UAX #44:
       https://www.unicode.org/reports/tr44/#Code_Point_Ranges
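
The new annotation documents what the body already did. A self-contained sketch of the behavior the docstring describes, assuming the usual UAX #44 forms (a bare hex code point, or first..last inclusive); the script's own body may differ in detail:

from typing import Iterator

def expand_range(char_range: str) -> Iterator[int]:
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    yield from range(first, last + 1)

assert list(expand_range("00C0")) == [0xC0]
assert list(expand_range("3400..3402")) == [0x3400, 0x3401, 0x3402]
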
@@ -927,67 +956,65 @@ class UcdFile:
     own separate format.
     '''

-    def __init__(self, template, version):
+    def __init__(self, template: str, version: str) -> None:
         self.template = template
         self.version = version

-    def records(self):
+    def records(self) -> Iterator[List[str]]:
         with open_data(self.template, self.version) as file:
             for line in file:
                 line = line.split('#', 1)[0].strip()
                 if not line:
                     continue
                 yield [field.strip() for field in line.split(';')]

-    def __iter__(self):
+    def __iter__(self) -> Iterator[List[str]]:
         return self.records()

-    def expanded(self):
+    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
         for record in self.records():
             char_range, rest = record[0], record[1:]
             for char in expand_range(char_range):
                 yield char, rest


-class UcdRecord(namedtuple('UcdRecord', [
+@dataclasses.dataclass
+class UcdRecord:
     # 15 fields from UnicodeData.txt . See:
     # https://www.unicode.org/reports/tr44/#UnicodeData.txt
-    'codepoint',
-    'name',
-    'general_category',
-    'canonical_combining_class',
-    'bidi_class',
-    'decomposition_type',
-    'decomposition_mapping',
-    'numeric_type',
-    'numeric_value',
-    'bidi_mirrored',
-    'unicode_1_name',  # obsolete
-    'iso_comment',  # obsolete
-    'simple_uppercase_mapping',
-    'simple_lowercase_mapping',
-    'simple_titlecase_mapping',
+    codepoint: str
+    name: str
+    general_category: str
+    canonical_combining_class: str
+    bidi_class: str
+    decomposition_type: str
+    decomposition_mapping: str
+    numeric_type: str
+    numeric_value: str
+    bidi_mirrored: str
+    unicode_1_name: str  # obsolete
+    iso_comment: str  # obsolete
+    simple_uppercase_mapping: str
+    simple_lowercase_mapping: str
+    simple_titlecase_mapping: str

     # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
-    'east_asian_width',
+    east_asian_width: Optional[str]

     # Binary properties, as a set of those that are true.
     # Taken from multiple files:
     # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
     # https://www.unicode.org/reports/tr44/#LineBreak.txt
-    'binary_properties',
+    binary_properties: Set[str]

     # The Quick_Check properties related to normalization:
     # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
     # We store them as a bitmask.
-    'quick_check',
-])):
-
-    @classmethod
-    def from_row(cls, row):
-        return cls(
-            *row, east_asian_width=None, binary_properties=set(), quick_check=0
-        )
+    quick_check: int
+
+
+def from_row(row: List[str]) -> UcdRecord:
+    return UcdRecord(*row, None, set(), 0)


 # --------------------------------------------------------------------
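
The motivation for trading the namedtuple for a dataclass shows up in the hunks below: records can now be mutated in place (s.name = "") instead of being rebuilt with _replace() and stored back. A minimal before/after with a two-field stand-in for UcdRecord:

import dataclasses
from collections import namedtuple

OldRec = namedtuple("OldRec", ["codepoint", "name"])
old = OldRec("E000", "<reserved>")
old = old._replace(name="")            # namedtuple: copy, then reassign

@dataclasses.dataclass
class NewRec:
    codepoint: str
    name: str

new = NewRec("E000", "<reserved>")
new.name = ""                          # dataclass: mutate in place
assert (old.name, new.name) == ("", "")
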
@@ -1004,7 +1031,7 @@ def __init__(self, version, cjk_check=True):
         table = [None] * 0x110000
         for s in UcdFile(UNICODE_DATA, version):
             char = int(s[0], 16)
-            table[char] = UcdRecord.from_row(s)
+            table[char] = from_row(s)

         cjk_ranges_found = []

@@ -1017,16 +1044,16 @@ def __init__(self, version, cjk_check=True):
             s = table[i]
             if s:
                 if s.name[-6:] == "First>":
-                    s = table[i] = s._replace(name="")
-                    field = tuple(s)[:15]
+                    s.name = ""
+                    field = dataclasses.astuple(s)[:15]
                 elif s.name[-5:] == "Last>":
                     if s.name.startswith("<CJK Ideograph"):
                         cjk_ranges_found.append((field[0],
                                                  s.codepoint))
-                    table[i] = s._replace(name="")
+                    s.name = ""
                     field = None
             elif field:
-                table[i] = UcdRecord.from_row(('%X' % i,) + field[1:])
+                table[i] = from_row(('%X' % i,) + field[1:])
         if cjk_check and cjk_ranges != cjk_ranges_found:
             raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
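
The field tuple saved at a "<..., First>" record is re-stamped onto every code point up to the matching "<..., Last>" record, swapping in each code point's own hex string; that is what the ('%X' % i,) + field[1:] expression builds. In isolation:

# fields of a hypothetical "<CJK Ideograph Extension A, First>" row,
# truncated to four of the fifteen UnicodeData.txt columns for brevity
field = ("3400", "", "Lo", "0")

rows = [('%X' % i,) + field[1:] for i in range(0x3400, 0x3403)]
assert rows[0] == ("3400", "", "Lo", "0")
assert rows[-1] == ("3402", "", "Lo", "0")   # same data, its own code point
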
@@ -1047,7 +1074,7 @@ def __init__(self, version, cjk_check=True):
             char = int(char, 16)
             self.aliases.append((name, char))
             # also store the name in the PUA 1
-            self.table[pua_index] = self.table[pua_index]._replace(name=name)
+            self.table[pua_index].name = name
             pua_index += 1
         assert pua_index - NAME_ALIASES_START == len(self.aliases)
@@ -1066,7 +1093,7 @@ def __init__(self, version, cjk_check=True):
                                  "the NamedSequence struct and in unicodedata_lookup")
             self.named_sequences.append((name, chars))
             # also store these in the PUA 1
-            self.table[pua_index] = self.table[pua_index]._replace(name=name)
+            self.table[pua_index].name = name
             pua_index += 1
         assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
@@ -1081,7 +1108,8 @@ def __init__(self, version, cjk_check=True):

         for i in range(0, 0x110000):
             if table[i] is not None:
-                table[i] = table[i]._replace(east_asian_width=widths[i])
+                table[i].east_asian_width = widths[i]
+        self.widths = widths

         for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
             if table[char]:
@@ -1115,7 +1143,7 @@ def __init__(self, version, cjk_check=True):
                 quickchecks[char] |= quickcheck
         for i in range(0, 0x110000):
             if table[i] is not None:
-                table[i] = table[i]._replace(quick_check=quickchecks[i])
+                table[i].quick_check = quickchecks[i]

         with open_data(UNIHAN, version) as file:
             zip = zipfile.ZipFile(file)
@@ -1134,7 +1162,7 @@ def __init__(self, version, cjk_check=True):
             i = int(code[2:], 16)
             # Patch the numeric field
             if table[i] is not None:
-                table[i] = table[i]._replace(numeric_value=value)
+                table[i].numeric_value = value

         sc = self.special_casing = {}
         for data in UcdFile(SPECIAL_CASING, version):