diff --git a/.travis.yml b/.travis.yml index c447fb6..d487e61 100644 --- a/.travis.yml +++ b/.travis.yml @@ -56,6 +56,9 @@ matrix: - os: linux env: - MB_PYTHON_VERSION=3.8 + - os: linux + env: + - MB_PYTHON_VERSION=3.9 - os: linux env: - MB_PYTHON_VERSION=3.7 @@ -80,6 +83,10 @@ matrix: language: generic env: - MB_PYTHON_VERSION=3.8 + - os: osx + language: generic + env: + - MB_PYTHON_VERSION=3.9 before_install: - source multibuild/common_utils.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 417a08d..2950b1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 13.0.0-3 + - Port https://github.com/python/cpython/commit/d134809cd3764c6a634eab7bb8995e3e2eff14d5 to unicodedata2 + - Port is_normalized to unicodedata2 (https://github.com/python/cpython/commit/2810dd7be9876236f74ac80716d113572c9098dd & https://github.com/python/cpython/commit/2f09413947d1ce0043de62ed2346f9a2b4e5880b) + - Port https://github.com/python/cpython/commit/c8c4200b65b2159bbb13cee10d67dfb3676fef26 & friends to follow PEP489 + ## 13.0.0 - Upgrade to Unicode 13.0.0 diff --git a/LICENSE b/LICENSE index e06d208..1179844 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,9 @@ Apache License same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright {yyyy} {name of copyright owner} + Copyright 2021 unicodedata2 contributors + Copyright © 2001-2021 Python Software Foundation; All Rights + Reserved Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/appveyor.yml b/appveyor.yml index 2745204..fd8a8dc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -41,6 +41,14 @@ environment: PYTHON_VERSION: "3.8.x" PYTHON_ARCH: "64" + - PYTHON: "C:\\Python39" + PYTHON_VERSION: "3.9.x" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python39-x64" + PYTHON_VERSION: "3.9.x" + PYTHON_ARCH: "64" + matrix: fast_finish: true diff --git a/multibuild b/multibuild index ec50c47..be06f5f 160000 --- a/multibuild +++ b/multibuild @@ -1 +1 @@ -Subproject commit ec50c47fcbb860abd25ac27a6c86215b8f4620b5 +Subproject commit be06f5f857fa6865701da4980f3e879b10c6b717 diff --git a/setup.py b/setup.py index 5efda65..b2e7193 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name="unicodedata2", - version="13.0.0-2", + version="13.0.0-3", description="Unicodedata backport for Python 2/3 updated to the latest Unicode version.", long_description=long_description, long_description_content_type="text/markdown", diff --git a/tests/test_normalization.py b/tests/test_normalization.py new file mode 100644 index 0000000..4a1ca36 --- /dev/null +++ b/tests/test_normalization.py @@ -0,0 +1,128 @@ +""" +Taken from cpython test_normalization.py. 
+(c) 2021 PSF +""" + +try: + from urllib.request import urlretrieve +except: + from urllib import urlretrieve +import unittest + +import sys +from unicodedata2 import normalize, is_normalized, unidata_version + +TESTDATAFILE = "NormalizationTest.txt" +TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE + +def check_version(testfile): + hdr = testfile.readline() + return unidata_version in hdr + +class RangeError(Exception): + pass + +def NFC(str): + return normalize("NFC", str) + +def NFKC(str): + return normalize("NFKC", str) + +def NFD(str): + return normalize("NFD", str) + +def NFKD(str): + return normalize("NFKD", str) + +chr = chr if sys.version_info[0] >= 3 else unichr + +def unistr(data): + data = [int(x, 16) for x in data.split(" ")] + for x in data: + if x > sys.maxunicode: + raise RangeError + return "".join([chr(x) for x in data]) + +class NormalizationTest(unittest.TestCase): + def test_main(self): + # Hit the exception early + try: + kwargs = {} + if sys.version_info[0] >= 3: + kwargs['encoding'] = "utf-8" + filename, _ = urlretrieve(TESTDATAURL) + testdata = open(filename, **kwargs) + if not check_version(testdata): + raise ValueError('Bad test data file') + except OSError: + self.fail("Could not retrieve {TESTDATAURL}".format(**globals())) + + with testdata: + self.run_normalization_tests(testdata) + + def run_normalization_tests(self, testdata): + part = None + part1_data = {} + + for line in testdata: + if '#' in line: + line = line.split('#')[0] + line = line.strip() + if not line: + continue + if line.startswith("@Part"): + part = line.split()[0] + continue + try: + c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]] + except RangeError: + # Skip unsupported characters; + # try at least adding c1 if we are in part1 + if part == "@Part1": + try: + c1 = unistr(line.split(';')[0]) + except RangeError: + pass + else: + part1_data[c1] = 1 + continue + + # Perform tests + self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line) + self.assertTrue(c4 == NFC(c4) == NFC(c5), line) + self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line) + self.assertTrue(c5 == NFD(c4) == NFD(c5), line) + self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \ + NFKC(c3) == NFKC(c4) == NFKC(c5), + line) + self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \ + NFKD(c3) == NFKD(c4) == NFKD(c5), + line) + + self.assertTrue(is_normalized("NFC", c2)) + self.assertTrue(is_normalized("NFC", c4)) + + self.assertTrue(is_normalized("NFD", c3)) + self.assertTrue(is_normalized("NFD", c5)) + + self.assertTrue(is_normalized("NFKC", c4)) + self.assertTrue(is_normalized("NFKD", c5)) + + # Record part 1 data + if part == "@Part1": + part1_data[c1] = 1 + + # Perform tests for all other data + for c in range(sys.maxunicode+1): + X = chr(c) + if X in part1_data: + continue + self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c) + + def test_bug_834676(self): + # Check for bug 834676 + normalize('NFC', u'\ud55c\uae00') + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_unicodedata2.py b/tests/test_unicodedata2.py index 0d918ed..e840134 100644 --- a/tests/test_unicodedata2.py +++ b/tests/test_unicodedata2.py @@ -199,6 +199,9 @@ def test_issue10254(self): b = 'C\u0338' * 20 + '\xC7' self.assertEqual(self.db.normalize('NFC', a), b) + # For tests of unicodedata.is_normalized / self.db.is_normalized , + # see test_normalization.py . 
+ def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a') @@ -217,6 +220,20 @@ def test_east_asian_width_9_0_changes(self): self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') self.assertEqual(self.db.east_asian_width('\u231a'), 'W') + # Taken from https://github.com/python/cpython/commit/d134809cd3764c6a634eab7bb8995e3e2eff14d5 + def test_issue29456(self): + # Fix #29456 + u1176_str_a = '\u1100\u1176\u11a8' + u1176_str_b = '\u1100\u1176\u11a8' + u11a7_str_a = '\u1100\u1175\u11a7' + u11a7_str_b = '\uae30\u11a7' + u11c3_str_a = '\u1100\u1175\u11c3' + u11c3_str_b = '\uae30\u11c3' + self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b) + self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) + self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) + + class UnicodeMiscTest(UnicodeDatabaseTest): def test_decimal_numeric_consistent(self): diff --git a/unicodedata2/py2/unicodedata.c b/unicodedata2/py2/unicodedata.c index 3cf06cd..e97373f 100644 --- a/unicodedata2/py2/unicodedata.c +++ b/unicodedata2/py2/unicodedata.c @@ -17,6 +17,14 @@ #include "structmember.h" #include "unicodectype.h" +#ifdef MS_WINDOWS +typedef int bool; +#define true 1 +#define false 0 +#else +#include +#endif + #if PY_MAJOR_VERSION == 2 && (PY_MINOR_VERSION < 7 || PY_MICRO_VERSION < 3) #define Py_TOUPPER(c) toupper(c) #endif @@ -665,14 +673,18 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) pairs, since we always have decomposed data. */ if (LBase <= *i && *i < (LBase+LCount) && i + 1 < end && - VBase <= i[1] && i[1] <= (VBase+VCount)) { + VBase <= i[1] && i[1] < (VBase+VCount)) { + /* check L character is a modern leading consonant (0x1100 ~ 0x1112) + and V character is a modern vowel (0x1161 ~ 0x1175). */ int LIndex, VIndex; LIndex = i[0] - LBase; VIndex = i[1] - VBase; code = SBase + (LIndex*VCount+VIndex)*TCount; i+=2; if (i < end && - TBase <= *i && *i <= (TBase+TCount)) { + TBase < *i && *i < (TBase+TCount)) { + /* check T character is a modern trailing consonant + (0x11A8 ~ 0x11C2). */ code += *i-TBase; i++; } @@ -735,36 +747,138 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) return result; } -/* Return 1 if the input is certainly normalized, 0 if it might not be. */ -static int -is_normalized(PyObject *self, PyObject *input, int nfc, int k) + +// This needs to match the logic in makeunicodedata.py +// which constructs the quickcheck data. +typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; + +/* Run the Unicode normalization "quickcheck" algorithm. + * + * Return YES or NO if quickcheck determines the input is certainly + * normalized or certainly not, and MAYBE if quickcheck is unable to + * tell. + * + * If `yes_only` is true, then return MAYBE as soon as we determine + * the answer is not YES. + * + * For background and details on the algorithm, see UAX #15: + * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + */ +static QuickcheckResult +is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k, + bool yes_only) { - Py_UNICODE *i, *end; - unsigned char prev_combining = 0, quickcheck_mask; + Py_ssize_t i, len; + Py_UNICODE *data; + unsigned char prev_combining = 0; - /* An older version of the database is requested, quickchecks must be - disabled. */ - if (self != NULL) + /* The two quickcheck bits at this shift have type QuickcheckResult. */ + int quickcheck_shift = (nfc ? 4 : 0) + (k ? 
2 : 0); + + QuickcheckResult result = YES; /* certainly normalized, unless we find something */ + + /* UCD 3.2.0 is requested, quickchecks must be disabled. */ + if (self != NULL) { return 0; + } - /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, - as described in http://unicode.org/reports/tr15/#Annex8. */ - quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); + i = 0; + data = PyUnicode_AS_UNICODE(input); + len = PyUnicode_GET_SIZE(input); + while (i < len) { + Py_UCS4 ch = *(data + i++); + const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); - i = PyUnicode_AS_UNICODE(input); - end = i + PyUnicode_GET_SIZE(input); - while (i < end) { - const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++); unsigned char combining = record->combining; - unsigned char quickcheck = record->normalization_quick_check; + unsigned char quickcheck_whole = record->normalization_quick_check; - if (quickcheck & quickcheck_mask) - return 0; /* this string might need normalization */ if (combining && prev_combining > combining) - return 0; /* non-canonical sort order, not normalized */ + return NO; /* non-canonical sort order, not normalized */ prev_combining = combining; + + if (yes_only) { + if (quickcheck_whole & (3 << quickcheck_shift)) + return MAYBE; + } else { + switch ((quickcheck_whole >> quickcheck_shift) & 3) { + case NO: + return NO; + case MAYBE: + result = MAYBE; /* this string might need normalization */ + } + } } - return 1; /* certainly normalized */ + return result; +} + + +PyDoc_STRVAR(unicodedata_is_normalized__doc__, +"is_normalized($self, form, unistr, /)\n" +"--\n" +"\n" +"Return whether the Unicode string unistr is in the normal form \'form\'.\n" +"\n" +"Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'."); + +static PyObject * +unicodedata_is_normalized(PyObject *self, PyObject *args) +/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ +{ + char *form; + PyObject *input; + + PyObject *result; + bool nfc = false; + bool k = false; + QuickcheckResult m; + + PyObject *cmp; + int match = 0; + + if(!PyArg_ParseTuple(args, "sO!:is_normalized", + &form, &PyUnicode_Type, &input)) + return NULL; + + if (PyUnicode_GetSize(input) == 0) { + /* special case empty input strings. */ + Py_RETURN_TRUE; + } + + if (strcmp(form, "NFC") == 0) { + nfc = true; + } + else if (strcmp(form, "NFKC") == 0) { + nfc = true; + k = true; + } + else if (strcmp(form, "NFD") == 0) { + /* matches default values for `nfc` and `k` */ + } + else if (strcmp(form, "NFKD") == 0) { + k = true; + } + else { + PyErr_SetString(PyExc_ValueError, "invalid normalization form"); + return NULL; + } + + m = is_normalized_quickcheck(self, input, nfc, k, false); + + if (m == MAYBE) { + cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); + if (cmp == NULL) { + return NULL; + } + match = PyUnicode_Compare(input, cmp); + Py_DECREF(cmp); + result = (match == 0) ? Py_True : Py_False; + } + else { + result = (m == YES) ? 
Py_True : Py_False; + } + + Py_INCREF(result); + return result; } PyDoc_STRVAR(unicodedata_normalize__doc__, @@ -791,28 +905,32 @@ unicodedata_normalize(PyObject *self, PyObject *args) } if (strcmp(form, "NFC") == 0) { - if (is_normalized(self, input, 1, 0)) { + if (is_normalized_quickcheck(self, input, + true, false, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } if (strcmp(form, "NFKC") == 0) { - if (is_normalized(self, input, 1, 1)) { + if (is_normalized_quickcheck(self, input, + true, true, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } if (strcmp(form, "NFD") == 0) { - if (is_normalized(self, input, 0, 0)) { + if (is_normalized_quickcheck(self, input, + false, false, true) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } if (strcmp(form, "NFKD") == 0) { - if (is_normalized(self, input, 0, 1)) { + if (is_normalized_quickcheck(self, input, + false, true, true) == YES) { Py_INCREF(input); return input; } @@ -1234,6 +1352,8 @@ static PyMethodDef unicodedata_functions[] = { {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__}, {"normalize", unicodedata_normalize, METH_VARARGS, unicodedata_normalize__doc__}, + {"is_normalized", unicodedata_is_normalized, METH_VARARGS, + unicodedata_is_normalized__doc__}, {NULL, NULL} /* sentinel */ }; diff --git a/unicodedata2/py3/unicodedata.c b/unicodedata2/py3/unicodedata.c index 5fbf00d..bac6cd2 100644 --- a/unicodedata2/py3/unicodedata.c +++ b/unicodedata2/py3/unicodedata.c @@ -16,15 +16,27 @@ #define PY_SSIZE_T_CLEAN #include "Python.h" -#include "ucnhash.h" #include "structmember.h" #include "unicodectype.h" +#ifdef MS_WINDOWS +typedef int bool; +#define true 1 +#define false 0 +#else +#include +#endif + +_Py_IDENTIFIER(NFC); +_Py_IDENTIFIER(NFD); +_Py_IDENTIFIER(NFKC); +_Py_IDENTIFIER(NFKD); + /*[clinic input] module unicodedata -class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type' +class unicodedata.UCD 'PreviousDBVersion *' '' [clinic start generated code]*/ -/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/ /* character properties */ @@ -84,22 +96,25 @@ static PyMemberDef DB_members[] = { {NULL} }; -/* forward declaration */ -static PyTypeObject UCD_Type; -#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type) +// Check if self is an unicodedata.UCD instance. +// If self is NULL (when the PyCapsule C API is used), return 0. +// PyModule_Check() is used to avoid having to retrieve the ucd_type. +// See unicodedata_functions comment to the rationale of this macro. 
+#define UCD_Check(self) (self != NULL && !PyModule_Check(self)) static PyObject* -new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), +new_previous_version(PyTypeObject *ucd_type, + const char*name, const change_record* (*getrecord)(Py_UCS4), Py_UCS4 (*normalization)(Py_UCS4)) { - PreviousDBVersion *self; - self = PyObject_New(PreviousDBVersion, &UCD_Type); - if (self == NULL) - return NULL; - self->name = name; - self->getrecord = getrecord; - self->normalization = normalization; - return (PyObject*)self; + PreviousDBVersion *self; + self = PyObject_New(PreviousDBVersion, ucd_type); + if (self == NULL) + return NULL; + self->name = name; + self->getrecord = getrecord; + self->normalization = normalization; + return (PyObject*)self; } @@ -129,7 +144,7 @@ unicodedata_UCD_decimal_impl(PyObject *self, int chr, long rc; Py_UCS4 c = (Py_UCS4)chr; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) { /* unassigned */ @@ -217,7 +232,7 @@ unicodedata_UCD_numeric_impl(PyObject *self, int chr, double rc; Py_UCS4 c = (Py_UCS4)chr; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) { /* unassigned */ @@ -262,7 +277,7 @@ unicodedata_UCD_category_impl(PyObject *self, int chr) int index; Py_UCS4 c = (Py_UCS4)chr; index = (int) _getrecord_ex(c)->category; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed != 0xFF) index = old->category_changed; @@ -289,7 +304,7 @@ unicodedata_UCD_bidirectional_impl(PyObject *self, int chr) int index; Py_UCS4 c = (Py_UCS4)chr; index = (int) _getrecord_ex(c)->bidirectional; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ @@ -318,7 +333,7 @@ unicodedata_UCD_combining_impl(PyObject *self, int chr) int index; Py_UCS4 c = (Py_UCS4)chr; index = (int) _getrecord_ex(c)->combining; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ @@ -346,7 +361,7 @@ unicodedata_UCD_mirrored_impl(PyObject *self, int chr) int index; Py_UCS4 c = (Py_UCS4)chr; index = (int) _getrecord_ex(c)->mirrored; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ @@ -373,7 +388,7 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr) int index; Py_UCS4 c = (Py_UCS4)chr; index = (int) _getrecord_ex(c)->east_asian_width; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ @@ -407,7 +422,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr) code = (int)c; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) return PyUnicode_FromString(""); /* unassigned */ @@ -450,12 +465,14 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr) } static void -get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) +get_decomp_record(PyObject *self, Py_UCS4 code, + int *index, int *prefix, int *count) { if (code >= 0x110000) { 
*index = 0; - } else if (self && UCD_Check(self) && - get_old_record(self, code)->category_changed==0) { + } + else if (UCD_Check(self) + && get_old_record(self, code)->category_changed==0) { /* unassigned in old version */ *index = 0; } @@ -490,7 +507,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) Py_UCS4 *output; Py_ssize_t i, o, osize; int kind; - void *data; + const void *data; /* Longest decomposition in Unicode 3.2: U+FDFA */ Py_UCS4 stack[20]; Py_ssize_t space, isize; @@ -552,7 +569,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) continue; } /* normalization changes */ - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); if (value != 0) { stack[stackptr++] = value; @@ -617,7 +634,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) } static int -find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code) +find_nfc_index(const struct reindex* nfc, Py_UCS4 code) { unsigned int index; for (index = 0; nfc[index].start; index++) { @@ -637,7 +654,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) { PyObject *result; int kind; - void *data; + const void *data; Py_UCS4 *output; Py_ssize_t i, i1, o, len; int f,l,index,index1,comb; @@ -682,15 +699,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) if (LBase <= code && code < (LBase+LCount) && i + 1 < len && VBase <= PyUnicode_READ(kind, data, i+1) && - PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) { + PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) { + /* check L character is a modern leading consonant (0x1100 ~ 0x1112) + and V character is a modern vowel (0x1161 ~ 0x1175). */ int LIndex, VIndex; LIndex = code - LBase; VIndex = PyUnicode_READ(kind, data, i+1) - VBase; code = SBase + (LIndex*VCount+VIndex)*TCount; i+=2; if (i < len && - TBase <= PyUnicode_READ(kind, data, i) && - PyUnicode_READ(kind, data, i) <= (TBase+TCount)) { + TBase < PyUnicode_READ(kind, data, i) && + PyUnicode_READ(kind, data, i) < (TBase+TCount)) { + /* check T character is a modern trailing consonant + (0x11A8 ~ 0x11C2). */ code += PyUnicode_READ(kind, data, i)-TBase; i++; } @@ -699,7 +720,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) } /* code is still input[i] here */ - f = find_nfc_index(self, nfc_first, code); + f = find_nfc_index(nfc_first, code); if (f == -1) { output[o++] = code; i++; @@ -722,7 +743,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) continue; } } - l = find_nfc_index(self, nfc_last, code1); + l = find_nfc_index(nfc_last, code1); /* i1 cannot be combined with i. If i1 is a starter, we don't need to look further. Otherwise, record the combining class. */ @@ -747,7 +768,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) assert(cskipped < 20); skipped[cskipped++] = i1; i1++; - f = find_nfc_index(self, nfc_first, output[o]); + f = find_nfc_index(nfc_first, output[o]); if (f == -1) break; } @@ -767,23 +788,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) return result; } -/* Return 1 if the input is certainly normalized, 0 if it might not be. */ -static int -is_normalized(PyObject *self, PyObject *input, int nfc, int k) +// This needs to match the logic in makeunicodedata.py +// which constructs the quickcheck data. +typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; + +/* Run the Unicode normalization "quickcheck" algorithm. + * + * Return YES or NO if quickcheck determines the input is certainly + * normalized or certainly not, and MAYBE if quickcheck is unable to + * tell. 
+ * + * If `yes_only` is true, then return MAYBE as soon as we determine + * the answer is not YES. + * + * For background and details on the algorithm, see UAX #15: + * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + */ +static QuickcheckResult +is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k, + bool yes_only) { + /* UCD 3.2.0 is requested, quickchecks must be disabled. */ + if (UCD_Check(self)) { + return NO; + } + Py_ssize_t i, len; int kind; - void *data; - unsigned char prev_combining = 0, quickcheck_mask; + const void *data; + unsigned char prev_combining = 0; - /* An older version of the database is requested, quickchecks must be - disabled. */ - if (self && UCD_Check(self)) - return 0; + /* The two quickcheck bits at this shift have type QuickcheckResult. */ + int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0); - /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, - as described in http://unicode.org/reports/tr15/#Annex8. */ - quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); + QuickcheckResult result = YES; /* certainly normalized, unless we find something */ i = 0; kind = PyUnicode_KIND(input); @@ -792,23 +830,106 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) while (i < len) { Py_UCS4 ch = PyUnicode_READ(kind, data, i++); const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); - unsigned char combining = record->combining; - unsigned char quickcheck = record->normalization_quick_check; - if (quickcheck & quickcheck_mask) - return 0; /* this string might need normalization */ + unsigned char combining = record->combining; if (combining && prev_combining > combining) - return 0; /* non-canonical sort order, not normalized */ + return NO; /* non-canonical sort order, not normalized */ prev_combining = combining; + + unsigned char quickcheck_whole = record->normalization_quick_check; + if (yes_only) { + if (quickcheck_whole & (3 << quickcheck_shift)) + return MAYBE; + } else { + switch ((quickcheck_whole >> quickcheck_shift) & 3) { + case NO: + return NO; + case MAYBE: + result = MAYBE; /* this string might need normalization */ + } + } } - return 1; /* certainly normalized */ + return result; } +/*[clinic input] +unicodedata.UCD.is_normalized + + self: self + form: unicode + unistr as input: unicode + / + +Return whether the Unicode string unistr is in the normal form 'form'. + +Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, + PyObject *input) +/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ +{ + if (PyUnicode_READY(input) == -1) { + return NULL; + } + + if (PyUnicode_GET_LENGTH(input) == 0) { + /* special case empty input strings. */ + Py_RETURN_TRUE; + } + + PyObject *result; + bool nfc = false; + bool k = false; + QuickcheckResult m; + + PyObject *cmp; + int match = 0; + + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { + nfc = true; + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { + nfc = true; + k = true; + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { + /* matches default values for `nfc` and `k` */ + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { + k = true; + } + else { + PyErr_SetString(PyExc_ValueError, "invalid normalization form"); + return NULL; + } + + m = is_normalized_quickcheck(self, input, nfc, k, false); + + if (m == MAYBE) { + cmp = (nfc ? 
nfc_nfkc : nfd_nfkd)(self, input, k); + if (cmp == NULL) { + return NULL; + } + match = PyUnicode_Compare(input, cmp); + Py_DECREF(cmp); + result = (match == 0) ? Py_True : Py_False; + } + else { + result = (m == YES) ? Py_True : Py_False; + } + + Py_INCREF(result); + return result; +} + + /*[clinic input] unicodedata.UCD.normalize self: self - form: str + form: unicode unistr as input: object(subclass_of='&PyUnicode_Type') / @@ -818,9 +939,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. [clinic start generated code]*/ static PyObject * -unicodedata_UCD_normalize_impl(PyObject *self, const char *form, +unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, PyObject *input) -/*[clinic end generated code: output=62d1f8870027efdc input=cd092e631cf11883]*/ +/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ { if (PyUnicode_READY(input) == -1) return NULL; @@ -832,29 +953,33 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form, return input; } - if (strcmp(form, "NFC") == 0) { - if (is_normalized(self, input, 1, 0)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { + if (is_normalized_quickcheck(self, input, + true, false, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } - if (strcmp(form, "NFKC") == 0) { - if (is_normalized(self, input, 1, 1)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { + if (is_normalized_quickcheck(self, input, + true, true, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } - if (strcmp(form, "NFD") == 0) { - if (is_normalized(self, input, 0, 0)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { + if (is_normalized_quickcheck(self, input, + false, false, true) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } - if (strcmp(form, "NFKD") == 0) { - if (is_normalized(self, input, 0, 1)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { + if (is_normalized_quickcheck(self, input, + false, true, true) == YES) { Py_INCREF(input); return input; } @@ -880,7 +1005,7 @@ _gethash(const char *s, int len, int scale) unsigned long h = 0; unsigned long ix; for (i = 0; i < len; i++) { - h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i])); + h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]); ix = h & 0xff000000; if (ix) h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; @@ -941,8 +1066,8 @@ is_unified_ideograph(Py_UCS4 code) (cp < named_sequences_end)) static int -_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, - int with_alias_and_seq) +_getucname(PyObject *self, + Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq) { /* Find the name associated with the given code point. 
* If with_alias_and_seq is 1, check for names in the Private Use Area 15 @@ -959,7 +1084,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code))) return 0; - if (self && UCD_Check(self)) { + if (UCD_Check(self)) { /* in 3.2.0 there are no aliases and named sequences */ const change_record *old; if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) @@ -1042,6 +1167,15 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, return 1; } +static int +capi_getucname(Py_UCS4 code, + char* buffer, int buflen, + int with_alias_and_seq) +{ + return _getucname(NULL, code, buffer, buflen, with_alias_and_seq); + +} + static int _cmpname(PyObject *self, int code, const char* name, int namelen) { @@ -1051,7 +1185,7 @@ _cmpname(PyObject *self, int code, const char* name, int namelen) if (!_getucname(self, code, buffer, NAME_MAXLEN, 1)) return 0; for (i = 0; i < namelen; i++) { - if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i]) + if (Py_TOUPPER(name[i]) != buffer[i]) return 0; } return buffer[namelen] == '\0'; @@ -1093,8 +1227,8 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) } static int -_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, - int with_named_seq) +_getcode(PyObject* self, + const char* name, int namelen, Py_UCS4* code, int with_named_seq) { /* Return the code point associated with the given name. * Named aliases are resolved too (unless self != NULL (i.e. we are using @@ -1155,8 +1289,9 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, v = code_hash[i]; if (!v) return 0; - if (_cmpname(self, v, name, namelen)) + if (_cmpname(self, v, name, namelen)) { return _check_alias_and_seq(v, code, with_named_seq); + } incr = (h ^ (h >> 3)) & mask; if (!incr) incr = mask; @@ -1165,21 +1300,30 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, v = code_hash[i]; if (!v) return 0; - if (_cmpname(self, v, name, namelen)) + if (_cmpname(self, v, name, namelen)) { return _check_alias_and_seq(v, code, with_named_seq); + } incr = incr << 1; if (incr > mask) incr = incr ^ code_poly; } } + +static int +capi_getcode(const char* name, int namelen, Py_UCS4* code, + int with_named_seq) +{ + return _getcode(NULL, name, namelen, code, with_named_seq); + +} /* -static const _PyUnicode_Name_CAPI hashAPI = +static const _PyUnicode_Name_CAPI unicodedata_capi = { - sizeof(_PyUnicode_Name_CAPI), - _getucname, - _getcode + .getname = capi_getucname, + .getcode = capi_getcode, }; */ + /* -------------------------------------------------------------------- */ /* Python bindings */ @@ -1258,8 +1402,10 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name, return PyUnicode_FromOrdinal(code); } -/* XXX Add doc strings. */ - +// List of functions used to define module functions *AND* unicodedata.UCD +// methods. For module functions, self is the module. For UCD methods, self +// is an UCD instance. The UCD_Check() macro is used to check if self is +// an UCD instance. 
static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_DECIMAL_METHODDEF UNICODEDATA_UCD_DIGIT_METHODDEF @@ -1272,54 +1418,32 @@ static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_DECOMPOSITION_METHODDEF UNICODEDATA_UCD_NAME_METHODDEF UNICODEDATA_UCD_LOOKUP_METHODDEF + UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF UNICODEDATA_UCD_NORMALIZE_METHODDEF {NULL, NULL} /* sentinel */ }; -static PyTypeObject UCD_Type = { - /* The ob_type field must be initialized in the module init function - * to be portable to Windows without using C++. */ - PyVarObject_HEAD_INIT(NULL, 0) - "unicodedata2.UCD", /*tp_name*/ - sizeof(PreviousDBVersion), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - /* methods */ - (destructor)PyObject_Del, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_reserved*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash*/ - 0, /*tp_call*/ - 0, /*tp_str*/ - PyObject_GenericGetAttr,/*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - 0, /*tp_doc*/ - 0, /*tp_traverse*/ - 0, /*tp_clear*/ - 0, /*tp_richcompare*/ - 0, /*tp_weaklistoffset*/ - 0, /*tp_iter*/ - 0, /*tp_iternext*/ - unicodedata_functions, /*tp_methods*/ - DB_members, /*tp_members*/ - 0, /*tp_getset*/ - 0, /*tp_base*/ - 0, /*tp_dict*/ - 0, /*tp_descr_get*/ - 0, /*tp_descr_set*/ - 0, /*tp_dictoffset*/ - 0, /*tp_init*/ - 0, /*tp_alloc*/ - 0, /*tp_new*/ - 0, /*tp_free*/ - 0, /*tp_is_gc*/ +static void +ucd_dealloc(PreviousDBVersion *self) +{ + PyTypeObject *tp = Py_TYPE(self); + PyObject_Del(self); + Py_DECREF(tp); +} + +static PyType_Slot ucd_type_slots[] = { + {Py_tp_dealloc, ucd_dealloc}, + {Py_tp_getattro, PyObject_GenericGetAttr}, + {Py_tp_methods, unicodedata_functions}, + {Py_tp_members, DB_members}, + {0, 0} +}; + +static PyType_Spec ucd_type_spec = { + .name = "unicodedata2.UCD", + .basicsize = sizeof(PreviousDBVersion), + .flags = Py_TPFLAGS_DEFAULT, + .slots = ucd_type_slots }; PyDoc_STRVAR(unicodedata_docstring, @@ -1331,45 +1455,72 @@ this database is based on the UnicodeData.txt file version\n\ The module uses the same names and symbols as defined by the\n\ UnicodeData File Format " UNIDATA_VERSION "."); -static struct PyModuleDef unicodedatamodule = { - PyModuleDef_HEAD_INIT, - "unicodedata2", - unicodedata_docstring, - -1, - unicodedata_functions, - NULL, - NULL, - NULL, - NULL -}; -PyMODINIT_FUNC -PyInit_unicodedata2(void) +static int +unicodedata_exec(PyObject *module) { - PyObject *m, *v; - - Py_TYPE(&UCD_Type) = &PyType_Type; + if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) { + return -1; + } - m = PyModule_Create(&unicodedatamodule); - if (!m) - return NULL; + PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec); + if (ucd_type == NULL) { + return -1; + } - PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); - Py_INCREF(&UCD_Type); - PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type); + if (PyModule_AddObject(module, "UCD", ucd_type) < 0) { + Py_DECREF(ucd_type); + return -1; + } /* Previous versions */ - v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); - if (v != NULL) - PyModule_AddObject(m, "ucd_3_2_0", v); + PyObject *v; + v = new_previous_version(ucd_type, "3.2.0", + get_change_3_2_0, normalization_3_2_0); + Py_DECREF(ucd_type); + if (v == NULL) { + return -1; + } + if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) { + Py_DECREF(v); + return -1; + } /* Export C API */ - // v = 
PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL); - // if (v != NULL) - // PyModule_AddObject(m, "ucnhash_CAPI", v); - return m; + // v = PyCapsule_New((void *)&unicodedata_capi, PyUnicodeData_CAPSULE_NAME, + // NULL); + // if (v == NULL) { + // return -1; + // } + // if (PyModule_AddObject(module, "ucnhash_CAPI", v) < 0) { + // Py_DECREF(v); + // return -1; + // } + return 0; } +static PyModuleDef_Slot unicodedata_slots[] = { + {Py_mod_exec, unicodedata_exec}, + {0, NULL} +}; + +static struct PyModuleDef unicodedata_module = { + PyModuleDef_HEAD_INIT, + .m_name = "unicodedata2", + .m_doc = unicodedata_docstring, + .m_size = 0, + .m_methods = unicodedata_functions, + .m_slots = unicodedata_slots, +}; + +PyMODINIT_FUNC +PyInit_unicodedata2(void) +{ + return PyModuleDef_Init(&unicodedata_module); +} + + + /* Local variables: c-basic-offset: 4 diff --git a/unicodedata2/py3/unicodedata.c.h b/unicodedata2/py3/unicodedata.c.h index 944908b..235ad8f 100644 --- a/unicodedata2/py3/unicodedata.c.h +++ b/unicodedata2/py3/unicodedata.c.h @@ -284,6 +284,38 @@ unicodedata_UCD_decomposition(PyObject *self, PyObject *arg) return return_value; } +PyDoc_STRVAR(unicodedata_UCD_is_normalized__doc__, +"is_normalized($self, form, unistr, /)\n" +"--\n" +"\n" +"Return whether the Unicode string unistr is in the normal form \'form\'.\n" +"\n" +"Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'."); + +#define UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF \ + {"is_normalized", (PyCFunction)unicodedata_UCD_is_normalized, METH_VARARGS, unicodedata_UCD_is_normalized__doc__}, + +static PyObject * +unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, + PyObject *input); + +static PyObject * +unicodedata_UCD_is_normalized(PyObject *self, PyObject *args) +{ + PyObject *return_value = NULL; + PyObject *form; + PyObject *input; + + if (!PyArg_ParseTuple(args, "UU:is_normalized", + &form, &input)) { + goto exit; + } + return_value = unicodedata_UCD_is_normalized_impl(self, form, input); + +exit: + return return_value; +} + PyDoc_STRVAR(unicodedata_UCD_normalize__doc__, "normalize($self, form, unistr, /)\n" "--\n" @@ -296,18 +328,18 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__, {"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_VARARGS, unicodedata_UCD_normalize__doc__}, static PyObject * -unicodedata_UCD_normalize_impl(PyObject *self, const char *form, +unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, PyObject *input); static PyObject * unicodedata_UCD_normalize(PyObject *self, PyObject *args) { PyObject *return_value = NULL; - const char *form; + PyObject *form; PyObject *input; - if (!PyArg_ParseTuple(args, "sO!:normalize", - &form, &PyUnicode_Type, &input)) { + if (!PyArg_ParseTuple(args, "UU:normalize", + &form, &input)) { goto exit; } return_value = unicodedata_UCD_normalize_impl(self, form, input); @@ -379,5 +411,5 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=5313ce129da87b2f input=a9049054013a1b77]*/ +/*[clinic end generated code: output=2c5fbf597c18f6b8 input=a9049054013a1b77]*/
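
Usage sketch (illustrative only, not part of the diff): the changes above port CPython's quickcheck-based is_normalized() to unicodedata2, tighten the Hangul composition bounds (issue 29456), and move the py3 extension to PEP 489 multi-phase initialization with a heap-allocated UCD type. Assuming Python 3 and a build of unicodedata2 that includes this patch, the new Python-level API behaves roughly as follows:

    import unicodedata2

    decomposed = "e\u0301"   # 'e' + U+0301 COMBINING ACUTE ACCENT
    composed = "\u00e9"      # U+00E9 LATIN SMALL LETTER E WITH ACUTE

    # is_normalized() can answer without rebuilding the string when the
    # quickcheck properties already give a definite yes/no.
    assert unicodedata2.is_normalized("NFC", composed)
    assert not unicodedata2.is_normalized("NFC", decomposed)
    assert unicodedata2.normalize("NFC", decomposed) == composed

    # The empty string is special-cased as normalized in every form.
    assert unicodedata2.is_normalized("NFKD", "")

    # Unknown forms raise ValueError, matching normalize().
    try:
        unicodedata2.is_normalized("NFX", "abc")
    except ValueError:
        pass

    # With the issue 29456 port, jamo outside the modern L/V/T ranges are
    # no longer composed, so this old-Hangul sequence round-trips unchanged.
    assert unicodedata2.normalize("NFC", "\u1100\u1176\u11a8") == "\u1100\u1176\u11a8"

When the quickcheck result is MAYBE, is_normalized() falls back to normalizing the input and comparing, so the quickcheck path is purely an optimization and never changes the answer.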