From 772eca6d0412fa3f5a3a39fe309bfb2e7b8754a0 Mon Sep 17 00:00:00 2001 From: chris0e3 Date: Thu, 16 Sep 2021 22:05:43 +0100 Subject: [PATCH 1/3] Resolves JuliaStrings/utf8proc#227 --- Makefile | 6 +++--- data/Makefile | 50 ++++++++++++++++++++++---------------------------- utf8proc.c | 51 +++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 62 insertions(+), 45 deletions(-) diff --git a/Makefile b/Makefile index ede0609..3fa9267 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ PERL=perl CFLAGS ?= -O2 PICFLAG = -fPIC C99FLAG = -std=c99 -WCFLAGS = -Wsign-conversion -Wall -Wextra -pedantic +WCFLAGS = -Wall -Wextra -pedantic UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES) LDFLAG_SHARED = -shared SOFLAG = -Wl,-soname @@ -70,7 +70,7 @@ manifest: MANIFEST.new # real targets -data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl +data/utf8proc_data.c.new: data_make.py $(MAKE) -C data utf8proc_data.c.new utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c @@ -166,7 +166,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h $(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@ test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h - $(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@ + $(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION=`sed -En 's/^#.+UNICODE_VERSION.(.+)/\1/p' utf8proc_data.c` test/misc.c test/tests.o utf8proc.o -o $@ check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o $(MAKE) -C bench diff --git a/data/Makefile b/data/Makefile index 6d3667c..5573d04 100644 
--- a/data/Makefile
+++ b/data/Makefile
@@ -1,63 +1,57 @@
 # Unicode data generation rules. Except for the test data files, most
 # users will not use these Makefile rules, which are primarily to re-generate
-# unicode_data.c when we get a new Unicode version or charwidth data; they
-# require ruby and julia to be installed.
+# unicode_data.c when we get a new Unicode version or charwidth data.
+# Requires python 3.7+, curl & sed to be installed (curl is looked up on PATH; override with `make CURL=...`).
 
-# programs
-CURL=curl
-RUBY=ruby
-PERL=perl
-MAKE=make
-JULIA=julia
-CURLFLAGS = --retry 5 --location
+CURL = curl --retry 5 --location
 
 .PHONY: clean
 
 .DELETE_ON_ERROR:
 
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
-	$(RUBY) data_generator.rb < UnicodeData.txt > $@
-
-CharWidths.txt: charwidths.jl EastAsianWidth.txt
-	$(JULIA) charwidths.jl > $@
+utf8proc_data.c.new: ../data_make.py UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt \
+		CompositionExclusions.txt CaseFolding.txt emoji-data.txt EastAsianWidth.txt
+	../data_make.py --format=1 --fix26 --output $@ .
 
# Unicode data version (must also update utf8proc_unicode_version function) -UNICODE_VERSION=13.0.0 +UNICODE_VERSION?=14.0.0 +URL_ROOT = $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd UnicodeData.txt: - $(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt + $(CURL) -o $@ $(URL_ROOT)/UnicodeData.txt EastAsianWidth.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/EastAsianWidth.txt + $(CURL) -o $@ $(URL_ROOT)/EastAsianWidth.txt GraphemeBreakProperty.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakProperty.txt + $(CURL) -o $@ $(URL_ROOT)/auxiliary/GraphemeBreakProperty.txt DerivedCoreProperties.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/DerivedCoreProperties.txt + $(CURL) -o $@ $(URL_ROOT)/DerivedCoreProperties.txt CompositionExclusions.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CompositionExclusions.txt + $(CURL) -o $@ $(URL_ROOT)/CompositionExclusions.txt CaseFolding.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CaseFolding.txt + $(CURL) -o $@ $(URL_ROOT)/CaseFolding.txt NormalizationTest.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/NormalizationTest.txt + $(CURL) -o $@ $(URL_ROOT)/NormalizationTest.txt GraphemeBreakTest.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt + $(CURL) -o $@ $(URL_ROOT)/auxiliary/GraphemeBreakTest.txt emoji-data.txt: - $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt + $(CURL) -o $@ $(URL_ROOT)/emoji/emoji-data.txt Uppercase.txt: DerivedCoreProperties.txt - $(RUBY) -e 'puts 
File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@ + sed -En '/^# Derived Property: Uppercase/,/^# Total code points:/p' DerivedCoreProperties.txt > $@ Lowercase.txt: DerivedCoreProperties.txt - $(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@ + sed -En '/^# Derived Property: Lowercase/,/^# Total code points:/p' DerivedCoreProperties.txt > $@ clean: - rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt - rm -f Uppercase.txt Lowercase.txt + rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt \ + CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt \ + emoji-data.txt Uppercase.txt Lowercase.txt rm -f utf8proc_data.c.new diff --git a/utf8proc.c b/utf8proc.c index 225738c..784cdca 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -51,6 +51,9 @@ #endif #include "utf8proc_data.c" +#ifndef U8CASEMAP +#define utf8proc_casemap utf8proc_sequences +#endif UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { @@ -101,7 +104,11 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { } UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) { +#ifdef UNICODE_VERSION + return UNICODE_VERSION; +#else return "13.0.0"; +#endif } UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { @@ -125,7 +132,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst ) { - utf8proc_int32_t uc; + utf8proc_uint32_t uc; const utf8proc_uint8_t *end; *dst = -1; @@ -137,7 +144,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( return 1; } // Must be between 0xc2 
and 0xf4 inclusive to be valid - if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; + if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; if (uc < 0xe0) { // 2-byte sequence // Must have valid continuation character if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; @@ -232,9 +239,10 @@ static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint /* internal "unsafe" version that does not check whether uc is in range */ static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) { /* ASSERT: uc >= 0 && uc < 0x110000 */ + const int stage1shift = 16 - sizeof(utf8proc_stage1table[0]) * 8; return utf8proc_properties + ( utf8proc_stage2table[ - utf8proc_stage1table[uc >> 8] + (uc & 0xFF) + (utf8proc_stage1table[uc >> 8] << stage1shift) + (uc & 0xFF) ] ); } @@ -350,14 +358,15 @@ static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex) { - const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex]; + const utf8proc_uint16_t *entry = &utf8proc_casemap[seqindex]; return seqindex_decode_entry(&entry); } -static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { +static utf8proc_ssize_t +write_char_decomposed(const utf8proc_uint16_t *entry, int len, utf8proc_int32_t *dst, + utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) +{ utf8proc_ssize_t written = 0; - const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF]; - int len = seqindex >> 13; if (len >= 7) { len = *entry; entry++; @@ -373,22 +382,36 @@ static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqinde return written; } +static inline utf8proc_ssize_t +write_char_decomposed_seq(utf8proc_uint16_t idx, utf8proc_int32_t *dst, 
utf8proc_ssize_t dstsize, + utf8proc_option_t options, int *last_boundclass) +{ + return write_char_decomposed(&utf8proc_sequences[idx & 0x1FFF], idx >> 13, dst, dstsize, options, last_boundclass); +} + +static inline utf8proc_ssize_t +write_char_decomposed_case(utf8proc_uint16_t idx, utf8proc_int32_t *dst, utf8proc_ssize_t dstsize, + utf8proc_option_t options, int* last_boundclass) +{ + return write_char_decomposed(&utf8proc_casemap[idx & 0x1FFF], idx >> 13, dst, dstsize, options, last_boundclass); +} + UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) { utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; - return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c; + return cl != UINT16_MAX ? seqindex_decode_index(cl) : c; } UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) { utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; - return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; + return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; } UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) { utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; - return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; + return cu != UINT16_MAX ? 
seqindex_decode_index(cu) : c; } UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c) @@ -420,7 +443,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { #define utf8proc_decompose_lump(replacement_uc) \ return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ - options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) + options & ~UTF8PROC_LUMP, last_boundclass) UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { const utf8proc_property_t *property; @@ -487,13 +510,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, } if (options & UTF8PROC_CASEFOLD) { if (property->casefold_seqindex != UINT16_MAX) { - return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass); + return write_char_decomposed_case(property->casefold_seqindex, dst, bufsize, options, last_boundclass); } } if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { if (property->decomp_seqindex != UINT16_MAX && (!property->decomp_type || (options & UTF8PROC_COMPAT))) { - return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass); + return write_char_decomposed_seq(property->decomp_seqindex, dst, bufsize, options, last_boundclass); } } if (options & UTF8PROC_CHARBOUND) { @@ -735,7 +758,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( *dstptr = NULL; result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); if (result < 0) return result; - buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1); + buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); if (!buffer) return UTF8PROC_ERROR_NOMEM; result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); if (result < 
0) { From a9bee2f056aebc2fea3e26e50c7b54ad8ca66e21 Mon Sep 17 00:00:00 2001 From: chris0e3 Date: Thu, 16 Sep 2021 22:11:56 +0100 Subject: [PATCH 2/3] Missing file. Resolves JuliaStrings/utf8proc#227 --- data_make.py | 588 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 588 insertions(+) create mode 100755 data_make.py diff --git a/data_make.py b/data_make.py new file mode 100755 index 0000000..4234f88 --- /dev/null +++ b/data_make.py @@ -0,0 +1,588 @@ +#!/usr/bin/python3 +# Generate 'utf8proc_data.c' by parsing the Unicode data files 'UnicodeData.txt' etc. +# from the Unicode Character Database. Tested with UCD 13.0.0 & 14.0.0-dev. +# Usage: data_make.py [options…] [] + +import re, os, sys, fileinput, copy, datetime, getopt, platform +from collections import defaultdict + +Me = os.path.basename(sys.argv[0]) +Vers = '2.7.0dev' # ??? +DDir = "./data" +Targ = "utf8proc_data.out.c" +UVers = None +gVerb = 0 +gFmt = 0 +gCMap = False +gS1Byt = False +gFix26 = False + + +def Print (s): + print(s, file=sys.stderr) + +def Error (msg): + Print("%s: ERROR: %s." 
% (Me, msg))
+
+def TextFile (name):  # Path of the UCD file `name`.txt inside the data directory DDir.
+    return DDir+ "/" +name+ ".txt"
+
+def Sed (expr, file):  # Line-buffered read pipe over `sed -En expr` run on a UCD file; sed is found via PATH.
+    return os.popen("sed -En '" +expr+ "' '" +TextFile(file)+ "'", 'r', 1)
+
+def Cat (file):  # Line iterator over a UCD text file.
+    return fileinput.input(files=TextFile(file), mode='r')
+
+def Hex (s):  # Parse a hexadecimal string.
+    return int(s, 16)
+
+def Hex0 (s):  # Like Hex, but an empty/None field yields -1.
+    return int(s, 16) if s else -1
+
+def HexArray (hexs):  # Space-separated hex fields -> list of ints (empty fields are skipped).
+    v = []
+    for h in hexs.split(' '):
+        if h: v.append(int(h, 16))
+    return v
+
+
+try:  # Parse the command line into the module-level settings.
+    opts, args = getopt.getopt(sys.argv[1:], "vxcf:o:",
+            ["verbose", "fix26", "cmap", "format=", "output="])
+    omap = {'verbose':'v', 'fix26':'x', 'cmap':'c', 'format':'f', 'output':'o'}
+    for o, a in opts:
+        o = o.lstrip('-')
+        o = omap.get(o, o)  # fold long option names onto their short letter
+        if o == 'v': gVerb += 1
+        elif o == 'x': gFix26 = True
+        elif o == 'c': gCMap = True
+        # elif o == 'b': gS1Byt = True
+        elif o == 'f': gFmt = min(max(0, int(a)), 2)
+        elif o == 'o': Targ = a
+
+    if len(args): DDir = args[0]
+    UVers = (' ' + Sed(r"1s/.+-([0-9.]+)\..+/\1/p;q",
+            "DerivedCoreProperties").readline()).strip()  # version taken from the file's first line
+    if gFmt: gFix26 = True
+    if gFmt or tuple(int(p) for p in UVers.split('.') if p.isdecimal()) >= (14,): gCMap = True  # numeric compare: as strings '9.0.0' >= '14.0.0'
+    gS1Byt = gFmt > 0
+except (getopt.GetoptError, ValueError) as err:  # ValueError: non-numeric --format argument
+    Error(err)
+    sys.exit(1)
+
+if gVerb: print("# Settings: data-format: %d fix-2.6.1: %d has-casemap: %d UCD-version: %s"
+        % (gFmt, gFix26, gCMap, UVers))
+
+#--------------------------------------------------------------------------------------------------
+
+def ParseDCProps (name, start):  # Set of code points listed under "# Derived Property: <start>"; `name` is unused.
+    v = set()
+    r1 = re.compile(r'^[0-9A-F]+')
+    r2 = re.compile(r'^([0-9A-F]{4,6})\.\.([0-9A-F]+)')
+    for L in Sed("/^# Derived Property: " +start+ "/,/^# Total code points:/p", "DerivedCoreProperties"):
+        m = re.match(r2, L)
+        if m:  # range line "XXXX..YYYY": add every code point in the inclusive range
+            for i in range(Hex(m[1]), Hex(m[2]) + 1): v.add(i)
+        else:
+            m = re.match(r1, L)
+            if m: v.add(Hex(m[0]))
+    return v
+
+
+def Ignorable ():
+    return ParseDCProps("Ignorable", "Default_Ignorable_Code_Point")
+
+def Uppercase ():
+    return ParseDCProps("Uppercase", "Uppercase")
+
+def Lowercase ():
+    return
ParseDCProps("Lowercase", "Lowercase") + + +def GraphemeBounds (): + v = defaultdict(lambda: "Other") + r1 = re.compile(r'^([0-9A-F]+)\s*;\s*([A-Za-z_]+)') + r2 = re.compile(r'^([0-9A-F]{4,6})\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)') + for L in Cat("GraphemeBreakProperty"): + m = re.match(r2, L) + if m: + C = m[3] + for n in range(Hex(m[1]), Hex(m[2]) + 1): v[n] = C + else: + m = re.match(r1, L) + if m: + C = m[2] + v[Hex(m[1])] = C + + r1 = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W', re.I) + r2 = re.compile(r'^([0-9A-F]+)\s*;\s*Extended_Pictographic\W', re.I) + r3 = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W', re.I) + r4 = re.compile(r'^([0-9A-F]+)\s*;\s*Emoji_Modifier\W', re.I) + extPict = 'EXTENDED_PICTOGRAPHIC'; extend = 'EXTEND' + for L in Cat("emoji-data"): + m = re.match(r1, L) + if m: + for e in range(Hex(m[1]), Hex(m[2]) + 1): v[e] = extPict + else: + m = re.match(r2, L) + if m: + v[Hex(m[1])] = extPict + else: + m = re.match(r3, L) + if m: + for e in range(Hex(m[1]), Hex(m[2]) + 1): v[e] = extend + else: + m = re.match(r4, L) + if m: v[Hex(m[1])] = extend + return v + + +def CharWidths (): + cws = defaultdict(lambda: 1) + + # Use a default width of 1 for all character categories that are letter/symbol/number-like, + # as well as for unassigned/private-use chars. This can be overridden by UAX 11 below, + # but provides a useful nonzero fallback for new codepoints when a new Unicode version + # has been released but Unifont hasn't been updated yet. 
+ + # Categories that may contain zero-width chars + zerowidth = set(('Mn','Mc','Me','Zl','Zp','Cc','Cf','Cs')) # +'Sk' - see issue #167 + for c in uchars: + if c.category in zerowidth: + cws[c.code] = 0 + + # Widths from UAX #11: East Asian Width + # These take precedence for all codepoints listed explicitly as wide/full/narrow/half-width + rx = re.compile(r'^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?;([AFHNW]a?)(\s*#\s*Cn\W)?') + for L in Cat("EastAsianWidth"): + m = re.match(rx, L) + if m: + width = m[4] + cstart = Hex(m[1]) + cend = Hex(m[3]) if m.end(3) > 0 else cstart + for c in range(cstart, cend + 1): # Assign widths + if not m[5]: # skip any with a `# Cn …` comment + if width == 'W' or width == 'F': # wide or full + cws[c] = 2 + elif width == 'Na' or width == 'H': + cws[c] = 1 + + # A few exceptions to the above cases, found by manual comparison + # to other wcwidth functions and similar checks. + for ch in uchars: + c = ch.code + cat = ch.category + + # Ensure format control chars (cat Cf) have width 0 (some of these, like U+0601, + # can have a width in some cases but normally act like prepended combining marks. + # U+FFF9 etc. are also odd, but have zero width in typical terminal contexts) + if cat == 'Cf': cws[c] = 0 + + # Unifont has nonzero width for a number of non-spacing combining characters, + # e.g. (in 7.0.06): F84,17B4,17B5,180B,180D,2D7F and the variation selectors + elif cat == 'Mn': cws[c] = 0 + + # We also assign width of one to unassigned and private-use codepoints + # (Unifont includes ConScript Unicode Registry PUA fonts, but since these + # are nonstandard it seems questionable to use Unifont metrics; + # if they are printed as the replacement char U+FFFD they will have width 1). + elif cat == 'Co' or cat == 'Cn': cws[c] = 1 + + # For some reason, Unifont has width-2 glyphs for ASCII control chars + elif cat == 'Cc': cws[c] = 0 + + # Soft hyphen is typically printed as a hyphen (-) in terminals. 
+    cws[0x00AD] = 1
+
+    # By definition, should have zero width (on the same line)
+    cws[0x2028] = 0 # category: Zl, name: LINE SEPARATOR
+    cws[0x2029] = 0 # category: Zp, name: PARAGRAPH SEPARATOR
+    return cws
+
+
+def CompExclusions (start, name):  # Code points in the CompositionExclusions section headed "# <start>"; `name` is unused.
+    v = set()
+    rx = re.compile(r'^[0-9A-F]+')
+    for L in Sed("/^# " +start+ "/,/^# Total code points:/p", "CompositionExclusions"):
+        m = re.match(rx, L)
+        if m: v.add(Hex(m[0]))
+    return v
+
+
+def Exclusions ():
+    v = CompExclusions(r"\(1\) Script Specifics", "Exclusions")  # raw string: "\(" is an invalid escape in a plain literal
+    # data_generator.rb erroneously adds `0` in lines 136 & 139 (for each comment line)
+    if not gFix26: v.add(0) #•••
+    return v
+
+
+def Precomposed ():
+    return CompExclusions(r"\(2\) Post Composition Version precomposed characters", "Precomposed")
+
+
+def CaseFolding ():  # Map code point -> list of folded code points, from the C and F status lines.
+    v = {}
+    rx = re.compile(r'^([0-9A-F]+); [CF]; ([0-9A-F ]+);')
+    for L in Cat("CaseFolding"):
+        m = re.match(rx, L)
+        if m: v[Hex(m[1])] = HexArray(m[2])
+    return v
+
+
+udRE = re.compile(r'^'
+    r'([0-9A-F]+);'  # 1: code
+    r'([^;]+);'  # 2: name
+    r'([A-Za-z]+);'  # 3: general category
+    r'([0-9]+);'  # 4: canonical combining class
+    r'([A-Z]+);'  # 5: bidi class
+    r'(<([A-Za-z]*)>)?'
# 7: decomposition type + r'((\ ?[0-9A-F]+)*);' # 8: decompomposition mapping + r'([0-9]*);' # 10: decimal digit + r'([0-9]*);' # 11: digit + r'([^;]*);' # 12: numeric + r'([YN]*);' # 13: bidi mirrored + r'([^;]*);' # 14: unicode 1.0 name + r'([^;]*);' # 15: iso comment + r'([0-9A-F]*);' # 16: simple uppercase mapping + r'([0-9A-F]*);' # 17: simple lowercase mapping + r'([0-9A-F]*)$') # 18: simple titlecase mapping +uchar_hash = {} +pSequences = [] +seqs_hash = {} +pCaseMap = [] +cmap_hash = {} + +def push_seq (seq): + key = str(seq) + if key not in seqs_hash: + idx = len(pSequences) + seqs_hash[key] = idx + pSequences.extend(seq) + return idx + return seqs_hash[key] + +def push_cas (seq): + if not gCMap: return push_seq(seq) + key = str(seq) + if key not in cmap_hash: + idx = len(pCaseMap) + cmap_hash[key] = idx + pCaseMap.extend(seq) + return idx + return cmap_hash[key] + +def to_u16 (seq): + v = [] + for cp in seq: + if cp <= 0xFFFF: + if (cp >> 11) == 0x1B: Error("UTF-16 code: U+%06X" % cp) + v.append(cp) + else: + v += [0xD800 | ((cp - 0x10000) >> 10), 0xDC00 | (cp & 0x03FF)] + return v + +def dm_index (seq): # decomp_map sequence + if not seq or len(seq) == 0: return g_ + lencode = len(seq) - 1 # no sequence has len 0, so we encode len 1 as 0, len 2 as 1, … + seq = to_u16(seq) + if lencode >= 7: # we have 3 bits for length (which is cutting it close. 
+ seq.insert(0, lencode) # May need to change it to 2 bits in future Unicode versions) + lencode = 7 + idx = push_seq(seq) + if idx > 0x1FFF: Error("decomp_map: pSequences[%d] out of bounds" % idx) + return idx | (lencode << 13) + +def cf_index (seq): # case_fold sequence + if not gCMap: return dm_index(seq) + if not seq or len(seq) == 0: return g_ + lencode = len(seq) - 1 # no sequence has len 0, so we encode len 1 as 0, len 2 as 1, … + seq = to_u16(seq) + if lencode >= 7: # we have 3 bits for length + seq.insert(0, lencode) + lencode = 7 + idx = push_cas(seq) + if idx > 0x1FFF: Error("case_fold: pCaseMap[%d] out of bounds" % idx) + return idx | (lencode << 13) + +def case_map (cp): + if not cp: return g_ + if cp > 0xFFFF: + idx = push_cas([0xD800 | ((cp - 0x10000) >> 10), 0xDC00 | (cp & 0x03FF)]) + else: + if (cp >> 11) == 0x1B: Error("UTF-16 code: U+%06X" % cp) + idx = push_cas([cp]) + if idx >= 0xFFFF: Error("case_map: pCaseMap[%d] out of bounds" % idx); + return idx + +def S (s): return s if s != None else g_ + +def F (f): return gT if f else gF + +def D0 (s): return 'UTF8PROC_DECOMP_TYPE_' + s.upper() if s else '0' +def D1 (s): return s.upper() if s else '0' + +g_ = 'UINT16_MAX'; gT = 'true'; gF = 'false'; D = D0 +gPF = " {{UTF8PROC_CATEGORY_{}, {}, UTF8PROC_BIDI_CLASS_{}, {}, {}," \ + " {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, 0, UTF8PROC_BOUNDCLASS_{}}}," +if gFmt: g_ = '_'; gT = 'T'; gF = 'F'; D = D1; \ + gPF = "\tP({},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{})" +ZZCC = set(('Zl', 'Zp', 'Cc', 'Cf')) + +class UChar: + def __init__ (self, line): + m = re.match(udRE, line) + if not m: Error("Could not parse input ‘%s’" % line) + uc = Hex(m[1]) + self.code = uc + self.name = m[2] + self.category = m[3] + self.comb_class = int(m[4]) + self.bidi_class = m[5] + self.decomp_type = m[7] + self.decomp_map = HexArray(m[8]) if m[8] else None + self.decomp_idx = g_ + self.bidi_mirror = (m[13] == 'Y') + # issue #130: use nonstandard uppercase ß -> ẞ + # issue #195: if 
character is uppercase but has no lowercase mapping, + # then make lowercase mapping = itself (vice versa for lowercase) + U = Hex0(m[16]); L = Hex0(m[17]); T = Hex0(m[18]) + self.uppercase = (0x1E9E if uc==0x00DF else (uc if L<0 and uc in lowercase \ + else None)) if U<0 else U + self.lowercase = (uc if U<0 and uc in uppercase else None) if L<0 else L + self.titlecase = (0x1E9E if uc==0x00DF else None) if T<0 else T + self.case_fold = caseFolding.get(uc) + self.centry_idx = None + + def ctrl_bound (self): + return F((self.category in ZZCC) and (not gFix26 or (self.code != 0x200C and + self.code != 0x200D))) #••• + + def c_entry (self): + uc = self.code + com_idx = combIndexes.get(uc, g_) + comp_ex = F(uc in exclusions or uc in exclVersion) + ignore = F(uc in ignorable) + gbounds = grfmBounds[uc].upper() + self.uppercase = case_map(self.uppercase) + self.lowercase = case_map(self.lowercase) + self.titlecase = case_map(self.titlecase) + return gPF.format( + self.category.upper(), self.comb_class, self.bidi_class, + D(self.decomp_type), S(self.decomp_idx), S(self.case_fold), + self.uppercase, self.lowercase, self.titlecase, com_idx, F(self.bidi_mirror), + comp_ex, ignore, self.ctrl_bound(), charWidths[uc], gbounds) + + +def UnicodeData (): + uchars = [] + r1 = re.compile(r'^([0-9A-F]+);<[^;>,]+, First>;') + r2 = re.compile(r'^([0-9A-F]+);<([^;>,]+), Last>;') + state = True + for L in Cat("UnicodeData"): + if state: + m = re.match(r1, L) + if m: + first = Hex(m[1]) + state = False + else: + ch = UChar(L) + uchar_hash[ch.code] = ch + uchars.append(ch) + else: + m = re.match(r2, L) + if not m: Error("No last character of sequence U+%04X … found" % first) + name = '<' + m[2] + '>' + ch = UChar(L) + state = True + for i in range(first, Hex(m[1]) + 1): + ch_clone = copy.copy(ch) + ch_clone.code = i + ch_clone.name = name + uchar_hash[i] = ch_clone + uchars.append(ch_clone) + if not state: Error("No last character of sequence U+%04X … found" % first) + return uchars + + 
+def Sequences (): + comb1st = {} + comb1st_keys = [] # Ordered list of comb1st’ keys. + comb2nd = {} + comb2nd_keys = [] + comb2nd_long = set() + comb_array = {} + for ch in uchars: + dm = ch.decomp_map + if not ch.decomp_type and dm and len(dm) == 2 and dm[0] in uchar_hash and \ + uchar_hash[dm[0]].comb_class == 0 and ch.code not in exclusions: + dm0 = dm[0] + dm1 = dm[1] + if dm0 not in comb1st or comb1st[dm0] == None: + comb1st[dm0] = c1i_dm0 = len(comb1st) + comb1st_keys.append(dm0) + else: + c1i_dm0 = comb1st[dm0] + if dm1 not in comb2nd or comb2nd[dm1] == None: + comb2nd_keys.append(dm1) + comb2nd[dm1] = len(comb2nd) + if not comb_array.get(c1i_dm0): + comb_array[c1i_dm0] = {} + if comb2nd[dm1] in comb_array[c1i_dm0]: + Error("Duplicate canonical mapping: U+%05X %d/%d" % (ch.code, dm0, dm1)) + comb_array[c1i_dm0][comb2nd[dm1]] = ch.code + if ch.code > 0xFFFF: comb2nd_long.add(dm1) + ch.decomp_idx = dm_index(dm) + ch.case_fold = cf_index(ch.case_fold) + + comb_idxs = {} + comb1st_offsets = {} # (first, last) tuples + cur_pos = 0 + for dm0 in comb1st_keys: # Force comb1st build order + index = comb1st[dm0] + comb_i = comb_array[index] + first = None + last = None + offset = 0; b = -1 + for dm1 in comb2nd_keys: + b += 1 + if b in comb_i and comb_i[b] != None: + if first == None: first = offset + last = offset + if dm1 in comb2nd_long: last += 1 + offset += 1 + if dm1 in comb2nd_long: offset += 1 + comb1st_offsets[index] = (first, last) + if dm0 in comb_idxs: Error("double index at %d" % dm0) + comb_idxs[dm0] = cur_pos + cur_pos += last - first + 1 + 2 + + offset = 0 + for dm1 in comb2nd_keys: + if dm1 in comb_idxs: Error("double index at %d" % dm1) + comb_idxs[dm1] = 0x8000 | (comb2nd[dm1] + offset) + if comb2nd[dm1] + offset > 0x4000: Error("too large comb index at %d" % dm1) + if dm1 in comb2nd_long: + comb_idxs[dm1] = comb_idxs[dm1] | 0x4000 + offset += 1 + + class SStr: # A string stream + def __init__ (self): self.s = " " + def __lshift__ (self, x): self.s 
+= x; return self + def i (self, x): self.s += "%d, " % x; return self + class SVec: # A vector stream + def __init__ (self): self.s = [] + def __lshift__ (self, x): return self + def i (self, x): self.s.append(x); return self + + # Create string with original line breaking or array of ints + i = 0; s = SVec() if gFmt else SStr() + for a in range(len(comb1st)): + o1 = comb1st_offsets[a] + s.i(o1[0]).i(o1[1]) + offset = 0; b = -1 + for dm1 in comb2nd_keys: + b += 1 + if offset > o1[1]: break + if offset >= o1[0]: + i += 1 + if i == 8: i = 0; s << "\n " + v = comb_array[a][b] if b in comb_array[a] else 0 + if dm1 in comb2nd_long: s.i((v & 0xFFFF0000) >> 16) + s.i(v & 0xFFFF) + offset += 2 if dm1 in comb2nd_long else 1 + s << "\n" + return s.s, comb_idxs + + +def WriteIntArray (array, name, file, bytes = False): + print("static const utf8proc_uint%d_t utf8proc_%s[] = {" + % ((8 if bytes else 16), name), file=file) + if isinstance(array, str): + print(array, end='', file=file) + elif gFmt: + i = 0; pre = "\t" + for e in array: + if i and i % 16 == 0: pre = ",\n\t" + i += 1 + print("%s%d" % (pre, e), end='', file=file) + pre = "," + print("", file=file) + else: + i = 0; print(" ", end='', file=file) + for e in array: + i += 1 + if i == 8: print("\n ", end='', file=file); i = 0 + print("%d, " % e, end='', file=file) + print("};\n", file=file) + + +def WriteData (stage1, stage2, props, combinations): + f = open(Targ, mode='w') + if gFmt: print("// Generated by %s v%s from UnicodeData version %s on %s.\n" + "// Options: --format=%d%s%s\n\n" + "#define UNICODE_VERSION \"%s\"" + % (Me, Vers, UVers, datetime.datetime.utcnow().strftime("%F %T"), gFmt, + (" --fix26" if gFix26 else ""), (" --cmap" if gCMap else ""), UVers), file=f) + WriteIntArray(pSequences, "sequences", f) + if gCMap: + print("#define U8CASEMAP", file=f) + WriteIntArray(pCaseMap, "casemap", f) + WriteIntArray(stage1, "stage1table", f, gS1Byt) + WriteIntArray(stage2, "stage2table", f) + if gFmt: + 
print("#define P(C,c,B,D,ds,cs,us,ls,ts,x,m,e,i,cb,w,b)\t"
+            "{UTF8PROC_CATEGORY_##C,c,UTF8PROC_BIDI_CLASS_##B,\\\n\t\t"
+            "UTF8PROC_DECOMP_TYPE_##D,ds,cs,us,ls,ts,x,m,e,i,cb,w,0,UTF8PROC_BOUNDCLASS_##b},\n"
+            "enum { F = false, T = true, UTF8PROC_BIDI_CLASS_0 = 0,"
+            " UTF8PROC_DECOMP_TYPE_0 = 0, _ = UINT16_MAX };\n\n"
+            "static const utf8proc_property_t utf8proc_properties[] = {\n"
+            " P(CN,0,0,0,_,_,_,_,_,_,F,F,F,F,1,OTHER)", file=f)
+    else:
+        u = "UINT16_MAX, "; b = "false,"
+        print("static const utf8proc_property_t utf8proc_properties[] = {\n"
+            " {0, 0, 0, 0, "+u+u+u+u+u+u+' '+b+b+b+b+" 1, 0, UTF8PROC_BOUNDCLASS_OTHER},", file=f)
+    for c in props:
+        print(c, file=f)
+    print("};\n", file=f)
+    WriteIntArray(combinations, "combinations", f)
+    if gFmt: print("// End.", file=f)
+    if gVerb: print("# Wrote ‘%s’." % Targ)
+
+
+ignorable = Ignorable()
+uppercase = Uppercase()
+lowercase = Lowercase()
+grfmBounds = GraphemeBounds() # Grapheme bounds classes
+exclusions = Exclusions()
+exclVersion = Precomposed()
+caseFolding = CaseFolding()
+uchars = UnicodeData()
+charWidths = CharWidths()
+(pCombinations, combIndexes) = Sequences()
+
+prop_idxs = {}
+pProperties = []
+for ch in uchars:  # intern each property row: identical rows share one pProperties slot
+    centry = ch.c_entry()
+    ch.centry_idx = prop_idxs.get(centry)
+    if ch.centry_idx is None:  # `not idx` would treat the valid index 0 as missing and append row 0 twice
+        prop_idxs[centry] = ch.centry_idx = len(pProperties)
+        pProperties.append(centry)
+
+pStage1 = [] # stage1table
+pStage2 = [] # stage2table
+chunks2 = []
+scale = 1 if gS1Byt else 0x100  # stage 1 holds chunk numbers (byte mode) or chunk offsets into stage 2
+for page in range(0, 0x110000, 0x100):
+    stage2_entry = []
+    for code in range(page, page + 0x100):
+        stage2_entry.append(uchar_hash[code].centry_idx + 1 if code in uchar_hash else 0)  # 0 = default row written first by WriteData
+    if stage2_entry in chunks2:  # reuse an identical 256-entry chunk
+        pStage1.append(chunks2.index(stage2_entry) * scale)
+    else:
+        pStage1.append(len(chunks2) * scale)
+        pStage2.extend(stage2_entry)
+        chunks2.append(stage2_entry)
+
+WriteData(pStage1, pStage2, pProperties, pCombinations)
+
From dbc2b185a9d94f52fe1c70f6f3c83c342ec48dd3 Mon Sep 17 00:00:00 2001
From:
chris0e3 Date: Fri, 17 Sep 2021 01:12:34 +0100 Subject: [PATCH 3/3] =?UTF-8?q?Makefile=20-=20(WCFLAGS):=20Restored=20`-Ws?= =?UTF-8?q?ign-conversion`=20from=20610730f2314f4cdb52c64e2ef78a9d5d69402b?= =?UTF-8?q?66.=20=09(UNICODE=5FVERSION):=20Added=20`14.0.0`=20default.=20?= =?UTF-8?q?=09=09[Note:=20It=20may=20be=20possible=20to=20remove=20the=20s?= =?UTF-8?q?imilar=20statement=20in=20data/Makefile=20but=20it=20may=20requ?= =?UTF-8?q?ire=20updating=20all=20the=20calls=20to=20`$(MAKE)=20-C=20data?= =?UTF-8?q?=20=E2=80=A6`=20in=20this=20file.]=20=09(test/misc):=20Use=20ab?= =?UTF-8?q?ove=20UNICODE=5FVERSION=20var=20so=20that=20this=20target=20als?= =?UTF-8?q?o=20builds=20with=20original=20utf8proc=5Fdata.c=20file.=20=09?= =?UTF-8?q?=09[Note:=20This=20test=20in=20test/misc=20is=20now=20redundant?= =?UTF-8?q?=20when=20using=20the=20new=20utf8proc=5Fdata.c=20files.]=20utf?= =?UTF-8?q?8proc.c=20=20-=20Restored=20changes=20from=20610730f2314f4cdb52?= =?UTF-8?q?c64e2ef78a9d5d69402b66.=20=09(utf8proc=5Fcategory):=20Fixed=20n?= =?UTF-8?q?ew=20warning.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 6 ++++-- utf8proc.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 3fa9267..acc537e 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ PERL=perl CFLAGS ?= -O2 PICFLAG = -fPIC C99FLAG = -std=c99 -WCFLAGS = -Wall -Wextra -pedantic +WCFLAGS = -Wsign-conversion -Wall -Wextra -pedantic UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES) LDFLAG_SHARED = -shared SOFLAG = -Wl,-soname @@ -26,6 +26,8 @@ MAJOR=2 MINOR=4 PATCH=1 +UNICODE_VERSION ?= 14.0.0 + OS := $(shell uname) ifeq ($(OS),Darwin) # MacOS X SHLIB_EXT = dylib @@ -166,7 +168,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h $(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@ test/misc: test/misc.c 
test/tests.o utf8proc.o utf8proc.h test/tests.h - $(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION=`sed -En 's/^#.+UNICODE_VERSION.(.+)/\1/p' utf8proc_data.c` test/misc.c test/tests.o utf8proc.o -o $@ + $(CC) $(UCFLAGS) $(LDFLAGS) '-DUNICODE_VERSION="$(UNICODE_VERSION)"' test/misc.c test/tests.o utf8proc.o -o $@ check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o $(MAKE) -C bench diff --git a/utf8proc.c b/utf8proc.c index 784cdca..f2845be 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -132,7 +132,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst ) { - utf8proc_uint32_t uc; + utf8proc_int32_t uc; const utf8proc_uint8_t *end; *dst = -1; @@ -144,7 +144,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( return 1; } // Must be between 0xc2 and 0xf4 inclusive to be valid - if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; + if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; if (uc < 0xe0) { // 2-byte sequence // Must have valid continuation character if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; @@ -399,19 +399,19 @@ write_char_decomposed_case(utf8proc_uint16_t idx, utf8proc_int32_t *dst, utf8pro UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) { utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; - return cl != UINT16_MAX ? seqindex_decode_index(cl) : c; + return cl != UINT16_MAX ? 
seqindex_decode_index((utf8proc_uint32_t)cl) : c; } UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) { utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; - return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; + return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; } UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) { utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; - return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; + return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; } UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c) @@ -433,7 +433,7 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { } UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { - return utf8proc_get_property(c)->category; + return (utf8proc_category_t) utf8proc_get_property(c)->category; } UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { @@ -443,7 +443,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { #define utf8proc_decompose_lump(replacement_uc) \ return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ - options & ~UTF8PROC_LUMP, last_boundclass) + options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { const utf8proc_property_t *property; @@ -758,7 +758,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( *dstptr = NULL; result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); if (result < 0) return result; - buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); + buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1); if (!buffer) return 
UTF8PROC_ERROR_NOMEM; result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); if (result < 0) {