From 772eca6d0412fa3f5a3a39fe309bfb2e7b8754a0 Mon Sep 17 00:00:00 2001
From: chris0e3 <chris0e3@gmail.com>
Date: Thu, 16 Sep 2021 22:05:43 +0100
Subject: [PATCH 1/3] Resolves JuliaStrings/utf8proc#227

---
 Makefile      |  6 +++---
 data/Makefile | 50 ++++++++++++++++++++++----------------------------
 utf8proc.c    | 51 +++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/Makefile b/Makefile
index ede0609..3fa9267 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ PERL=perl
 CFLAGS ?= -O2
 PICFLAG = -fPIC
 C99FLAG = -std=c99
-WCFLAGS = -Wsign-conversion -Wall -Wextra -pedantic
+WCFLAGS = -Wall -Wextra -pedantic
 UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)
 LDFLAG_SHARED = -shared
 SOFLAG = -Wl,-soname
@@ -70,7 +70,7 @@ manifest: MANIFEST.new
 
 # real targets
 
-data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
+data/utf8proc_data.c.new: data_make.py
 	$(MAKE) -C data utf8proc_data.c.new
 
 utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
@@ -166,7 +166,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
 
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
+	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION=`sed -En 's/^#.+UNICODE_VERSION.(.+)/\1/p' utf8proc_data.c` test/misc.c test/tests.o utf8proc.o -o $@
 
 check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
diff --git a/data/Makefile b/data/Makefile
index 6d3667c..5573d04 100644
--- a/data/Makefile
+++ b/data/Makefile
@@ -1,63 +1,57 @@
 # Unicode data generation rules.  Except for the test data files, most
 # users will not use these Makefile rules, which are primarily to re-generate
-# unicode_data.c when we get a new Unicode version or charwidth data; they
-# require ruby and julia to be installed.
+# utf8proc_data.c when we get a new Unicode version or charwidth data.
+# Requires python 3.7+, curl & sed to be installed.
 
-# programs
-CURL=curl
-RUBY=ruby
-PERL=perl
-MAKE=make
-JULIA=julia
-CURLFLAGS = --retry 5 --location
+CURL = curl --retry 5 --location
 
 .PHONY: clean
 
 .DELETE_ON_ERROR:
 
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
-	$(RUBY) data_generator.rb < UnicodeData.txt > $@
-
-CharWidths.txt: charwidths.jl EastAsianWidth.txt
-	$(JULIA) charwidths.jl > $@
+utf8proc_data.c.new: ../data_make.py UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt \
+					 CompositionExclusions.txt CaseFolding.txt emoji-data.txt EastAsianWidth.txt
+	../data_make.py --format=1 --fix26 --output $@ .
 
 # Unicode data version (must also update utf8proc_unicode_version function)
-UNICODE_VERSION=13.0.0
+UNICODE_VERSION?=14.0.0
+URL_ROOT = $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd
 
 UnicodeData.txt:
-	$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
+	$(CURL) -o $@ $(URL_ROOT)/UnicodeData.txt
 
 EastAsianWidth.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/EastAsianWidth.txt
+	$(CURL) -o $@ $(URL_ROOT)/EastAsianWidth.txt
 
 GraphemeBreakProperty.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakProperty.txt
+	$(CURL) -o $@ $(URL_ROOT)/auxiliary/GraphemeBreakProperty.txt
 
 DerivedCoreProperties.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/DerivedCoreProperties.txt
+	$(CURL) -o $@ $(URL_ROOT)/DerivedCoreProperties.txt
 
 CompositionExclusions.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CompositionExclusions.txt
+	$(CURL) -o $@ $(URL_ROOT)/CompositionExclusions.txt
 
 CaseFolding.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CaseFolding.txt
+	$(CURL) -o $@ $(URL_ROOT)/CaseFolding.txt
 
 NormalizationTest.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/NormalizationTest.txt
+	$(CURL) -o $@ $(URL_ROOT)/NormalizationTest.txt
 
 GraphemeBreakTest.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt
+	$(CURL) -o $@ $(URL_ROOT)/auxiliary/GraphemeBreakTest.txt
 
 emoji-data.txt:
-	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
+	$(CURL) -o $@ $(URL_ROOT)/emoji/emoji-data.txt
 
 Uppercase.txt: DerivedCoreProperties.txt
-	$(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@
+	sed -En '/^# Derived Property: Uppercase/,/^# Total code points:/p' DerivedCoreProperties.txt > $@
 
 Lowercase.txt: DerivedCoreProperties.txt
-	$(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@
+	sed -En '/^# Derived Property: Lowercase/,/^# Total code points:/p' DerivedCoreProperties.txt > $@
 
 clean:
-	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
-	rm -f Uppercase.txt Lowercase.txt
+	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt \
+		  CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt \
+		  emoji-data.txt Uppercase.txt Lowercase.txt
 	rm -f utf8proc_data.c.new
diff --git a/utf8proc.c b/utf8proc.c
index 225738c..784cdca 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -51,6 +51,9 @@
 #endif
 
 #include "utf8proc_data.c"
+#ifndef U8CASEMAP
+#define utf8proc_casemap utf8proc_sequences
+#endif
 
 
 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
@@ -101,7 +104,11 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
+#ifdef UNICODE_VERSION
+  return UNICODE_VERSION;
+#else
   return "13.0.0";
+#endif
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
@@ -125,7 +132,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
 ) {
-  utf8proc_int32_t uc;
+  utf8proc_uint32_t uc;
   const utf8proc_uint8_t *end;
 
   *dst = -1;
@@ -137,7 +144,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
     return 1;
   }
   // Must be between 0xc2 and 0xf4 inclusive to be valid
-  if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
+  if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
   if (uc < 0xe0) {         // 2-byte sequence
      // Must have valid continuation character
      if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
@@ -232,9 +239,10 @@ static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint
 /* internal "unsafe" version that does not check whether uc is in range */
 static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
   /* ASSERT: uc >= 0 && uc < 0x110000 */
+  const int stage1shift = 16 - sizeof(utf8proc_stage1table[0]) * 8;
   return utf8proc_properties + (
     utf8proc_stage2table[
-      utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
+      (utf8proc_stage1table[uc >> 8] << stage1shift) + (uc & 0xFF)
     ]
   );
 }
@@ -350,14 +358,15 @@ static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
 
 static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
 {
-  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
+  const utf8proc_uint16_t *entry = &utf8proc_casemap[seqindex];
   return seqindex_decode_entry(&entry);
 }
 
-static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
+static utf8proc_ssize_t
+write_char_decomposed(const utf8proc_uint16_t *entry, int len, utf8proc_int32_t *dst,
+                      utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass)
+{
   utf8proc_ssize_t written = 0;
-  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
-  int len = seqindex >> 13;
   if (len >= 7) {
     len = *entry;
     entry++;
@@ -373,22 +382,36 @@ static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqinde
   return written;
 }
 
+static inline utf8proc_ssize_t
+write_char_decomposed_seq(utf8proc_uint16_t idx, utf8proc_int32_t *dst, utf8proc_ssize_t dstsize,
+                          utf8proc_option_t options, int *last_boundclass)
+{
+  return write_char_decomposed(&utf8proc_sequences[idx & 0x1FFF], idx >> 13, dst, dstsize, options, last_boundclass);
+}
+
+static inline utf8proc_ssize_t
+write_char_decomposed_case(utf8proc_uint16_t idx, utf8proc_int32_t *dst, utf8proc_ssize_t dstsize,
+                           utf8proc_option_t options, int* last_boundclass)
+{
+  return write_char_decomposed(&utf8proc_casemap[idx & 0x1FFF], idx >> 13, dst, dstsize, options, last_boundclass);
+}
+
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
 {
   utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
-  return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c;
+  return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
 {
   utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
-  return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
+  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
 {
   utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
-  return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
+  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
 }
 
 UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
@@ -420,7 +443,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
 
 #define utf8proc_decompose_lump(replacement_uc) \
   return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
-  options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass)
+  options & ~UTF8PROC_LUMP, last_boundclass)
 
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
   const utf8proc_property_t *property;
@@ -487,13 +510,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
   }
   if (options & UTF8PROC_CASEFOLD) {
     if (property->casefold_seqindex != UINT16_MAX) {
-      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
+      return write_char_decomposed_case(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
     }
   }
   if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
     if (property->decomp_seqindex != UINT16_MAX &&
         (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
-      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
+      return write_char_decomposed_seq(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
     }
   }
   if (options & UTF8PROC_CHARBOUND) {
@@ -735,7 +758,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
   *dstptr = NULL;
   result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
   if (result < 0) return result;
-  buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1);
+  buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
   if (!buffer) return UTF8PROC_ERROR_NOMEM;
   result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
   if (result < 0) {

From a9bee2f056aebc2fea3e26e50c7b54ad8ca66e21 Mon Sep 17 00:00:00 2001
From: chris0e3 <chris0e3@gmail.com>
Date: Thu, 16 Sep 2021 22:11:56 +0100
Subject: [PATCH 2/3] Missing file. Resolves JuliaStrings/utf8proc#227

---
 data_make.py | 588 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 588 insertions(+)
 create mode 100755 data_make.py

diff --git a/data_make.py b/data_make.py
new file mode 100755
index 0000000..4234f88
--- /dev/null
+++ b/data_make.py
@@ -0,0 +1,588 @@
+#!/usr/bin/python3
+# Generate 'utf8proc_data.c' by parsing the Unicode data files 'UnicodeData.txt' etc.
+# from the Unicode Character Database.  Tested with UCD 13.0.0 & 14.0.0-dev.
+# Usage: data_make.py [options…] [<data-dir>]
+
+import re, os, sys, fileinput, copy, datetime, getopt, platform
+from collections import defaultdict
+
+Me     = os.path.basename(sys.argv[0])
+Vers   = '2.7.0dev'		# ???
+DDir   = "./data"
+Targ   = "utf8proc_data.out.c"
+UVers  = None
+gVerb  = 0
+gFmt   = 0
+gCMap  = False
+gS1Byt = False
+gFix26 = False
+
+
+def Print (s):
+	print(s, file=sys.stderr)
+
+def Error (msg):
+	Print("%s: ERROR: %s." % (Me, msg))
+
+def TextFile (name):
+	return DDir+ "/" +name+ ".txt"
+
+def Sed (expr, file):		# Run `sed -En <expr>` (sed found via PATH) on a data file; returns a line-buffered read pipe.
+	return os.popen("sed -En '" +expr+ "' '" +TextFile(file)+ "'", 'r', 1)
+
+def Cat (file):
+	return fileinput.input(files=TextFile(file), mode='r')
+
+def Hex (s):
+	return int(s, 16)
+
+def Hex0 (s):
+	return int(s, 16) if s else -1
+
+def HexArray (hexs):
+	v = []
+	for h in hexs.split(' '):
+		if h: v.append(int(h, 16))
+	return v
+
+
+try:		# Parse command-line options, then derive the remaining settings from the UCD version.
+	opts, args = getopt.getopt(sys.argv[1:], "vxcf:o:",
+							   ["verbose", "fix26", "cmap", "format=", "output="])
+	omap = {'verbose':'v', 'fix26':'x', 'cmap':'c', 'format':'f', 'output':'o'}
+	for o, a in opts:
+		o = o.lstrip('-')
+		o = omap.get(o, o)		# fold long option names onto their short form
+		if   o == 'v':	gVerb += 1
+		elif o == 'x':	gFix26 = True
+		elif o == 'c':	gCMap  = True
+	#	elif o == 'b':	gS1Byt = True
+		elif o == 'f':	gFmt   = min(max(0, int(a)), 2)		# clamp to 0..2; non-numeric is reported below
+		elif o == 'o':	Targ   = a
+
+	if len(args): DDir = args[0]
+	UVers = (' ' + Sed(r"1s/.+-([0-9.]+)\..+/\1/p;q",
+					   "DerivedCoreProperties").readline()).strip()
+	if gFmt: gFix26 = True
+	if gFmt or [int(n) for n in UVers.split('.') if n.isdigit()] >= [14]: gCMap = True	# numeric compare: '9.0.0' < '14.0.0'
+	gS1Byt = gFmt > 0
+except (getopt.GetoptError, ValueError) as err:		# ValueError: non-integer --format argument
+	Error(err)
+	sys.exit(1)
+
+if gVerb: print("# Settings: data-format: %d  fix-2.6.1: %d  has-casemap: %d  UCD-version: %s"
+				% (gFmt, gFix26, gCMap, UVers))
+
+#--------------------------------------------------------------------------------------------------
+
+def ParseDCProps (name, start):
+	v = set()
+	r1 = re.compile(r'^[0-9A-F]+')
+	r2 = re.compile(r'^([0-9A-F]{4,6})\.\.([0-9A-F]+)')
+	for L in Sed("/^# Derived Property: " +start+ "/,/^# Total code points:/p", "DerivedCoreProperties"):
+		m = re.match(r2, L)
+		if m:
+			for i in range(Hex(m[1]), Hex(m[2]) + 1): v.add(i)
+		else:
+			m = re.match(r1, L)
+			if m: v.add(Hex(m[0]))
+	return v
+
+
+def Ignorable ():
+	return ParseDCProps("Ignorable", "Default_Ignorable_Code_Point")
+
+def Uppercase ():
+	return ParseDCProps("Uppercase", "Uppercase")
+
+def Lowercase ():
+	return ParseDCProps("Lowercase", "Lowercase")
+
+
+def GraphemeBounds ():
+	v = defaultdict(lambda: "Other")
+	r1 = re.compile(r'^([0-9A-F]+)\s*;\s*([A-Za-z_]+)')
+	r2 = re.compile(r'^([0-9A-F]{4,6})\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)')
+	for L in Cat("GraphemeBreakProperty"):
+		m = re.match(r2, L)
+		if m:
+			C = m[3]
+			for n in range(Hex(m[1]), Hex(m[2]) + 1): v[n] = C
+		else:
+			m = re.match(r1, L)
+			if m:
+				C = m[2]
+				v[Hex(m[1])] = C
+
+	r1 = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W', re.I)
+	r2 = re.compile(r'^([0-9A-F]+)\s*;\s*Extended_Pictographic\W', re.I)
+	r3 = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W', re.I)
+	r4 = re.compile(r'^([0-9A-F]+)\s*;\s*Emoji_Modifier\W', re.I)
+	extPict = 'EXTENDED_PICTOGRAPHIC'; extend = 'EXTEND'
+	for L in Cat("emoji-data"):
+		m = re.match(r1, L)
+		if m:
+			for e in range(Hex(m[1]), Hex(m[2]) + 1): v[e] = extPict
+		else:
+			m = re.match(r2, L)
+			if m:
+				v[Hex(m[1])] = extPict
+			else:
+				m = re.match(r3, L)
+				if m:
+					for e in range(Hex(m[1]), Hex(m[2]) + 1): v[e] = extend
+				else:
+					m = re.match(r4, L)
+					if m: v[Hex(m[1])] = extend
+	return v
+
+
+def CharWidths ():
+	cws = defaultdict(lambda: 1)
+
+	# Use a default width of 1 for all character categories that are letter/symbol/number-like,
+	# as well as for unassigned/private-use chars. This can be overridden by UAX 11 below,
+	# but provides a useful nonzero fallback for new codepoints when a new Unicode version
+	# has been released but Unifont hasn't been updated yet.
+
+	# Categories that may contain zero-width chars
+	zerowidth = set(('Mn','Mc','Me','Zl','Zp','Cc','Cf','Cs'))	# +'Sk' - see issue #167
+	for c in uchars:
+		if c.category in zerowidth:
+			cws[c.code] = 0
+
+	# Widths from UAX #11: East Asian Width
+	# These take precedence for all codepoints listed explicitly as wide/full/narrow/half-width
+	rx = re.compile(r'^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?;([AFHNW]a?)(\s*#\s*Cn\W)?')
+	for L in Cat("EastAsianWidth"):
+		m = re.match(rx, L)
+		if m:
+			width  = m[4]
+			cstart = Hex(m[1])
+			cend   = Hex(m[3]) if m.end(3) > 0 else cstart
+			for c in range(cstart, cend + 1):			# Assign widths
+				if not m[5]:							# skip any with a `# Cn …` comment
+					if width == 'W' or width == 'F':	# wide or full
+						cws[c] = 2
+					elif width == 'Na' or width == 'H':
+						cws[c] = 1
+
+	# A few exceptions to the above cases, found by manual comparison
+	# to other wcwidth functions and similar checks.
+	for ch in uchars:
+		c   = ch.code
+		cat = ch.category
+
+			# Ensure format control chars (cat Cf) have width 0 (some of these, like U+0601,
+			# can have a width in some cases but normally act like prepended combining marks.
+			#  U+FFF9 etc. are also odd, but have zero width in typical terminal contexts)
+		if cat == 'Cf':						cws[c] = 0
+
+			# Unifont has nonzero width for a number of non-spacing combining characters,
+			# e.g. (in 7.0.06): F84,17B4,17B5,180B,180D,2D7F and the variation selectors
+		elif cat == 'Mn':					cws[c] = 0
+
+			# We also assign width of one to unassigned and private-use codepoints
+			# (Unifont includes ConScript Unicode Registry PUA fonts, but since these
+			# are nonstandard it seems questionable to use Unifont metrics;
+			# if they are printed as the replacement char U+FFFD they will have width 1).
+		elif cat == 'Co' or cat == 'Cn':	cws[c] = 1
+
+			# For some reason, Unifont has width-2 glyphs for ASCII control chars
+		elif cat == 'Cc':					cws[c] = 0
+
+	# Soft hyphen is typically printed as a hyphen (-) in terminals.
+	cws[0x00AD] = 1
+
+	# By definition, should have zero width (on the same line)
+	cws[0x2028] = 0		# category: Zl, name: LINE SEPARATOR
+	cws[0x2029] = 0		# category: Zp, name: PARAGRAPH SEPARATOR
+	return cws
+
+
+def CompExclusions (start, name):
+	v = set()
+	rx = re.compile(r'^[0-9A-F]+')
+	for L in Sed("/^# " +start+ "/,/^# Total code points:/p", "CompositionExclusions"):
+		m = re.match(rx, L)
+		if m: v.add(Hex(m[0]))
+	return v
+
+
+def Exclusions ():		# Code points listed in section (1) of CompositionExclusions.txt
+	v = CompExclusions(r"\(1\) Script Specifics", "Exclusions")	# raw string: `\(` must reach sed unchanged
+	# data_generator.rb erroneously adds `0` in lines 136 & 139 (for each comment line)
+	if not gFix26: v.add(0)		#•••
+	return v
+
+
+def Precomposed ():		# Code points listed in section (2) of CompositionExclusions.txt
+	return CompExclusions(r"\(2\) Post Composition Version precomposed characters", "Precomposed")
+
+
+def CaseFolding ():
+	v = {}
+	rx = re.compile(r'^([0-9A-F]+); [CF]; ([0-9A-F ]+);')
+	for L in Cat("CaseFolding"):
+		m = re.match(rx, L)
+		if m: v[Hex(m[1])] = HexArray(m[2])
+	return v
+
+
+udRE = re.compile(r'^'
+		r'([0-9A-F]+);'			#  1: code
+		r'([^;]+);'				#  2: name
+		r'([A-Za-z]+);'	 		#  3: general category
+		r'([0-9]+);' 			#  4: canonical combining class
+		r'([A-Z]+);' 			#  5: bidi class
+		r'(<([A-Za-z]*)>)?'		#  7: decomposition type
+		r'((\ ?[0-9A-F]+)*);'	#  8: decomposition mapping
+		r'([0-9]*);'			# 10: decimal digit
+		r'([0-9]*);'			# 11: digit
+		r'([^;]*);'				# 12: numeric
+		r'([YN]*);'				# 13: bidi mirrored
+		r'([^;]*);'				# 14: unicode 1.0 name
+		r'([^;]*);'				# 15: iso comment
+		r'([0-9A-F]*);'			# 16: simple uppercase mapping
+		r'([0-9A-F]*);'			# 17: simple lowercase mapping
+		r'([0-9A-F]*)$')		# 18: simple titlecase mapping
+uchar_hash = {}
+pSequences = []
+seqs_hash  = {}
+pCaseMap   = []
+cmap_hash  = {}
+
+def push_seq (seq):
+	key = str(seq)
+	if key not in seqs_hash:
+		idx = len(pSequences)
+		seqs_hash[key] = idx
+		pSequences.extend(seq)
+		return idx
+	return seqs_hash[key]
+
+def push_cas (seq):
+	if not gCMap: return push_seq(seq)
+	key = str(seq)
+	if key not in cmap_hash:
+		idx = len(pCaseMap)
+		cmap_hash[key] = idx
+		pCaseMap.extend(seq)
+		return idx
+	return cmap_hash[key]
+
+def to_u16 (seq):				# Convert a list of code points to a list of UTF-16 code units
+	v = []
+	for cp in seq:
+		if cp <= 0xFFFF:
+			if (cp >> 11) == 0x1B: Error("UTF-16 code: U+%06X" % cp)	# cp lies in D800..DFFF (reported, still appended)
+			v.append(cp)
+		else:
+			v += [0xD800 | ((cp - 0x10000) >> 10), 0xDC00 | (cp & 0x03FF)]	# two units: high then low surrogate
+	return v
+
+def dm_index (seq):				# decomp_map sequence
+	if not seq or len(seq) == 0: return g_
+	lencode = len(seq) - 1		# no sequence has len 0, so we encode len 1 as 0, len 2 as 1, …
+	seq = to_u16(seq)
+	if lencode >= 7:			# we have 3 bits for length (which is cutting it close.
+		seq.insert(0, lencode)	# May need to change it to 2 bits in future Unicode versions)
+		lencode = 7
+	idx = push_seq(seq)
+	if idx > 0x1FFF: Error("decomp_map: pSequences[%d] out of bounds" % idx)
+	return idx | (lencode << 13)
+
+def cf_index (seq):				# case_fold sequence
+	if not gCMap: return dm_index(seq)
+	if not seq or len(seq) == 0: return g_
+	lencode = len(seq) - 1		# no sequence has len 0, so we encode len 1 as 0, len 2 as 1, …
+	seq = to_u16(seq)
+	if lencode >= 7:			# we have 3 bits for length
+		seq.insert(0, lencode)
+		lencode = 7
+	idx = push_cas(seq)
+	if idx > 0x1FFF: Error("case_fold: pCaseMap[%d] out of bounds" % idx)
+	return idx | (lencode << 13)
+
+def case_map (cp):
+	if not cp: return g_
+	if cp > 0xFFFF:
+		idx = push_cas([0xD800 | ((cp - 0x10000) >> 10), 0xDC00 | (cp & 0x03FF)])
+	else:
+		if (cp >> 11) == 0x1B: Error("UTF-16 code: U+%06X" % cp)
+		idx = push_cas([cp])
+	if idx >= 0xFFFF: Error("case_map: pCaseMap[%d] out of bounds" % idx);
+	return idx
+
+def S (s): return s if s != None else g_
+
+def F (f): return gT if f else gF
+
+def D0 (s): return 'UTF8PROC_DECOMP_TYPE_' + s.upper() if s else '0'
+def D1 (s): return s.upper() if s else '0'
+
+g_ = 'UINT16_MAX'; gT = 'true'; gF = 'false'; D = D0
+gPF = "  {{UTF8PROC_CATEGORY_{}, {}, UTF8PROC_BIDI_CLASS_{}, {}, {}," \
+	  " {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, 0, UTF8PROC_BOUNDCLASS_{}}},"
+if gFmt: g_ = '_'; gT = 'T'; gF = 'F'; D = D1; \
+		 gPF = "\tP({},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{})"
+ZZCC = set(('Zl', 'Zp', 'Cc', 'Cf'))
+
+class UChar:
+	def __init__ (self, line):
+		m = re.match(udRE, line)
+		if not m: Error("Could not parse input ‘%s’" % line)
+		uc = Hex(m[1])
+		self.code        = uc
+		self.name        = m[2]
+		self.category    = m[3]
+		self.comb_class  = int(m[4])
+		self.bidi_class  = m[5]
+		self.decomp_type = m[7]
+		self.decomp_map  = HexArray(m[8]) if m[8] else None
+		self.decomp_idx  = g_
+		self.bidi_mirror = (m[13] == 'Y')
+		# issue #130: use nonstandard uppercase ß -> ẞ
+		# issue #195: if character is uppercase but has no lowercase mapping,
+		#             then make lowercase mapping = itself (vice versa for lowercase)
+		U = Hex0(m[16]); L = Hex0(m[17]); T = Hex0(m[18])
+		self.uppercase  = (0x1E9E if uc==0x00DF else (uc if L<0 and uc in lowercase \
+														 else None)) if U<0 else U
+		self.lowercase  = (uc if U<0 and uc in uppercase else None) if L<0 else L
+		self.titlecase  = (0x1E9E if uc==0x00DF else None) if T<0 else T
+		self.case_fold  = caseFolding.get(uc)
+		self.centry_idx = None
+
+	def ctrl_bound (self):
+		return F((self.category in ZZCC) and (not gFix26 or (self.code != 0x200C and
+															 self.code != 0x200D)))		#•••
+
+	def c_entry (self):
+		uc      = self.code
+		com_idx = combIndexes.get(uc, g_)
+		comp_ex = F(uc in exclusions or uc in exclVersion)
+		ignore  = F(uc in ignorable)
+		gbounds = grfmBounds[uc].upper()
+		self.uppercase = case_map(self.uppercase)
+		self.lowercase = case_map(self.lowercase)
+		self.titlecase = case_map(self.titlecase)
+		return gPF.format(
+				self.category.upper(), self.comb_class, self.bidi_class,
+				D(self.decomp_type), S(self.decomp_idx), S(self.case_fold),
+				self.uppercase, self.lowercase, self.titlecase, com_idx, F(self.bidi_mirror),
+				comp_ex, ignore, self.ctrl_bound(), charWidths[uc], gbounds)
+
+
+def UnicodeData ():			# Parse UnicodeData.txt into a list of UChar, expanding `<…, First>`/`<…, Last>` ranges
+	uchars = []
+	r1 = re.compile(r'^([0-9A-F]+);<[^;>,]+, First>;')
+	r2 = re.compile(r'^([0-9A-F]+);<([^;>,]+), Last>;')
+	state = True				# True: expecting an ordinary or `First` line; False: expecting the `Last` line
+	for L in Cat("UnicodeData"):
+		if state:
+			m = re.match(r1, L)
+			if m:
+				first = Hex(m[1])
+				state = False
+			else:
+				ch = UChar(L)
+				uchar_hash[ch.code] = ch
+				uchars.append(ch)
+		else:
+			m = re.match(r2, L)
+			if not m: Error("No last character of sequence U+%04X … found" % first); sys.exit(1)
+			name  = '<' + m[2] + '>'
+			ch    = UChar(L)
+			state = True
+			for i in range(first, Hex(m[1]) + 1):	# one shallow copy per code point in the range
+				ch_clone = copy.copy(ch)
+				ch_clone.code = i
+				ch_clone.name = name
+				uchar_hash[i] = ch_clone
+				uchars.append(ch_clone)
+	if not state: Error("No last character of sequence U+%04X … found" % first); sys.exit(1)	# never emit partial data
+	return uchars
+
+
+def Sequences ():
+	comb1st      = {}
+	comb1st_keys = []			# Ordered list of comb1st’ keys.
+	comb2nd      = {}
+	comb2nd_keys = []
+	comb2nd_long = set()
+	comb_array   = {}
+	for ch in uchars:
+		dm = ch.decomp_map
+		if not ch.decomp_type and dm and len(dm) == 2 and dm[0] in uchar_hash and \
+				uchar_hash[dm[0]].comb_class == 0 and ch.code not in exclusions:
+			dm0 = dm[0]
+			dm1 = dm[1]
+			if dm0 not in comb1st or comb1st[dm0] == None:
+				comb1st[dm0] = c1i_dm0 = len(comb1st)
+				comb1st_keys.append(dm0)
+			else:
+				c1i_dm0 = comb1st[dm0]
+			if dm1 not in comb2nd or comb2nd[dm1] == None:
+				comb2nd_keys.append(dm1)
+				comb2nd[dm1] = len(comb2nd)
+			if not comb_array.get(c1i_dm0):
+				comb_array[c1i_dm0] = {}
+			if comb2nd[dm1] in comb_array[c1i_dm0]:
+				Error("Duplicate canonical mapping: U+%05X %d/%d" % (ch.code, dm0, dm1))
+			comb_array[c1i_dm0][comb2nd[dm1]] = ch.code
+			if ch.code > 0xFFFF: comb2nd_long.add(dm1)
+		ch.decomp_idx = dm_index(dm)
+		ch.case_fold  = cf_index(ch.case_fold)
+
+	comb_idxs       = {}
+	comb1st_offsets = {}			# (first, last) tuples
+	cur_pos = 0
+	for dm0 in comb1st_keys:		# Force comb1st build order
+		index  = comb1st[dm0]
+		comb_i = comb_array[index]
+		first  = None
+		last   = None
+		offset = 0; b = -1
+		for dm1 in comb2nd_keys:
+			b += 1
+			if b in comb_i and comb_i[b] != None:
+				if first == None: first = offset
+				last = offset
+				if dm1 in comb2nd_long: last += 1
+			offset += 1
+			if dm1 in comb2nd_long: offset += 1
+		comb1st_offsets[index] = (first, last)
+		if dm0 in comb_idxs: Error("double index at %d" % dm0)
+		comb_idxs[dm0] = cur_pos
+		cur_pos += last - first + 1 + 2
+
+	offset = 0
+	for dm1 in comb2nd_keys:
+		if dm1 in comb_idxs: Error("double index at %d" % dm1)
+		comb_idxs[dm1] = 0x8000 | (comb2nd[dm1] + offset)
+		if comb2nd[dm1] + offset > 0x4000: Error("too large comb index at %d" % dm1)
+		if dm1 in comb2nd_long:
+			comb_idxs[dm1] = comb_idxs[dm1] | 0x4000
+			offset += 1
+
+	class SStr:						# A string stream
+		def __init__ (self):      self.s = "  "
+		def __lshift__ (self, x): self.s += x; return self
+		def i (self, x):          self.s += "%d, " % x; return self
+	class SVec:						# A vector stream
+		def __init__ (self):      self.s = []
+		def __lshift__ (self, x): return self
+		def i (self, x):          self.s.append(x); return self
+
+	# Create string with original line breaking or array of ints
+	i = 0; s = SVec() if gFmt else SStr()
+	for a in range(len(comb1st)):
+		o1 = comb1st_offsets[a]
+		s.i(o1[0]).i(o1[1])
+		offset = 0; b = -1
+		for dm1 in comb2nd_keys:
+			b += 1
+			if offset > o1[1]: break
+			if offset >= o1[0]:
+				i += 1
+				if i == 8: i = 0; s << "\n  "
+				v = comb_array[a][b] if b in comb_array[a] else 0
+				if dm1 in comb2nd_long: s.i((v & 0xFFFF0000) >> 16)
+				s.i(v & 0xFFFF)
+			offset += 2 if dm1 in comb2nd_long else 1
+		s << "\n"
+	return s.s, comb_idxs
+
+
+def WriteIntArray (array, name, file, bytes = False):
+	print("static const utf8proc_uint%d_t utf8proc_%s[] = {"
+			% ((8 if bytes else 16), name), file=file)
+	if isinstance(array, str):
+		print(array, end='', file=file)
+	elif gFmt:
+		i = 0; pre = "\t"
+		for e in array:
+			if i and i % 16 == 0: pre = ",\n\t"
+			i += 1
+			print("%s%d" % (pre, e), end='', file=file)
+			pre = ","
+		print("", file=file)
+	else:
+		i = 0; print("  ", end='', file=file)
+		for e in array:
+			i += 1
+			if i == 8: print("\n  ", end='', file=file); i = 0
+			print("%d, " % e, end='', file=file)
+	print("};\n", file=file)
+
+
+def WriteData (stage1, stage2, props, combinations):
+	f = open(Targ, mode='w')
+	if gFmt: print("// Generated by %s v%s from UnicodeData version %s on %s.\n"
+				   "// Options: --format=%d%s%s\n\n"
+				   "#define	UNICODE_VERSION	\"%s\""
+				   % (Me, Vers, UVers, datetime.datetime.utcnow().strftime("%F %T"), gFmt,
+					  (" --fix26" if gFix26 else ""), (" --cmap" if gCMap else ""), UVers), file=f)
+	WriteIntArray(pSequences, "sequences", f)
+	if gCMap:
+		print("#define	U8CASEMAP", file=f)
+		WriteIntArray(pCaseMap, "casemap", f)
+	WriteIntArray(stage1, "stage1table", f, gS1Byt)
+	WriteIntArray(stage2, "stage2table", f)
+	if gFmt:
+		print("#define	P(C,c,B,D,ds,cs,us,ls,ts,x,m,e,i,cb,w,b)\t"
+			  "{UTF8PROC_CATEGORY_##C,c,UTF8PROC_BIDI_CLASS_##B,\\\n\t\t"
+			  "UTF8PROC_DECOMP_TYPE_##D,ds,cs,us,ls,ts,x,m,e,i,cb,w,0,UTF8PROC_BOUNDCLASS_##b},\n"
+			  "enum { F = false, T = true, UTF8PROC_BIDI_CLASS_0 = 0,"
+			  " UTF8PROC_DECOMP_TYPE_0 = 0, _ = UINT16_MAX };\n\n"
+			  "static const utf8proc_property_t utf8proc_properties[] = {\n"
+			  "	P(CN,0,0,0,_,_,_,_,_,_,F,F,F,F,1,OTHER)", file=f)
+	else:
+		u = "UINT16_MAX, "; b = "false,"
+		print("static const utf8proc_property_t utf8proc_properties[] = {\n"
+			  "  {0, 0, 0, 0, "+u+u+u+u+u+u+' '+b+b+b+b+" 1, 0, UTF8PROC_BOUNDCLASS_OTHER},", file=f)
+	for c in props:
+		print(c, file=f)
+	print("};\n", file=f)
+	WriteIntArray(combinations, "combinations", f)
+	if gFmt: print("// End.", file=f)
+	if gVerb: print("# Wrote ‘%s’." % Targ)
+
+
+ignorable   = Ignorable()
+uppercase   = Uppercase()
+lowercase   = Lowercase()
+grfmBounds  = GraphemeBounds()		# Grapheme bounds classes
+exclusions  = Exclusions()
+exclVersion = Precomposed()
+caseFolding = CaseFolding()
+uchars      = UnicodeData()
+charWidths  = CharWidths()
+(pCombinations, combIndexes) = Sequences()
+
+prop_idxs  = {}			# property-entry text -> its index in pProperties
+pProperties = []		# unique property entries, in first-seen order
+for ch in uchars:
+	centry = ch.c_entry()
+	ch.centry_idx = prop_idxs.get(centry)
+	if ch.centry_idx is None:		# index 0 is a valid slot; only a missing key means "new entry"
+		prop_idxs[centry] = ch.centry_idx = len(pProperties)
+		pProperties.append(centry)
+
+pStage1 = []		# stage1table
+pStage2 = []		# stage2table
+chunks2 = []
+scale = 1 if gS1Byt else 0x100
+for page in range(0, 0x110000, 0x100):
+	stage2_entry = []
+	for code in range(page, page + 0x100):
+		stage2_entry.append(uchar_hash[code].centry_idx + 1 if code in uchar_hash else 0)
+	if stage2_entry in chunks2:
+		pStage1.append(chunks2.index(stage2_entry) * scale)
+	else:
+		pStage1.append(len(chunks2) * scale)
+		pStage2.extend(stage2_entry)
+		chunks2.append(stage2_entry)
+
+WriteData(pStage1, pStage2, pProperties, pCombinations)
+

From dbc2b185a9d94f52fe1c70f6f3c83c342ec48dd3 Mon Sep 17 00:00:00 2001
From: chris0e3 <chris0e3@gmail.com>
Date: Fri, 17 Sep 2021 01:12:34 +0100
Subject: [PATCH 3/3] =?UTF-8?q?Makefile=20-=20(WCFLAGS):=20Restored=20`-Ws?=
 =?UTF-8?q?ign-conversion`=20from=20610730f2314f4cdb52c64e2ef78a9d5d69402b?=
 =?UTF-8?q?66.=20=09(UNICODE=5FVERSION):=20Added=20`14.0.0`=20default.=20?=
 =?UTF-8?q?=09=09[Note:=20It=20may=20be=20possible=20to=20remove=20the=20s?=
 =?UTF-8?q?imilar=20statement=20in=20data/Makefile=20but=20it=20may=20requ?=
 =?UTF-8?q?ire=20updating=20all=20the=20calls=20to=20`$(MAKE)=20-C=20data?=
 =?UTF-8?q?=20=E2=80=A6`=20in=20this=20file.]=20=09(test/misc):=20Use=20ab?=
 =?UTF-8?q?ove=20UNICODE=5FVERSION=20var=20so=20that=20this=20target=20als?=
 =?UTF-8?q?o=20builds=20with=20original=20utf8proc=5Fdata.c=20file.=20=09?=
 =?UTF-8?q?=09[Note:=20This=20test=20in=20test/misc=20is=20now=20redundant?=
 =?UTF-8?q?=20when=20using=20the=20new=20utf8proc=5Fdata.c=20files.]=20utf?=
 =?UTF-8?q?8proc.c=20=20-=20Restored=20changes=20from=20610730f2314f4cdb52?=
 =?UTF-8?q?c64e2ef78a9d5d69402b66.=20=09(utf8proc=5Fcategory):=20Fixed=20n?=
 =?UTF-8?q?ew=20warning.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Makefile   |  6 ++++--
 utf8proc.c | 16 ++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index 3fa9267..acc537e 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ PERL=perl
 CFLAGS ?= -O2
 PICFLAG = -fPIC
 C99FLAG = -std=c99
-WCFLAGS = -Wall -Wextra -pedantic
+WCFLAGS = -Wsign-conversion -Wall -Wextra -pedantic
 UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)
 LDFLAG_SHARED = -shared
 SOFLAG = -Wl,-soname
@@ -26,6 +26,8 @@ MAJOR=2
 MINOR=4
 PATCH=1
 
+UNICODE_VERSION ?= 14.0.0
+
 OS := $(shell uname)
 ifeq ($(OS),Darwin) # MacOS X
   SHLIB_EXT = dylib
@@ -166,7 +168,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
 
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION=`sed -En 's/^#.+UNICODE_VERSION.(.+)/\1/p' utf8proc_data.c` test/misc.c test/tests.o utf8proc.o -o $@
+	$(CC) $(UCFLAGS) $(LDFLAGS) '-DUNICODE_VERSION="$(UNICODE_VERSION)"' test/misc.c test/tests.o utf8proc.o -o $@
 
 check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
diff --git a/utf8proc.c b/utf8proc.c
index 784cdca..f2845be 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -132,7 +132,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
 ) {
-  utf8proc_uint32_t uc;
+  utf8proc_int32_t uc;
   const utf8proc_uint8_t *end;
 
   *dst = -1;
@@ -144,7 +144,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
     return 1;
   }
   // Must be between 0xc2 and 0xf4 inclusive to be valid
-  if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
+  if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
   if (uc < 0xe0) {         // 2-byte sequence
      // Must have valid continuation character
      if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
@@ -399,19 +399,19 @@ write_char_decomposed_case(utf8proc_uint16_t idx, utf8proc_int32_t *dst, utf8pro
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
 {
   utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
-  return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
+  return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
 {
   utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
-  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
+  return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
 {
   utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
-  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
+  return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
 }
 
 UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
@@ -433,7 +433,7 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
 }
 
 UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
-  return utf8proc_get_property(c)->category;
+  return (utf8proc_category_t) utf8proc_get_property(c)->category;
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
@@ -443,7 +443,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
 
 #define utf8proc_decompose_lump(replacement_uc) \
   return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
-  options & ~UTF8PROC_LUMP, last_boundclass)
+  options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass)
 
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
   const utf8proc_property_t *property;
@@ -758,7 +758,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
   *dstptr = NULL;
   result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
   if (result < 0) return result;
-  buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
+  buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1);
   if (!buffer) return UTF8PROC_ERROR_NOMEM;
   result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
   if (result < 0) {