From 5e7407101a5a73ecb25a556ae9ede3e755b17a38 Mon Sep 17 00:00:00 2001
From: David Demelier <markand@malikania.fr>
Date: Tue, 28 Jan 2020 14:33:11 +0100
Subject: [PATCH] add new utf8proc_sizeof_char function

---
 .gitignore        |  1 +
 Makefile          |  6 +++++-
 test/sizeofchar.c | 41 +++++++++++++++++++++++++++++++++++++++++
 utf8proc.c        | 13 +++++++++++++
 utf8proc.h        |  7 +++++++
 5 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 test/sizeofchar.c

diff --git a/.gitignore b/.gitignore
index ffd2eda..482d491 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,4 +27,5 @@ test/valid
 test/iterate
 test/case
 test/custom
+test/sizeofchar
 /tmp/
diff --git a/Makefile b/Makefile
index 0de0697..0957424 100644
--- a/Makefile
+++ b/Makefile
@@ -156,10 +156,13 @@ test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
 
+test/sizeofchar: test/sizeofchar.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) $(LDFLAGS) test/sizeofchar.c test/tests.o utf8proc.o -o $@
+
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
 
-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/sizeofchar test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
 	test/normtest data/NormalizationTest.txt
 	test/graphemetest data/GraphemeBreakTest.txt
@@ -169,3 +172,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
 	test/iterate
 	test/case
 	test/custom
+	test/sizeofchar
diff --git a/test/sizeofchar.c b/test/sizeofchar.c
new file mode 100644
index 0000000..b86b17a
--- /dev/null
+++ b/test/sizeofchar.c
@@ -0,0 +1,41 @@
+#include "tests.h"
+#include <ctype.h>
+#include <wchar.h>
+
+int main(int argc, char **argv)
+{
+     int c, error = 0; /* c: codepoint under test; error: number of mismatches */
+
+     (void) argc; /* unused */
+     (void) argv; /* unused */
+
+     /* sweep every value in [0, 0x110000) and check utf8proc_sizeof_char per length class */
+     for (c = 0; c < 0x80; c++) { /* 1 byte: 0x00..0x7f */
+        if (utf8proc_sizeof_char(c) != 1) {
+           fprintf(stderr, "Failed: sizeof_char(%04x) != 1\n", (unsigned int) c);
+           error++;
+        }
+     }
+     for (;c < 0x800; c++) { /* 2 bytes: 0x80..0x7ff */
+        if (utf8proc_sizeof_char(c) != 2) {
+           fprintf(stderr, "Failed: sizeof_char(%04x) != 2\n", (unsigned int) c);
+           error++;
+        }
+     }
+     for (;c < 0x10000; c++) { /* 3 bytes: 0x800..0xffff, surrogates included */
+        if (utf8proc_sizeof_char(c) != 3) {
+           fprintf(stderr, "Failed: sizeof_char(%06x) != 3\n", (unsigned int) c);
+           error++;
+        }
+     }
+     for (;c < 0x110000; c++) { /* 4 bytes: 0x10000..0x10ffff */
+        if (utf8proc_sizeof_char(c) != 4) {
+           fprintf(stderr, "Failed: sizeof_char(%06x) != 4\n", (unsigned int) c);
+           error++;
+        }
+     }
+     check(!error, "utf8proc_sizeof_char FAILED %d tests.", error);
+     printf("utf8proc_sizeof_char tests SUCCEEDED.\n"); /* name the test that actually ran */
+
+     return 0;
+}
diff --git a/utf8proc.c b/utf8proc.c
index 3920d50..34e528d 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -174,6 +174,19 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
     return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
 }
 
+/* UTF-8 byte length of uc; 0 when uc is negative or above 0x10FFFF. */
+UTF8PROC_DLLEXPORT int utf8proc_sizeof_char(utf8proc_int32_t uc)
+{
+   if (uc < 0x00 || uc >= 0x110000) {
+      return 0; /* not encodable; matches utf8proc_encode_char returning 0 */
+   } else if (uc < 0x80) {
+      return 1;
+   } else if (uc < 0x800) {
+      return 2;
+   }
+   return (uc < 0x10000) ? 3 : 4; /* surrogates are not rejected */
+}
+
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
   if (uc < 0x00) {
     return 0;
diff --git a/utf8proc.h b/utf8proc.h
index 490200f..b78f794 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -439,6 +439,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str
  */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
 
+/**
+ * Returns the number of bytes needed to encode `codepoint` in UTF-8 (1 to 4),
+ * or 0 when `codepoint` is 0x110000 or larger and so cannot be encoded.
+ * No other validity check is made: surrogates (0xD800..0xDFFF) yield 3.
+ */
+UTF8PROC_DLLEXPORT int utf8proc_sizeof_char(utf8proc_int32_t codepoint);
+
 /**
  * Encodes the codepoint as an UTF-8 string in the byte array pointed
  * to by `dst`. This array must be at least 4 bytes long.