From 5e7407101a5a73ecb25a556ae9ede3e755b17a38 Mon Sep 17 00:00:00 2001
From: David Demelier
Date: Tue, 28 Jan 2020 14:33:11 +0100
Subject: [PATCH] add new utf8proc_sizeof_char function

utf8proc_sizeof_char() returns the number of UTF-8 bytes needed for a
codepoint without encoding it. Negative values and values above
0x10FFFF yield 0; negative input is rejected the same way
utf8proc_encode_char() rejects it.

---
Review notes (ignored by git am):
 - Negative input used to fall into the `uc < 0x80` branch and report
   1 byte; it now returns 0.
 - The test's success message no longer says "Validity tests", and the
   test also covers -1 and 0x110000.
 - The author e-mail address and the two system header names in
   test/sizeofchar.c were missing from the copy under review;
   <stdio.h> is included because the test calls fprintf/printf.
 - Leading whitespace of context lines was reconstructed and the
   `index` hashes were not regenerated; confirm against the tree
   before applying.

 .gitignore        |  1 +
 Makefile          |  6 +++++-
 test/sizeofchar.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 utf8proc.c        | 19 +++++++++++++++++++
 utf8proc.h        |  9 +++++++++
 5 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 test/sizeofchar.c

diff --git a/.gitignore b/.gitignore
index ffd2eda..482d491 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,4 +27,5 @@ test/valid
 test/iterate
 test/case
 test/custom
+test/sizeofchar
 /tmp/
diff --git a/Makefile b/Makefile
index 0de0697..0957424 100644
--- a/Makefile
+++ b/Makefile
@@ -156,10 +156,13 @@ test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
 
+test/sizeofchar: test/sizeofchar.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) $(LDFLAGS) test/sizeofchar.c test/tests.o utf8proc.o -o $@
+
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
 
-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/sizeofchar test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
 	test/normtest data/NormalizationTest.txt
 	test/graphemetest data/GraphemeBreakTest.txt
@@ -169,3 +172,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
 	test/iterate
 	test/case
 	test/custom
+	test/sizeofchar
diff --git a/test/sizeofchar.c b/test/sizeofchar.c
new file mode 100644
index 0000000..b86b17a
--- /dev/null
+++ b/test/sizeofchar.c
@@ -0,0 +1,50 @@
+#include "tests.h"
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+    int c, error = 0;
+
+    (void) argc; /* unused */
+    (void) argv; /* unused */
+
+    /* values outside [0, 0x10FFFF] cannot be encoded: expect 0 */
+    if (utf8proc_sizeof_char(-1) != 0) {
+        fprintf(stderr, "Failed: sizeof_char(-1) != 0\n");
+        error++;
+    }
+    if (utf8proc_sizeof_char(0x110000) != 0) {
+        fprintf(stderr, "Failed: sizeof_char(110000) != 0\n");
+        error++;
+    }
+
+    /* exhaustive check of every value in [0, 0x10FFFF], by encoded length */
+    for (c = 0; c < 0x80; c++) {
+        if (utf8proc_sizeof_char(c) != 1) {
+            fprintf(stderr, "Failed: sizeof_char(%04x) != 1\n", c);
+            error++;
+        }
+    }
+    for (; c < 0x800; c++) {
+        if (utf8proc_sizeof_char(c) != 2) {
+            fprintf(stderr, "Failed: sizeof_char(%04x) != 2\n", c);
+            error++;
+        }
+    }
+    for (; c < 0x10000; c++) {
+        if (utf8proc_sizeof_char(c) != 3) {
+            fprintf(stderr, "Failed: sizeof_char(%06x) != 3\n", c);
+            error++;
+        }
+    }
+    for (; c < 0x110000; c++) {
+        if (utf8proc_sizeof_char(c) != 4) {
+            fprintf(stderr, "Failed: sizeof_char(%06x) != 4\n", c);
+            error++;
+        }
+    }
+    check(!error, "utf8proc_sizeof_char FAILED %d tests.", error);
+    printf("utf8proc_sizeof_char tests SUCCEEDED.\n");
+
+    return 0;
+}
diff --git a/utf8proc.c b/utf8proc.c
index 3920d50..34e528d 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -174,6 +174,25 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
     return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
 }
 
+/* Like utf8proc_encode_char(), negative input is rejected with 0. */
+UTF8PROC_DLLEXPORT int utf8proc_sizeof_char(utf8proc_int32_t uc)
+{
+  if (uc < 0x00) {
+    return 0; /* uc is signed: without this, negatives would report 1 */
+  } else if (uc < 0x80) {
+    return 1;
+  } else if (uc < 0x800) {
+    return 2;
+  } else if (uc < 0x10000) {
+    /* surrogates 0xD800-0xDFFF land here; they are not filtered out */
+    return 3;
+  } else if (uc < 0x110000) {
+    return 4;
+  } else {
+    return 0; /* above 0x10FFFF */
+  }
+}
+
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
   if (uc < 0x00) {
     return 0;
diff --git a/utf8proc.h b/utf8proc.h
index 490200f..b78f794 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -439,6 +439,15 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str
  */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
 
+/**
+ * Returns the number of bytes needed to encode `codepoint` in UTF-8
+ * (1 to 4), or 0 if `codepoint` is negative or greater than 0x10FFFF.
+ *
+ * No other validity check is performed: surrogate codepoints
+ * (0xD800-0xDFFF) are reported as 3 bytes.
+ */
+UTF8PROC_DLLEXPORT int utf8proc_sizeof_char(utf8proc_int32_t codepoint);
+
 /**
  * Encodes the codepoint as an UTF-8 string in the byte array pointed
  * to by `dst`. This array must be at least 4 bytes long.