diff --git a/include/ctre/evaluation.hpp b/include/ctre/evaluation.hpp index 6701c6fa..06295f64 100644 --- a/include/ctre/evaluation.hpp +++ b/include/ctre/evaluation.hpp @@ -115,11 +115,55 @@ template constexpr CTR return false; } -template constexpr CTRE_FORCE_INLINE string_match_result evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence) noexcept { - - bool same = (compare_character(String, current, end) && ... && true); +#if __cpp_char8_t >= 201811 +template constexpr CTRE_FORCE_INLINE string_match_result evaluate_match_utf8_string(Iterator current, [[maybe_unused]] const EndIterator end, char8_t (&buffer)[N], std::index_sequence) noexcept { + //relies on inside knowledge of how utf8_iterator works: reads its ptr and end members directly + if constexpr (!std::is_same_v<::std::remove_const_t, utf8_iterator::sentinel>) { + size_t count = end.ptr - current.ptr; //size_t count = std::distance(current.ptr, end.ptr); + size_t bump = ((count < N) ? count : N); + //using ^ operator (zero only when both bytes are equal, so the OR-fold is zero only when every byte matches) vs != because gcc complains about parens + return { Iterator{current.ptr + bump, current.end}, (count >= N) && !(bool)(((current.ptr[Idx] ^ buffer[Idx])) | ... | char8_t{0}) }; + } else { + size_t count = current.end - current.ptr; //size_t count = std::distance(current.ptr, current.end); + size_t bump = ((count < N) ? count : N); + return { Iterator{current.ptr + bump, current.end}, (count >= N) && !(bool)(((current.ptr[Idx] ^ buffer[Idx])) | ... | char8_t{0}) }; + } +} +#endif - return {current, same}; +template constexpr CTRE_FORCE_INLINE string_match_result evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence) noexcept { +#if __cpp_char8_t >= 201811 + if constexpr (sizeof...(String) && std::is_same_v<::std::remove_const_t, utf8_iterator> && (std::is_same_v, std::remove_const_t> || std::is_same_v<::std::remove_const_t, utf8_iterator::sentinel>)) { + constexpr size_t str_length = (utf8_codepoint_length(String) + ... 
+ 0ULL); + //encode our String... into its utf8 representation + char8_t utf8_sequence[str_length]; + char8_t* ptr = utf8_sequence; + ((ptr = utf32_codepoint_to_utf8_codepoint(String, ptr)), ...); + //run the comparison + return evaluate_match_utf8_string(current, end, utf8_sequence, std::make_index_sequence()); + } else if constexpr (sizeof...(String) && is_random_accessible(typename std::iterator_traits::iterator_category{}) && std::is_same_v, std::remove_const_t>) { + using char_type = ::std::remove_reference_t<::std::remove_cv_t>; + //count the elements remaining between current and end + size_t count = end - current; + //make sure we only "bump" the iterator a safe distance + size_t bump = ((count < sizeof...(String)) ? count : sizeof...(String)); + //do math against how many characters we match, avoid as many branches as possible; NOTE(review): the cast of String to char_type presumably truncates code points that do not fit in char_type, confirm this cannot produce false matches (e.g. U+0141 vs 'A' on char input) + return { current + bump, (count >= sizeof...(String)) && !(bool)(((current[Idx] ^ static_cast(String))) | ... | char_type{0}) }; + } else { + bool same = (compare_character(String, current, end) && ... && true); + return { current, same }; + } +#else + if constexpr (sizeof...(String) && is_random_accessible(typename std::iterator_traits::iterator_category{}) && std::is_same_v, std::remove_const_t>) { + using char_type = ::std::remove_reference_t<::std::remove_cv_t>; + size_t count = end - current; + size_t bump = ((count < sizeof...(String)) ? count : sizeof...(String)); + return { current + bump, (count >= sizeof...(String)) && !(bool)(((current[Idx] ^ static_cast(String))) | ... | char_type{0}) }; + } else { + bool same = (compare_character(String, current, end) && ... 
&& true); + return { current, same }; + } +#endif } template diff --git a/include/ctre/utf8.hpp b/include/ctre/utf8.hpp index d519d254..3cb02b04 100644 --- a/include/ctre/utf8.hpp +++ b/include/ctre/utf8.hpp @@ -8,6 +8,44 @@ #include namespace ctre { +constexpr char8_t* utf32_codepoint_to_utf8_codepoint(uint32_t code, char8_t *ptr) { + if (code < 0x80) { + ptr[0] = code; + return ptr + 1; + } else if (code < 0x800) { // 00000yyy yyxxxxxx + ptr[0] = (0b11000000 | (code >> 6)); + ptr[1] = (0b10000000 | (code & 0x3f)); + return ptr + 2; + } else if (code < 0x10000) { // zzzzyyyy yyxxxxxx + ptr[0] = (0b11100000 | (code >> 12)); // 1110zzzz + ptr[1] = (0b10000000 | ((code >> 6) & 0x3f)); // 10yyyyyy + ptr[2] = (0b10000000 | (code & 0x3f)); // 10xxxxxx + return ptr + 3; + } else if (code < 0x200000) { // 000uuuuu zzzzyyyy yyxxxxxx + ptr[0] = (0b11110000 | (code >> 18)); // 11110uuu + ptr[1] = (0b10000000 | ((code >> 12) & 0x3f)); // 10uuzzzz + ptr[2] = (0b10000000 | ((code >> 6) & 0x3f)); // 10yyyyyy + ptr[3] = (0b10000000 | (code & 0x3f)); // 10xxxxxx + return ptr + 4; + } else { + ptr[0] = 0xff; //invalid start byte + return ptr + 1; + } +} + +constexpr uint32_t utf8_codepoint_length(uint32_t code) { + if (code < 0x80) { + return 1; + } else if (code < 0x800) { // 00000yyy yyxxxxxx + return 2; + } else if (code < 0x10000) { // zzzzyyyy yyxxxxxx + return 3; + } else if (code < 0x200000) { // 000uuuuu zzzzyyyy yyxxxxxx + return 4; + } else { + return 1; //matches the single 0xff byte utf32_codepoint_to_utf8_codepoint writes for out-of-range code points + } +} struct utf8_iterator { using self_type = utf8_iterator;