From 97082dae5fca26f6e78cc28f6db4e692c8f82fa7 Mon Sep 17 00:00:00 2001 From: Billy Robert O'Neal III Date: Wed, 3 Jan 2024 13:47:33 -0800 Subject: [PATCH] Fix Utf8Decoder operator== handling of the last code point in the input While working on diagnostics for https://github.com/microsoft/vcpkg-tool/pull/1210 I observed that we were printing the caret ^ in the wrong place when it goes after the input. The way this works is we form the line of text to print, then decode the unicode encoding units, and when we hit the target, we stop and print ^: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/parse.cpp#L51-L68 however, if the intended location for the ^ is the "end" of the line, we hit this bug: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L273 The iterator only compares the last_ pointers, but both the "points at the last code point in the input" and "points to the end of the input" state set `next_ == last_`. See: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L222-L226 This means that the points to the end and points one past the end iterator compare equal, so the loop in parse.cpp stops one position too early. Also adds a bunch of testing for this specific case, for other parts of Utf8Decoder, adds a way to parse the first code point without failing, makes all the operators 'hidden friends', and removes localized strings for bugs-in-vcpkg-itself. --- include/vcpkg/base/message-data.inc.h | 5 -- include/vcpkg/base/unicode.h | 45 ++++++++--- locales/messages.json | 2 - src/vcpkg-test/ci-baseline.cpp | 6 +- src/vcpkg-test/manifests.cpp | 2 +- src/vcpkg-test/unicode.cpp | 106 ++++++++++++++++++++++++++ src/vcpkg/base/unicode.cpp | 15 +--- 7 files changed, 150 insertions(+), 31 deletions(-) create mode 100644 src/vcpkg-test/unicode.cpp diff --git a/include/vcpkg/base/message-data.inc.h b/include/vcpkg/base/message-data.inc.h index 3e9672d83c..bd08ee1e25 100644 --- a/include/vcpkg/base/message-data.inc.h +++ b/include/vcpkg/base/message-data.inc.h @@ -866,10 +866,6 @@ DECLARE_MESSAGE(CommandFailed, "{command_line}\n" "failed with the following results:") DECLARE_MESSAGE(CommunityTriplets, (), "", "Community Triplets:") -DECLARE_MESSAGE(ComparingUtf8Decoders, - (), - "", - "Comparing Utf8Decoders with different provenance; this is always an error") DECLARE_MESSAGE(CompressFolderFailed, (msg::path), "", "Failed to compress folder \"{path}\":") DECLARE_MESSAGE(ComputingInstallPlan, (), "", "Computing installation plan...") DECLARE_MESSAGE(ConfigurationErrorRegistriesWithoutBaseline, @@ -1702,7 +1698,6 @@ DECLARE_MESSAGE(IllegalPlatformSpec, (), "", "Platform qualifier is not allowed DECLARE_MESSAGE(ImproperShaLength, (msg::value), "{value} is a sha.", "SHA512's must be 128 hex characters: {value}") DECLARE_MESSAGE(IncorrectArchiveFileSignature, (), "", "Incorrect archive file signature") DECLARE_MESSAGE(IncorrectPESignature, (), "", "Incorrect PE signature") -DECLARE_MESSAGE(IncrementedUtf8Decoder, (), "", "Incremented Utf8Decoder at the end of the string") DECLARE_MESSAGE(InfoSetEnvVar, (msg::env_var), "In this context 'editor' means IDE", diff --git a/include/vcpkg/base/unicode.h b/include/vcpkg/base/unicode.h index 75b163559a..c09b0b3253 100644 --- a/include/vcpkg/base/unicode.h +++ b/include/vcpkg/base/unicode.h @@ -31,7 +31,6 @@ namespace vcpkg::Unicode Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept; - constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast(kind); } int utf8_code_unit_count(char code_unit) noexcept; int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept; @@ -131,6 +130,24 @@ namespace vcpkg::Unicode current_ = end_of_file; } } + constexpr Utf8Decoder(StringView sv, utf8_errc& first_decode_error) + : Utf8Decoder(sv.begin(), sv.end(), first_decode_error) + { + } + constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept + : current_(0), next_(first), last_(last) + { + if (next_ != last_) + { + first_decode_error = next(); + } + else + { + current_ = end_of_file; + first_decode_error = utf8_errc::NoError; + } + } + struct sentinel { }; @@ -161,7 +178,24 @@ namespace vcpkg::Unicode constexpr sentinel end() const { return sentinel(); } - friend bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept; + friend constexpr bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept + { + if (lhs.last_ != rhs.last_) + { + // comparing decoders of different provenance is always an error + Checks::unreachable(VCPKG_LINE_INFO); + } + + return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_; + } + friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept + { + return !(lhs == rhs); + } + friend constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) { return d.is_eof(); } + friend constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d == s; } + friend constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) { return !d.is_eof(); } + friend constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d != s; } using difference_type = std::ptrdiff_t; using value_type = char32_t; @@ -174,11 +208,4 @@ namespace vcpkg::Unicode const char* next_; const char* last_; }; - - inline bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept { return !(lhs == rhs); } - - constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) { return d.is_eof(); } - constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d == s; } - constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) { return !d.is_eof(); } - constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d != s; } } diff --git a/locales/messages.json b/locales/messages.json index 01a12a6aca..9c421a2340 100644 --- a/locales/messages.json +++ b/locales/messages.json @@ -511,7 +511,6 @@ "CommandFailed": "command:\n{command_line}\nfailed with the following results:", "_CommandFailed.comment": "An example of {command_line} is vcpkg install zlib.", "CommunityTriplets": "Community Triplets:", - "ComparingUtf8Decoders": "Comparing Utf8Decoders with different provenance; this is always an error", "CompressFolderFailed": "Failed to compress folder \"{path}\":", "_CompressFolderFailed.comment": "An example of {path} is /foo/bar.", "ComputingInstallPlan": "Computing installation plan...", @@ -972,7 +971,6 @@ "_ImproperShaLength.comment": "{value} is a sha.", "IncorrectArchiveFileSignature": "Incorrect archive file signature", "IncorrectPESignature": "Incorrect PE signature", - "IncrementedUtf8Decoder": "Incremented Utf8Decoder at the end of the string", "InfoSetEnvVar": "You can also set {env_var} to your editor of choice.", "_InfoSetEnvVar.comment": "In this context 'editor' means IDE An example of {env_var} is VCPKG_DEFAULT_TRIPLET.", "InitRegistryFailedNoRepo": "Could not create a registry at {path} because this is not a git repository root.\nUse `git init {command_line}` to create a git repository in this folder.", diff --git a/src/vcpkg-test/ci-baseline.cpp b/src/vcpkg-test/ci-baseline.cpp index e89c883b4a..337e7f95cb 100644 --- a/src/vcpkg-test/ci-baseline.cpp +++ b/src/vcpkg-test/ci-baseline.cpp @@ -258,7 +258,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]") { check_error("hello", R"(test:1:6: error: expected ':' here on expression: hello - ^)"); + ^)"); check_error("hello\n:", R"(test:1:6: error: expected ':' here on expression: hello @@ -271,7 +271,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]") check_error("x64-windows:", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-') on expression: x64-windows: - ^)"); + ^)"); check_error("x64-windows:\nport:x64-windows=skip", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-') @@ -285,7 +285,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]") // clang-format off check_error(" \tx64-windows:", R"(test:1:21: error: expected a triplet name here (must be lowercase, digits, '-') on expression: )" "\t" R"(x64-windows: - )" "\t" R"( ^)"); + )" "\t" R"( ^)"); // clang-format on check_error("port:x64-windows\n=fail", R"(test:1:17: error: expected '=' here diff --git a/src/vcpkg-test/manifests.cpp b/src/vcpkg-test/manifests.cpp index ce1fde2770..ce3a8a58d6 100644 --- a/src/vcpkg-test/manifests.cpp +++ b/src/vcpkg-test/manifests.cpp @@ -1267,7 +1267,7 @@ TEST_CASE ("license error messages", "[manifests][license]") CHECK(messages.error->to_string() == R"(:1:8: error: Expected a license name, found the end of the string. on expression: MIT AND - ^)"); + ^)"); parse_spdx_license_expression("MIT AND unknownlicense", messages); CHECK(!messages.error); diff --git a/src/vcpkg-test/unicode.cpp b/src/vcpkg-test/unicode.cpp new file mode 100644 index 0000000000..75e19a49b1 --- /dev/null +++ b/src/vcpkg-test/unicode.cpp @@ -0,0 +1,106 @@ +#include + +#include + +#include + +using namespace vcpkg::Unicode; + +TEST_CASE ("Utf8Decoder valid", "[unicode]") +{ + const char32_t* expected = U""; + const char* input = ""; + SECTION ("hello") + { + expected = U"hello"; + input = "hello"; + } + + SECTION ("all types of code points") + { + expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8"; + input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88"; + } + + SECTION ("wtf-8 leading") + { + // U+1F3C8 as WTF-8 + static constexpr char32_t storage[] = {0xD83C, 0}; + expected = storage; + input = "\xED\xA0\xBC"; + } + + SECTION ("wtf-8 trailing") + { + // U+1F3C8 as WTF-8 + static constexpr char32_t storage[] = {0xDFC8, 0}; + expected = storage; + input = "\xED\xBF\x88"; + } + + auto input_end = input + strlen(input); + Utf8Decoder decode(input); + // strlen for char32_t: + size_t expected_size = 0; + for (auto* e = expected; *e; ++e) + { + ++expected_size; + } + + auto decode_at_end = std::next(decode, expected_size); + for (size_t idx = 0; idx < expected_size; ++idx) + { + REQUIRE(decode != decode.end()); // compare sentinel + REQUIRE(decode != decode_at_end); // compare iterator + REQUIRE(*decode == expected[idx]); + REQUIRE(!decode.is_eof()); + char32_t decoded; + auto pointer_to_current = decode.pointer_to_current(); + REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError); + REQUIRE(decoded == expected[idx]); + char encoded[4]; + auto encoded_size = utf8_encode_code_point(encoded, decoded); + REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current)); + ++decode; + } + + REQUIRE(decode == decode.end()); + REQUIRE(decode == decode_at_end); +} + +TEST_CASE ("Utf8Decoder first decode empty", "[unicode]") +{ + utf8_errc err; + Utf8Decoder uut("", err); + REQUIRE(err == utf8_errc::NoError); + REQUIRE(uut.is_eof()); + REQUIRE(uut == uut.end()); + REQUIRE(uut == uut); +} + +TEST_CASE ("Utf8Decoder invalid", "[unicode]") +{ + utf8_errc err; + // clang-format off + Utf8Decoder uut(GENERATE( + "hello \xFF too big", + "hello \xC3\xBF\xBF\xBF also too big", + "hello \x9C continuation", + "hello \xE0\x28 overlong", + "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8", + "missing two: \xC3", + "missing three one: \xE6\x9C", + "missing three two: \xE6", + "missing four one: \xF0\x9F\x8F", + "missing four two: \xF0\x9F", + "missing four three: \xF0" + ), err); + // clang-format on + while (err == utf8_errc::NoError) + { + REQUIRE(!uut.is_eof()); + err = uut.next(); + } + + REQUIRE(uut.is_eof()); +} diff --git a/src/vcpkg/base/unicode.cpp b/src/vcpkg/base/unicode.cpp index 5613be895b..4f501b17b5 100644 --- a/src/vcpkg/base/unicode.cpp +++ b/src/vcpkg/base/unicode.cpp @@ -31,6 +31,8 @@ namespace vcpkg::Unicode } } + static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast(kind); } + int utf8_code_unit_count(char code_unit) noexcept { return utf8_code_unit_count(utf8_code_unit_kind(static_cast(code_unit))); @@ -216,7 +218,8 @@ namespace vcpkg::Unicode { if (is_eof()) { - vcpkg::Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgIncrementedUtf8Decoder); + // incremented Utf8Decoder at the end of the string + Checks::unreachable(VCPKG_LINE_INFO); } if (next_ == last_) @@ -262,14 +265,4 @@ namespace vcpkg::Unicode current_ = end_of_file; return *this; } - - bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept - { - if (lhs.last_ != rhs.last_) - { - Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgComparingUtf8Decoders); - } - - return lhs.next_ == rhs.next_; - } }