Skip to content

Commit

Permalink
Fix Utf8Decoder operator== handling of the last code point in the input
Browse files Browse the repository at this point in the history
While working on diagnostics for microsoft#1210 I observed that we were printing the caret ^ in the wrong place when it goes after the input.

The way this works is we form the line of text to print, then decode the unicode encoding units, and when we hit the target, we stop and print ^:

https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/parse.cpp#L51-L68

however, if the intended location for the ^ is the "end" of the line, we hit this bug:

https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L273

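The iterator comparison only compares the `next_` pointers (the `last_` pointers are merely checked to confirm both decoders come from the same input), but both the "points at the last code point in the input" and "points to the end of the input" states set `next_ == last_`. See: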

https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L222-L226

This means that the "points at the last code point" iterator and the "points to the end" iterator compare equal, so the loop in parse.cpp stops one position too early.

Also adds tests for this specific case and for other parts of Utf8Decoder, adds a way to parse the first code point without failing, makes all the operators 'hidden friends', and removes localized strings for bugs-in-vcpkg-itself.
  • Loading branch information
BillyONeal committed Jan 3, 2024
1 parent 5b8f9c4 commit 97082da
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 31 deletions.
5 changes: 0 additions & 5 deletions include/vcpkg/base/message-data.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -866,10 +866,6 @@ DECLARE_MESSAGE(CommandFailed,
"{command_line}\n"
"failed with the following results:")
DECLARE_MESSAGE(CommunityTriplets, (), "", "Community Triplets:")
DECLARE_MESSAGE(ComparingUtf8Decoders,
(),
"",
"Comparing Utf8Decoders with different provenance; this is always an error")
DECLARE_MESSAGE(CompressFolderFailed, (msg::path), "", "Failed to compress folder \"{path}\":")
DECLARE_MESSAGE(ComputingInstallPlan, (), "", "Computing installation plan...")
DECLARE_MESSAGE(ConfigurationErrorRegistriesWithoutBaseline,
Expand Down Expand Up @@ -1702,7 +1698,6 @@ DECLARE_MESSAGE(IllegalPlatformSpec, (), "", "Platform qualifier is not allowed
DECLARE_MESSAGE(ImproperShaLength, (msg::value), "{value} is a sha.", "SHA512's must be 128 hex characters: {value}")
DECLARE_MESSAGE(IncorrectArchiveFileSignature, (), "", "Incorrect archive file signature")
DECLARE_MESSAGE(IncorrectPESignature, (), "", "Incorrect PE signature")
DECLARE_MESSAGE(IncrementedUtf8Decoder, (), "", "Incremented Utf8Decoder at the end of the string")
DECLARE_MESSAGE(InfoSetEnvVar,
(msg::env_var),
"In this context 'editor' means IDE",
Expand Down
45 changes: 36 additions & 9 deletions include/vcpkg/base/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ namespace vcpkg::Unicode

Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;

// Returns the integer value of `kind`; the name indicates callers treat it as
// the code-unit count associated with that kind.
constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept
{
    const auto as_count = static_cast<int>(kind);
    return as_count;
}
int utf8_code_unit_count(char code_unit) noexcept;

int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
Expand Down Expand Up @@ -131,6 +130,24 @@ namespace vcpkg::Unicode
current_ = end_of_file;
}
}
constexpr Utf8Decoder(StringView sv, utf8_errc& first_decode_error)
: Utf8Decoder(sv.begin(), sv.end(), first_decode_error)
{
}
constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
: current_(0), next_(first), last_(last)
{
if (next_ != last_)
{
first_decode_error = next();
}
else
{
current_ = end_of_file;
first_decode_error = utf8_errc::NoError;
}
}

struct sentinel
{
};
Expand Down Expand Up @@ -161,7 +178,24 @@ namespace vcpkg::Unicode

constexpr sentinel end() const { return sentinel(); }

friend bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept;
// Positional equality of two decoders over the same input.
//
// `next_` alone does not identify a position: it equals `last_` both while the
// decoder sits on the final code point and once it has reached the end, so the
// decoded value `current_` is compared as well to tell those states apart.
friend constexpr bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
    const bool same_provenance = (lhs.last_ == rhs.last_);
    if (!same_provenance)
    {
        // comparing decoders of different provenance is always an error
        Checks::unreachable(VCPKG_LINE_INFO);
    }

    const bool same_read_position = (lhs.next_ == rhs.next_);
    const bool same_code_point = (lhs.current_ == rhs.current_);
    return same_read_position && same_code_point;
}
// Decoder-to-decoder inequality: the exact negation of operator==.
friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
    const bool equal = (lhs == rhs);
    return !equal;
}

// Sentinel comparisons, in both argument orders: a decoder compares equal to
// the sentinel exactly when is_eof() reports true.
friend constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel)
{
    return d.is_eof();
}
friend constexpr bool operator==(Utf8Decoder::sentinel, const Utf8Decoder& d)
{
    return d.is_eof();
}
friend constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel)
{
    return !d.is_eof();
}
friend constexpr bool operator!=(Utf8Decoder::sentinel, const Utf8Decoder& d)
{
    return !d.is_eof();
}

using difference_type = std::ptrdiff_t;
using value_type = char32_t;
Expand All @@ -174,11 +208,4 @@ namespace vcpkg::Unicode
const char* next_;
const char* last_;
};

// Decoder-to-decoder inequality: the exact negation of operator==.
inline bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
    const bool equal = (lhs == rhs);
    return !equal;
}

// Sentinel comparisons, in both argument orders: a decoder compares equal to
// the sentinel exactly when is_eof() reports true.
constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel)
{
    return d.is_eof();
}
constexpr bool operator==(Utf8Decoder::sentinel, const Utf8Decoder& d)
{
    return d.is_eof();
}
constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel)
{
    return !d.is_eof();
}
constexpr bool operator!=(Utf8Decoder::sentinel, const Utf8Decoder& d)
{
    return !d.is_eof();
}
}
2 changes: 0 additions & 2 deletions locales/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,6 @@
"CommandFailed": "command:\n{command_line}\nfailed with the following results:",
"_CommandFailed.comment": "An example of {command_line} is vcpkg install zlib.",
"CommunityTriplets": "Community Triplets:",
"ComparingUtf8Decoders": "Comparing Utf8Decoders with different provenance; this is always an error",
"CompressFolderFailed": "Failed to compress folder \"{path}\":",
"_CompressFolderFailed.comment": "An example of {path} is /foo/bar.",
"ComputingInstallPlan": "Computing installation plan...",
Expand Down Expand Up @@ -972,7 +971,6 @@
"_ImproperShaLength.comment": "{value} is a sha.",
"IncorrectArchiveFileSignature": "Incorrect archive file signature",
"IncorrectPESignature": "Incorrect PE signature",
"IncrementedUtf8Decoder": "Incremented Utf8Decoder at the end of the string",
"InfoSetEnvVar": "You can also set {env_var} to your editor of choice.",
"_InfoSetEnvVar.comment": "In this context 'editor' means IDE An example of {env_var} is VCPKG_DEFAULT_TRIPLET.",
"InitRegistryFailedNoRepo": "Could not create a registry at {path} because this is not a git repository root.\nUse `git init {command_line}` to create a git repository in this folder.",
Expand Down
6 changes: 3 additions & 3 deletions src/vcpkg-test/ci-baseline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
{
check_error("hello", R"(test:1:6: error: expected ':' here
on expression: hello
^)");
^)");

check_error("hello\n:", R"(test:1:6: error: expected ':' here
on expression: hello
Expand All @@ -271,7 +271,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")

check_error("x64-windows:", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
on expression: x64-windows:
^)");
^)");

check_error("x64-windows:\nport:x64-windows=skip",
R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
Expand All @@ -285,7 +285,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
// clang-format off
check_error(" \tx64-windows:", R"(test:1:21: error: expected a triplet name here (must be lowercase, digits, '-')
on expression: )" "\t" R"(x64-windows:
)" "\t" R"( ^)");
)" "\t" R"( ^)");
// clang-format on

check_error("port:x64-windows\n=fail", R"(test:1:17: error: expected '=' here
Expand Down
2 changes: 1 addition & 1 deletion src/vcpkg-test/manifests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ TEST_CASE ("license error messages", "[manifests][license]")
CHECK(messages.error->to_string() ==
R"(<license string>:1:8: error: Expected a license name, found the end of the string.
on expression: MIT AND
^)");
^)");

parse_spdx_license_expression("MIT AND unknownlicense", messages);
CHECK(!messages.error);
Expand Down
106 changes: 106 additions & 0 deletions src/vcpkg-test/unicode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#include <vcpkg-test/util.h>

#include <vcpkg/base/unicode.h>

#include <iterator>

using namespace vcpkg::Unicode;

// Walks a Utf8Decoder over well-formed input and checks, code point by code point,
// that iteration, both kinds of comparison, pointer_to_current(), and the standalone
// decode/encode helpers all agree with the expected UTF-32 text.
TEST_CASE ("Utf8Decoder valid", "[unicode]")
{
// Defaults; each SECTION below replaces them with one input/expected pair.
const char32_t* expected = U"";
const char* input = "";
SECTION ("hello")
{
expected = U"hello";
input = "hello";
}

SECTION ("all types of code points")
{
// one code point each of 1, 2, 3, and 4 UTF-8 code units
expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8";
input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88";
}

SECTION ("wtf-8 leading")
{
// the leading (high) surrogate of U+1F3C8, 0xD83C, encoded on its own as WTF-8
static constexpr char32_t storage[] = {0xD83C, 0};
expected = storage;
input = "\xED\xA0\xBC";
}

SECTION ("wtf-8 trailing")
{
// the trailing (low) surrogate of U+1F3C8, 0xDFC8, encoded on its own as WTF-8
static constexpr char32_t storage[] = {0xDFC8, 0};
expected = storage;
input = "\xED\xBF\x88";
}

auto input_end = input + strlen(input);
Utf8Decoder decode(input);
// strlen for char32_t:
size_t expected_size = 0;
for (auto* e = expected; *e; ++e)
{
++expected_size;
}

// A copy of the decoder advanced past every expected code point; used to exercise
// decoder-to-decoder comparison, including on the last code point where `decode`
// must still compare unequal to the end position.
auto decode_at_end = std::next(decode, expected_size);
for (size_t idx = 0; idx < expected_size; ++idx)
{
REQUIRE(decode != decode.end()); // compare sentinel
REQUIRE(decode != decode_at_end); // compare iterator
REQUIRE(*decode == expected[idx]);
REQUIRE(!decode.is_eof());
// Re-decode the same code point through the free function, starting from the
// decoder's reported position, and expect the same value.
char32_t decoded;
auto pointer_to_current = decode.pointer_to_current();
REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
REQUIRE(decoded == expected[idx]);
// Round trip: encoding the decoded value must reproduce the input's code units.
char encoded[4];
auto encoded_size = utf8_encode_code_point(encoded, decoded);
REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current));
++decode;
}

// After consuming everything, both the sentinel and the advanced copy compare equal.
REQUIRE(decode == decode.end());
REQUIRE(decode == decode_at_end);
}

// The error-reporting constructor on empty input must report NoError and yield a
// decoder that is already at the end.
TEST_CASE ("Utf8Decoder first decode empty", "[unicode]")
{
// left uninitialized on purpose: the constructor is expected to assign it
utf8_errc err;
Utf8Decoder uut("", err);
REQUIRE(err == utf8_errc::NoError);
REQUIRE(uut.is_eof());
REQUIRE(uut == uut.end());
// self-comparison exercises decoder-to-decoder equality at end of input
REQUIRE(uut == uut);
}

// For each malformed input, decoding must hit an error before reaching the end of
// the string, and the decoder must report end-of-file once that error has occurred.
//
// NOTE(review): the labels inside two inputs look inaccurate, though both inputs are
// still malformed: in "\xE0\x28" the byte 0x28 is not a continuation byte (so this is
// probably rejected as a bad continuation rather than as an overlong encoding), and
// in "\xC3\xBF\xBF\xBF" the first two bytes form a complete two-unit sequence
// followed by stray continuation bytes. Confirm which error each is meant to cover.
TEST_CASE ("Utf8Decoder invalid", "[unicode]")
{
// assigned by the constructor with the result of decoding the first code point
utf8_errc err;
// clang-format off
Utf8Decoder uut(GENERATE(
"hello \xFF too big",
"hello \xC3\xBF\xBF\xBF also too big",
"hello \x9C continuation",
"hello \xE0\x28 overlong",
"hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
"missing two: \xC3",
"missing three one: \xE6\x9C",
"missing three two: \xE6",
"missing four one: \xF0\x9F\x8F",
"missing four two: \xF0\x9F",
"missing four three: \xF0"
), err);
// clang-format on
// Keep decoding while no error has been seen. If an input decoded cleanly to its
// end, the is_eof() check inside the loop would fail the test.
while (err == utf8_errc::NoError)
{
REQUIRE(!uut.is_eof());
err = uut.next();
}

// once an error has been reported, the decoder must be at end-of-file
REQUIRE(uut.is_eof());
}
15 changes: 4 additions & 11 deletions src/vcpkg/base/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ namespace vcpkg::Unicode
}
}

// Returns the integer value of `kind`; the char overload in this file returns this
// value as the code-unit count for a given code unit.
static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept
{
    const auto as_count = static_cast<int>(kind);
    return as_count;
}

int utf8_code_unit_count(char code_unit) noexcept
{
return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
Expand Down Expand Up @@ -216,7 +218,8 @@ namespace vcpkg::Unicode
{
if (is_eof())
{
vcpkg::Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgIncrementedUtf8Decoder);
// incremented Utf8Decoder at the end of the string
Checks::unreachable(VCPKG_LINE_INFO);
}

if (next_ == last_)
Expand Down Expand Up @@ -262,14 +265,4 @@ namespace vcpkg::Unicode
current_ = end_of_file;
return *this;
}

// Positional equality of two decoders over the same input.
//
// Decoders whose `last_` pointers differ come from different inputs; comparing
// them is a caller bug and is reported through Checks::msg_exit_with_message.
//
// `next_` alone does not identify a position: it equals `last_` both while the
// decoder sits on the final code point and once it has reached the end of the
// input. Comparing only `next_` therefore made "at the last code point" equal to
// "at the end", so loops that advance until reaching a target decoder stopped one
// position early. `current_` (end_of_file once the input is exhausted) is
// compared as well to tell those two states apart.
bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
    if (lhs.last_ != rhs.last_)
    {
        Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgComparingUtf8Decoders);
    }

    return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_;
}
}

0 comments on commit 97082da

Please sign in to comment.