forked from microsoft/vcpkg-tool
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix Utf8Decoder operator== handling of the last code point in the input
While working on diagnostics for microsoft#1210, I observed that we were printing the caret ^ in the wrong place when it goes after the input. The way this works is that we form the line of text to print, then decode the Unicode encoding units, and when we hit the target, we stop and print ^: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/parse.cpp#L51-L68 However, if the intended location for the ^ is the "end" of the line, we hit this bug: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L273 The iterator only compares the last_ pointers, but both the "points at the last code point in the input" and the "points to the end of the input" states set `next_ == last_`. See: https://github.com/microsoft/vcpkg-tool/blob/5b8f9c40dd9d6a07ec636590e78c2f49d28624b9/src/vcpkg/base/unicode.cpp#L222-L226 This means that the "points to the end" and "points one past the end" iterators compare equal, so the loop in parse.cpp stops one position too early. This change also adds a bunch of tests for this specific case and for other parts of Utf8Decoder, adds a way to parse the first code point without failing, makes all the operators 'hidden friends', and removes localized strings for bugs-in-vcpkg-itself.
- Loading branch information
1 parent
5b8f9c4
commit 97082da
Showing
7 changed files
with
150 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#include <vcpkg-test/util.h> | ||
|
||
#include <vcpkg/base/unicode.h> | ||
|
||
#include <iterator> | ||
|
||
using namespace vcpkg::Unicode; | ||
|
||
// Decodes several well-formed inputs with Utf8Decoder and verifies every | ||
// decoded code point, as well as iterator/sentinel equality at each step. | ||
// Each SECTION selects one (input, expected) pair; the checks after the | ||
// sections are shared and run against whichever pair the section selected. | ||
TEST_CASE ("Utf8Decoder valid", "[unicode]") | ||
{ | ||
const char32_t* expected = U""; | ||
const char* input = ""; | ||
SECTION ("hello") | ||
{ | ||
expected = U"hello"; | ||
input = "hello"; | ||
} | ||
|
||
// Covers 1-, 2-, 3-, and 4-byte UTF-8 sequences in a single input. | ||
SECTION ("all types of code points") | ||
{ | ||
expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8"; | ||
input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88"; | ||
} | ||
|
||
SECTION ("wtf-8 leading") | ||
{ | ||
// Leading surrogate (0xD83C) of U+1F3C8, encoded on its own as WTF-8 | ||
static constexpr char32_t storage[] = {0xD83C, 0}; | ||
expected = storage; | ||
input = "\xED\xA0\xBC"; | ||
} | ||
|
||
SECTION ("wtf-8 trailing") | ||
{ | ||
// Trailing surrogate (0xDFC8) of U+1F3C8, encoded on its own as WTF-8 | ||
static constexpr char32_t storage[] = {0xDFC8, 0}; | ||
expected = storage; | ||
input = "\xED\xBF\x88"; | ||
} | ||
|
||
auto input_end = input + strlen(input); | ||
Utf8Decoder decode(input); | ||
// strlen for char32_t: | ||
size_t expected_size = 0; | ||
for (auto* e = expected; *e; ++e) | ||
{ | ||
++expected_size; | ||
} | ||
|
||
// A copy of the decoder advanced by one step per expected code point; | ||
// `decode` must not compare equal to it until the loop below has finished. | ||
auto decode_at_end = std::next(decode, expected_size); | ||
for (size_t idx = 0; idx < expected_size; ++idx) | ||
{ | ||
REQUIRE(decode != decode.end()); // compare sentinel | ||
REQUIRE(decode != decode_at_end); // compare iterator | ||
REQUIRE(*decode == expected[idx]); | ||
REQUIRE(!decode.is_eof()); | ||
// Cross-check: decoding directly from the decoder's current position | ||
// must succeed and yield the same code point. | ||
char32_t decoded; | ||
auto pointer_to_current = decode.pointer_to_current(); | ||
REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError); | ||
REQUIRE(decoded == expected[idx]); | ||
// Round trip: re-encoding the code point must reproduce the input bytes | ||
// found at the current position. | ||
char encoded[4]; | ||
auto encoded_size = utf8_encode_code_point(encoded, decoded); | ||
REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current)); | ||
++decode; | ||
} | ||
|
||
// After consuming every expected code point the decoder must equal both | ||
// the end sentinel and the pre-advanced iterator. | ||
REQUIRE(decode == decode.end()); | ||
REQUIRE(decode == decode_at_end); | ||
} | ||
|
||
// Constructs a decoder over an empty string using the constructor that | ||
// reports its status through `err`, and checks that no error is reported | ||
// and that the decoder starts out at the end of the input. | ||
TEST_CASE ("Utf8Decoder first decode empty", "[unicode]") | ||
{ | ||
utf8_errc err; | ||
Utf8Decoder uut("", err); | ||
REQUIRE(err == utf8_errc::NoError); | ||
REQUIRE(uut.is_eof()); | ||
// Equal to the end sentinel, and equal to itself. | ||
REQUIRE(uut == uut.end()); | ||
REQUIRE(uut == uut); | ||
} | ||
|
||
// Runs once per generated input string. Each string is labelled with the | ||
// kind of malformed or truncated sequence it is meant to contain. The decoder | ||
// is advanced until it reports an error, after which it is expected to be | ||
// at EOF. | ||
TEST_CASE ("Utf8Decoder invalid", "[unicode]") | ||
{ | ||
utf8_errc err; | ||
// clang-format off | ||
Utf8Decoder uut(GENERATE( | ||
"hello \xFF too big", | ||
"hello \xC3\xBF\xBF\xBF also too big", | ||
"hello \x9C continuation", | ||
"hello \xE0\x28 overlong", | ||
"hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8", | ||
"missing two: \xC3", | ||
"missing three one: \xE6\x9C", | ||
"missing three two: \xE6", | ||
"missing four one: \xF0\x9F\x8F", | ||
"missing four two: \xF0\x9F", | ||
"missing four three: \xF0" | ||
), err); | ||
// clang-format on | ||
// Keep advancing while decoding succeeds; the decoder must not report EOF | ||
// before an error has been returned. | ||
while (err == utf8_errc::NoError) | ||
{ | ||
REQUIRE(!uut.is_eof()); | ||
err = uut.next(); | ||
} | ||
|
||
// Once an error has been reported, the decoder is expected to be at EOF. | ||
REQUIRE(uut.is_eof()); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters