diff --git a/CMakeLists.txt b/CMakeLists.txt index 84e554b6..77331f6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,6 +133,6 @@ if (BUILD_TESTS) # target runs tests without building them. add_custom_target(run_tests COMMAND ${CMAKE_CTEST_COMMAND} DEPENDS libjsonnet_test libjsonnet_test_file libjsonnet_test_snippet - jsonnet parser_test lexer_test libjsonnet++_test libjsonnet_test_locale + jsonnet unicode_test parser_test lexer_test libjsonnet++_test libjsonnet_test_locale ) endif() diff --git a/core/BUILD b/core/BUILD index 62b4459f..c84744ad 100644 --- a/core/BUILD +++ b/core/BUILD @@ -69,3 +69,12 @@ cc_test( "@com_google_googletest//:gtest_main", ], ) + +cc_test( + name = "unicode_test", + srcs = ["unicode_test.cpp"], + deps = [ + ":libjsonnet", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 048addb2..8ab37775 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -87,6 +87,9 @@ function(add_test_executable test_name) endfunction() if (BUILD_TESTS) + add_test_executable(unicode_test) + add_test(unicode_test ${GLOBAL_OUTPUT_PATH}/unicode_test) + add_test_executable(lexer_test) add_test(lexer_test ${GLOBAL_OUTPUT_PATH}/lexer_test) diff --git a/core/unicode.h b/core/unicode.h index 56484cc2..b28fcd4b 100644 --- a/core/unicode.h +++ b/core/unicode.h @@ -114,7 +114,7 @@ static inline char32_t decode_utf8(const std::string &str, size_t &i) if ((c3 & 0xC0) != 0x80) { return JSONNET_CODEPOINT_ERROR; } - return ((c0 & 0x7) << 24ul) | ((c1 & 0x3F) << 12ul) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + return ((c0 & 0x7) << 18ul) | ((c1 & 0x3F) << 12ul) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); } else { return JSONNET_CODEPOINT_ERROR; } diff --git a/core/unicode_test.cpp b/core/unicode_test.cpp new file mode 100644 index 00000000..c34365ee --- /dev/null +++ b/core/unicode_test.cpp @@ -0,0 +1,114 @@ +/* +Copyright 2025 Google Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include +#include "unicode.h" +#include "gtest/gtest.h" + +namespace jsonnet::internal { +namespace { + +void testEncodeDecode(char32_t codepoint, const std::string &expect_utf8) { + std::string buffer; + size_t len = encode_utf8(codepoint, buffer); + EXPECT_EQ(len, expect_utf8.size()); + EXPECT_EQ(buffer, expect_utf8); + + size_t at = 0; + char32_t decoded = decode_utf8(expect_utf8, at); + EXPECT_EQ(decoded, codepoint); + EXPECT_EQ(at, expect_utf8.size() - 1); +} + +TEST(Unicode, TestUTF8) +{ + // ASCII encodes as itself. + testEncodeDecode(0x00, std::string("\x00", 1)); + testEncodeDecode(0x41, "A"); + testEncodeDecode(0x7f, "\x7f"); + + testEncodeDecode(0x80, "\xc2\x80"); + testEncodeDecode(0x100, "\xc4\x80"); + testEncodeDecode(0x7ff, "\xdf\xbf"); + + testEncodeDecode(0x800, "\xe0\xa0\x80"); + testEncodeDecode(0x1482, "\xe1\x92\x82"); + testEncodeDecode(0xffff, "\xef\xbf\xbf"); + + testEncodeDecode(0x010000, "\xf0\x90\x80\x80"); + testEncodeDecode(0x01f600, "\xf0\x9f\x98\x80"); // U+1F600 "Grinning Face" + testEncodeDecode(0x0f057e, "\xf3\xb0\x95\xbe"); // U+F057E Private use area character + testEncodeDecode(0x10ffff, "\xf4\x8f\xbf\xbf"); +} + +TEST(Unicode, TestUTF8RejectBad) +{ + const auto test_cases = std::array{ + "\x80", // Continuation byte without leading byte + "\xa0", // Continuation byte without leading byte + "\xbf", // Continuation byte without leading byte + "\xc0", // Leading byte for 2-byte sequence (missing tail) + "\xe0", // Leading byte for 3-byte sequence (missing tail) + "\xf0", // Leading byte for 4-byte sequence (missing tail) + "\xf8\x83\x83\x83", // Invalid leading byte + "\xe0\x80", // Leading byte for 3-byte sequence (missing tail) + "\xf0\x80", // Leading byte for 4-byte sequence (missing tail) + "\xf0\x80\x80", // Leading byte for 4-byte sequence (missing tail) + "\xc0\xcf", // Leading byte for 2-byte sequence (incorrect tail) + "\xe0\xcf", // Leading byte for 3-byte sequence (incorrect tail) + "\xf0\xcf", // Leading byte for 4-byte sequence (incorrect tail) + "\xe0\xcf\x80", // Leading byte for 3-byte sequence (incorrect tail) + "\xf0\xcf\x80", // Leading byte for 4-byte sequence (incorrect tail) + "\xe0\x80\xcf", // Leading byte for 3-byte sequence (incorrect tail) + "\xf0\x80\xcf", // Leading byte for 4-byte sequence (incorrect tail) + "\xf0\x80\x80\xcf", // Leading byte for 4-byte sequence (incorrect tail) + "\xf0\x80\xcf\x80", // Leading byte for 4-byte sequence (incorrect tail) + "\xf0\xcf\x80\x80", // Leading byte for 4-byte sequence (incorrect tail) + }; + for (size_t i = 0; i < test_cases.size(); ++i) { + const auto str = test_cases[i]; + size_t at = 0; + char32_t c = decode_utf8(str, at); + + EXPECT_EQ(c, JSONNET_CODEPOINT_ERROR) << "expect decode to reject. case " << i << std::endl; + } +} + +TEST(Unicode, TestUTF8RoundTripExhaustive) +{ + // Encode every Unicode code-point as UTF-8 and verify that + // it decodes to the same value. + std::string buffer; + for (int x = 0; x < JSONNET_CODEPOINT_MAX; ++x) { + if (x == JSONNET_CODEPOINT_ERROR) { + continue; + } + buffer.clear(); + encode_utf8(x, buffer); + + size_t at = 0; + char32_t y = decode_utf8(buffer, at); + EXPECT_NE(y, JSONNET_CODEPOINT_ERROR) << "UTF-8 roundtrip failed for codepoint " << x << " decode rejects" << std::endl; + EXPECT_EQ(x, y) << "UTF-8 roundtrip failed for codepoint " << x << " converts to " << y << std::endl; + EXPECT_EQ(at, buffer.size() - 1) << "UTF-8 roundtrip failed for codepoint " << x << " decodes incorrect length" << std::endl; + } +} + +} // namespace +} // namespace jsonnet::internal