From 454c8dbe384dcda17f17656ece2f6467b91d0e37 Mon Sep 17 00:00:00 2001 From: Joshua Marantz Date: Wed, 4 Sep 2024 15:25:31 -0400 Subject: [PATCH] generate hex escapes when utf8 is not valid, and check that those are correct in tests. Signed-off-by: Joshua Marantz --- source/common/json/json_sanitizer.cc | 9 +++- test/common/json/BUILD | 1 + test/common/json/json_sanitizer_test.cc | 45 +++++++++++++------- test/common/json/json_sanitizer_test_util.cc | 28 +++++++++++- test/common/json/json_sanitizer_test_util.h | 21 +++++++++ 5 files changed, 84 insertions(+), 20 deletions(-) diff --git a/source/common/json/json_sanitizer.cc b/source/common/json/json_sanitizer.cc index ab814c70e815..4914b1fb8a5b 100644 --- a/source/common/json/json_sanitizer.cc +++ b/source/common/json/json_sanitizer.cc @@ -5,6 +5,7 @@ #include "source/common/json/json_internal.h" #include "absl/strings/str_format.h" +#include "utf8_validity.h" namespace Envoy { namespace Json { @@ -77,16 +78,20 @@ absl::string_view sanitize(std::string& buffer, absl::string_view str) { } END_TRY catch (std::exception&) { - // If Nlohmann throws an error, emit an octal escape for any character + // If Nlohmann throws an error, emit an hex escape for any character // requiring it. This can occur for invalid utf-8 sequences, and we don't // want to crash the server if such a sequence makes its way into a string // we need to serialize. For example, if admin endpoint /stats?format=json // is called, and a stat name was synthesized from dynamic content such as a // gRPC method. + // + // Note that JSON string escapes are always 4 digit hex. 3 digit octal would + // be more compact, and is legal JavaScript, but not legal JSON. See + // https://www.json.org/json-en.html for details. buffer.clear(); for (char c : str) { if (needs_slow_sanitizer[static_cast(c)]) { - buffer.append(absl::StrFormat("\\%03o", c)); + buffer.append(absl::StrFormat("\\u%04x", c)); } else { buffer.append(1, c); } diff --git a/test/common/json/BUILD b/test/common/json/BUILD index 042d825da6c8..eb5f4569691c 100644 --- a/test/common/json/BUILD +++ b/test/common/json/BUILD @@ -50,6 +50,7 @@ envoy_cc_test( "//source/common/json:json_internal_lib", "//source/common/json:json_sanitizer_lib", "//source/common/protobuf:utility_lib", + "@utf8_range//:utf8_validity", ], ) diff --git a/test/common/json/json_sanitizer_test.cc b/test/common/json/json_sanitizer_test.cc index 23222b72930a..ea61b3bd6567 100644 --- a/test/common/json/json_sanitizer_test.cc +++ b/test/common/json/json_sanitizer_test.cc @@ -1,6 +1,7 @@ #include #include "source/common/json/json_internal.h" +#include "source/common/json/json_loader.h" #include "source/common/json/json_sanitizer.h" #include "source/common/protobuf/utility.h" @@ -10,6 +11,7 @@ #include "absl/strings/str_format.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "utf8_validity.h" namespace Envoy { namespace Json { @@ -140,6 +142,7 @@ TEST_F(JsonSanitizerTest, AllTwoByteUtf8) { buf[0] = byte1 | Utf8::Pattern2Byte; for (uint32_t byte2 = 0; byte2 < 64; ++byte2) { buf[1] = byte2 | Utf8::ContinuePattern; + EXPECT_EQ(utf8_range::IsStructurallyValid(utf8), TestUtil::isProtoSerializableUtf8(utf8)); auto [unicode, consumed] = Envoy::Json::Utf8::decode(reinterpret_cast(buf), 2); ASSERT_EQ(2, consumed); @@ -156,6 +159,7 @@ TEST_F(JsonSanitizerTest, AllThreeByteUtf8) { utf8[1] = byte2 | Utf8::ContinuePattern; for (uint32_t byte3 = 0; byte3 < 64; ++byte3) { utf8[2] = byte3 | Utf8::ContinuePattern; + EXPECT_EQ(utf8_range::IsStructurallyValid(utf8), TestUtil::isProtoSerializableUtf8(utf8)); auto [unicode, num_consumed] = Utf8::decode(utf8); if (unicode >= 0x800) { // 3-byte unicode values start at 0x800. absl::string_view sanitized = sanitize(utf8); @@ -173,7 +177,7 @@ TEST_F(JsonSanitizerTest, AllThreeByteUtf8) { } TEST_F(JsonSanitizerTest, AllFourByteUtf8) { - std::string utf8("abcd"); + std::string utf8("abcd"), errmsg; // This test takes 46 seconds without optimization and 46 seconds without, // so we'll just stride all loop by 2 in non-optimized mode to cover the @@ -192,6 +196,7 @@ TEST_F(JsonSanitizerTest, AllFourByteUtf8) { utf8[2] = byte3 | Utf8::ContinuePattern; for (uint32_t byte4 = 0; byte4 < 64; byte4 += inc) { utf8[3] = byte4 | Utf8::ContinuePattern; + EXPECT_EQ(utf8_range::IsStructurallyValid(utf8), TestUtil::isProtoSerializableUtf8(utf8)); absl::string_view sanitized = sanitize(utf8); if (TestUtil::isProtoSerializableUtf8(utf8)) { auto [unicode, consumed] = @@ -200,6 +205,10 @@ TEST_F(JsonSanitizerTest, AllFourByteUtf8) { EXPECT_UTF8_EQ( protoSanitize(utf8), sanitized, absl::StrFormat("0x%x(%d,%d,%d,%d)", unicode, byte1, byte2, byte3, byte4)); + } else { + bool equiv = TestUtil::jsonEquivalentStrings(sanitized, utf8, errmsg); + EXPECT_TRUE(equiv) << absl::StrFormat("%s: non-utf8(%d,%d,%d,%d)", errmsg, byte1, byte2, + byte3, byte4); } } } @@ -256,32 +265,36 @@ TEST_F(JsonSanitizerTest, High8Bit) { // exception, which Json::sanitizer catches and just escapes the characters so // we don't lose information in the encoding. All bytes with the high-bit set // are invalid utf-8 in isolation, so we fall through to escaping these. - EXPECT_EQ("\\200\\201\\202\\203\\204\\205\\206\\207\\210\\211\\212\\213\\214\\215\\216\\217" - "\\220\\221\\222\\223\\224\\225\\226\\227\\230\\231\\232\\233\\234\\235\\236\\237" - "\\240\\241\\242\\243\\244\\245\\246\\247\\250\\251\\252\\253\\254\\255\\256\\257" - "\\260\\261\\262\\263\\264\\265\\266\\267\\270\\271\\272\\273\\274\\275\\276\\277" - "\\300\\301\\302\\303\\304\\305\\306\\307\\310\\311\\312\\313\\314\\315\\316\\317" - "\\320\\321\\322\\323\\324\\325\\326\\327\\330\\331\\332\\333\\334\\335\\336\\337" - "\\340\\341\\342\\343\\344\\345\\346\\347\\350\\351\\352\\353\\354\\355\\356\\357" - "\\360\\361\\362\\363\\364\\365\\366\\367\\370\\371\\372\\373\\374\\375\\376\\377", + EXPECT_EQ("\\u0080\\u0081\\u0082\\u0083\\u0084\\u0085\\u0086\\u0087\\u0088\\u0089\\u008a" + "\\u008b\\u008c\\u008d\\u008e\\u008f\\u0090\\u0091\\u0092\\u0093\\u0094\\u0095" + "\\u0096\\u0097\\u0098\\u0099\\u009a\\u009b\\u009c\\u009d\\u009e\\u009f\\u00a0" + "\\u00a1\\u00a2\\u00a3\\u00a4\\u00a5\\u00a6\\u00a7\\u00a8\\u00a9\\u00aa\\u00ab" + "\\u00ac\\u00ad\\u00ae\\u00af\\u00b0\\u00b1\\u00b2\\u00b3\\u00b4\\u00b5\\u00b6" + "\\u00b7\\u00b8\\u00b9\\u00ba\\u00bb\\u00bc\\u00bd\\u00be\\u00bf\\u00c0\\u00c1" + "\\u00c2\\u00c3\\u00c4\\u00c5\\u00c6\\u00c7\\u00c8\\u00c9\\u00ca\\u00cb\\u00cc" + "\\u00cd\\u00ce\\u00cf\\u00d0\\u00d1\\u00d2\\u00d3\\u00d4\\u00d5\\u00d6\\u00d7" + "\\u00d8\\u00d9\\u00da\\u00db\\u00dc\\u00dd\\u00de\\u00df\\u00e0\\u00e1\\u00e2" + "\\u00e3\\u00e4\\u00e5\\u00e6\\u00e7\\u00e8\\u00e9\\u00ea\\u00eb\\u00ec\\u00ed" + "\\u00ee\\u00ef\\u00f0\\u00f1\\u00f2\\u00f3\\u00f4\\u00f5\\u00f6\\u00f7\\u00f8" + "\\u00f9\\u00fa\\u00fb\\u00fc\\u00fd\\u00fe\\u00ff", sanitize(x80_ff)); } TEST_F(JsonSanitizerTest, InvalidUtf8) { // 2 byte - EXPECT_EQ("\\316", sanitizeInvalid(truncate(LambdaUtf8))); - EXPECT_EQ("\\316\\373", sanitizeInvalid(corruptByte2(LambdaUtf8))); + EXPECT_EQ("\\u00ce", sanitizeInvalid(truncate(LambdaUtf8))); + EXPECT_EQ("\\u00ce\\u00fb", sanitizeInvalid(corruptByte2(LambdaUtf8))); // 3 byte - EXPECT_EQ("\\341\\275", sanitizeInvalid(truncate(OmicronUtf8))); - EXPECT_EQ("\\341\\375\\271", sanitizeInvalid(corruptByte2(OmicronUtf8))); + EXPECT_EQ("\\u00e1\\u00bd", sanitizeInvalid(truncate(OmicronUtf8))); + EXPECT_EQ("\\u00e1\\u00fd\\u00b9", sanitizeInvalid(corruptByte2(OmicronUtf8))); // 4 byte - EXPECT_EQ("\\360\\235\\204", sanitizeInvalid(truncate(TrebleClefUtf8))); - EXPECT_EQ("\\360\\375\\204\\236", sanitizeInvalid(corruptByte2(TrebleClefUtf8))); + EXPECT_EQ("\\u00f0\\u009d\\u0084", sanitizeInvalid(truncate(TrebleClefUtf8))); + EXPECT_EQ("\\u00f0\\u00fd\\u0084\\u009e", sanitizeInvalid(corruptByte2(TrebleClefUtf8))); // Invalid input embedded in normal text. - EXPECT_EQ("Hello, \\360\\235\\204, World!", + EXPECT_EQ("Hello, \\u00f0\\u009d\\u0084, World!", sanitize(absl::StrCat("Hello, ", truncate(TrebleClefUtf8), ", World!"))); // Replicate a few other cases that were discovered during initial fuzzing, diff --git a/test/common/json/json_sanitizer_test_util.cc b/test/common/json/json_sanitizer_test_util.cc index e19c154160c8..69bbb6ab9345 100644 --- a/test/common/json/json_sanitizer_test_util.cc +++ b/test/common/json/json_sanitizer_test_util.cc @@ -111,9 +111,11 @@ bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_vi uint32_t escaped_unicode; if (parseUnicode(escaped, escaped_unicode)) { // If one side of the comparison is a Unicode escape, - auto [unicode, consumed] = Utf8::decode(utf8); + // for (uint32_t size = utf8.size(); size > 0; --size) { + const uint32_t size = utf8.size(); + auto [unicode, consumed] = Utf8::decode(utf8.substr(0, size)); if (consumed != 0 && unicode == escaped_unicode) { - utf8 = utf8.substr(consumed, utf8.size() - consumed); + utf8 = utf8.substr(consumed, size - consumed); escaped = escaped.substr(UnicodeEscapeLength, escaped.size() - UnicodeEscapeLength); return true; } @@ -148,6 +150,28 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs } } +bool jsonEquivalentStrings(absl::string_view sanitized, absl::string_view original, + std::string& errmsg) { + for (uint32_t hex; !sanitized.empty() && !original.empty(); + original = original.substr(1, original.size() - 1)) { + if (sanitized[0] == original[0]) { + sanitized = sanitized.substr(1, sanitized.size() - 1); + } else if (parseUnicode(sanitized.substr(0, 6), hex) && + hex == static_cast(original[0])) { + sanitized = sanitized.substr(6, sanitized.size() - 6); + } else { + errmsg = absl::StrFormat("%s != %s", sanitized, original); + return false; + } + } + + if (sanitized.empty() && original.empty()) { + return true; + } + errmsg = absl::StrFormat("`%s' and `%s` have different lengths", sanitized, original); + return false; +} + } // namespace TestUtil } // namespace Json } // namespace Envoy diff --git a/test/common/json/json_sanitizer_test_util.h b/test/common/json/json_sanitizer_test_util.h index 3a2b950a8b75..2360641c87d0 100644 --- a/test/common/json/json_sanitizer_test_util.h +++ b/test/common/json/json_sanitizer_test_util.h @@ -23,6 +23,27 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& errms EXPECT_TRUE(TestUtil::utf8Equivalent(a, b, errmsg)) << context << "\n" << errmsg; \ } +/** + * SPELLCHECKER(skip-block) + * Determines whether `sanitized`is equivalent to `original`. `original` may be + * in any encoding, e.g. ascii, binary, utf-8, gb2132. `sanitized` is valid JSON + * with `\u` escapes for any characters are not allowed in JSON strings, per + * https://www.json.org/json-en.html. We want to make sure our json encoding for + * `original` would be decoded into the same bytes. + * + * Note that the 'sanitized' argument does not accept arbitrary json string + * encodings, such as `\r`, as those are not currently generated by the + * exception handler in Json::sanitize(); only numeric escapes are generated + * so that's all this test helper accepts. + * + * @param sanitized + * @param original + * @param errmsg + * @param true if the strings are equivalent. + */ +bool jsonEquivalentStrings(absl::string_view sanitized, absl::string_view original, + std::string& errmsg); + } // namespace TestUtil } // namespace Json } // namespace Envoy