From 9386d3441d3e92ab41477c4d704ca316805e5bf2 Mon Sep 17 00:00:00 2001
From: Joshua Marantz <jmarantz@google.com>
Date: Thu, 5 Sep 2024 21:07:33 -0400
Subject: [PATCH] json: generate hex escapes when utf8 is not valid, and check
 that those are correct in tests. (#35972)

Commit Message: Currently the JSON serializer, when sanitizing a string
that is not valid UTF-8, generates correct JavaScript string syntax.
However, the syntax is not valid JSON; in JSON you must use 4-digit hex
escapes, and 3-digit octal escapes are not in spec.

This was working for the Envoy admin detailed histograms, which would
load the histogram detail via JSON, but parse it with JavaScript.
However it does not pass validation.

Note that this would only show up if the strings (e.g. stat names,
prefixes) are not valid utf-8.

This PR changes the escape syntax to hex, so it will pass JSON
validation. It also adds a test that checks to see the escaped JSON
strings match the input.

Additional Description: These extra checks were added in anticipation of
helping to test https://github.com/envoyproxy/envoy/pull/35936
Risk Level: low
Testing: //test/common/json/...
Docs Changes: n/a
Release Notes: n/a
Platform Specific Features: n/a

---------

Signed-off-by: Joshua Marantz <jmarantz@google.com>
---
 source/common/json/json_sanitizer.cc         | 14 ++++-
 test/common/json/json_sanitizer_test.cc      | 63 +++++++++++++-------
 test/common/json/json_sanitizer_test_util.cc | 34 +++++++++--
 test/common/json/json_sanitizer_test_util.h  | 29 +++++++++
 4 files changed, 111 insertions(+), 29 deletions(-)

diff --git a/source/common/json/json_sanitizer.cc b/source/common/json/json_sanitizer.cc
index ab814c70e815..d45a7bdc8013 100644
--- a/source/common/json/json_sanitizer.cc
+++ b/source/common/json/json_sanitizer.cc
@@ -77,16 +77,26 @@ absl::string_view sanitize(std::string& buffer, absl::string_view str) {
   }
   END_TRY
   catch (std::exception&) {
-    // If Nlohmann throws an error, emit an octal escape for any character
+    // If Nlohmann throws an error, emit a hex escape for any character
     // requiring it. This can occur for invalid utf-8 sequences, and we don't
     // want to crash the server if such a sequence makes its way into a string
     // we need to serialize. For example, if admin endpoint /stats?format=json
     // is called, and a stat name was synthesized from dynamic content such as a
     // gRPC method.
+    //
+    // Note that JSON string escapes are always 4 digit hex. 3 digit octal would
+    // be more compact, and is legal JavaScript, but not legal JSON. See
+    // https://www.json.org/json-en.html for details.
+    //
+    // TODO(jmarantz): It would better to use the compact JSON escapes for
+    // quotes, slashes, backspace, form-feed, linefeed, CR, and tab, in which
+    // case we'd also need to modify jsonEquivalentStrings in
+    // test/common/json/json_sanitizer_test_util.h. We don't expect to hit this
+    // often, so it isn't a priority to use these more compact encodings.
     buffer.clear();
     for (char c : str) {
       if (needs_slow_sanitizer[static_cast<uint8_t>(c)]) {
-        buffer.append(absl::StrFormat("\\%03o", c));
+        buffer.append(absl::StrFormat("\\u%04x", c));
       } else {
         buffer.append(1, c);
       }
diff --git a/test/common/json/json_sanitizer_test.cc b/test/common/json/json_sanitizer_test.cc
index 23222b72930a..14434eefe60f 100644
--- a/test/common/json/json_sanitizer_test.cc
+++ b/test/common/json/json_sanitizer_test.cc
@@ -32,11 +32,9 @@ class JsonSanitizerTest : public testing::Test {
   }
 
   absl::string_view sanitizeAndCheckAgainstProtobufJson(absl::string_view str) {
-    EXPECT_TRUE(TestUtil::isProtoSerializableUtf8(str)) << "str=" << str;
     absl::string_view sanitized = sanitize(str);
-    if (TestUtil::isProtoSerializableUtf8(str)) {
-      EXPECT_UTF8_EQ(protoSanitize(str), sanitized, str);
-    }
+    EXPECT_TRUE(TestUtil::isProtoSerializableUtf8(str)) << "str=" << str;
+    EXPECT_UTF8_EQ(protoSanitize(str), sanitized, str);
     return sanitized;
   }
 
@@ -53,9 +51,11 @@ class JsonSanitizerTest : public testing::Test {
     return corrupt_second_byte;
   }
 
-  absl::string_view sanitizeInvalid(absl::string_view str) {
-    EXPECT_EQ(Utf8::UnicodeSizePair(0, 0), decode(str));
-    return sanitize(str);
+  absl::string_view sanitizeInvalidAndCheckEscapes(absl::string_view str) {
+    EXPECT_FALSE(TestUtil::isProtoSerializableUtf8(str));
+    absl::string_view sanitized = sanitize(str);
+    EXPECT_JSON_STREQ(sanitized, str, str);
+    return sanitized;
   }
 
   std::pair<uint32_t, uint32_t> decode(absl::string_view str) {
@@ -165,6 +165,9 @@ TEST_F(JsonSanitizerTest, AllThreeByteUtf8) {
             EXPECT_EQ(3, consumed);
             EXPECT_UTF8_EQ(protoSanitize(utf8), sanitized,
                            absl::StrFormat("0x%x(%d,%d,%d)", unicode, byte1, byte2, byte3));
+          } else {
+            EXPECT_JSON_STREQ(sanitized, utf8,
+                              absl::StrFormat("non-utf8(%d,%d,%d)", byte1, byte2, byte3));
           }
         }
       }
@@ -200,6 +203,9 @@ TEST_F(JsonSanitizerTest, AllFourByteUtf8) {
             EXPECT_UTF8_EQ(
                 protoSanitize(utf8), sanitized,
                 absl::StrFormat("0x%x(%d,%d,%d,%d)", unicode, byte1, byte2, byte3, byte4));
+          } else {
+            EXPECT_JSON_STREQ(sanitized, utf8,
+                              absl::StrFormat("non-utf8(%d,%d,%d,%d)", byte1, byte2, byte3, byte4));
           }
         }
       }
@@ -256,33 +262,44 @@ TEST_F(JsonSanitizerTest, High8Bit) {
   // exception, which Json::sanitizer catches and just escapes the characters so
   // we don't lose information in the encoding. All bytes with the high-bit set
   // are invalid utf-8 in isolation, so we fall through to escaping these.
-  EXPECT_EQ("\\200\\201\\202\\203\\204\\205\\206\\207\\210\\211\\212\\213\\214\\215\\216\\217"
-            "\\220\\221\\222\\223\\224\\225\\226\\227\\230\\231\\232\\233\\234\\235\\236\\237"
-            "\\240\\241\\242\\243\\244\\245\\246\\247\\250\\251\\252\\253\\254\\255\\256\\257"
-            "\\260\\261\\262\\263\\264\\265\\266\\267\\270\\271\\272\\273\\274\\275\\276\\277"
-            "\\300\\301\\302\\303\\304\\305\\306\\307\\310\\311\\312\\313\\314\\315\\316\\317"
-            "\\320\\321\\322\\323\\324\\325\\326\\327\\330\\331\\332\\333\\334\\335\\336\\337"
-            "\\340\\341\\342\\343\\344\\345\\346\\347\\350\\351\\352\\353\\354\\355\\356\\357"
-            "\\360\\361\\362\\363\\364\\365\\366\\367\\370\\371\\372\\373\\374\\375\\376\\377",
+  EXPECT_EQ("\\u0080\\u0081\\u0082\\u0083\\u0084\\u0085\\u0086\\u0087\\u0088\\u0089\\u008a"
+            "\\u008b\\u008c\\u008d\\u008e\\u008f\\u0090\\u0091\\u0092\\u0093\\u0094\\u0095"
+            "\\u0096\\u0097\\u0098\\u0099\\u009a\\u009b\\u009c\\u009d\\u009e\\u009f\\u00a0"
+            "\\u00a1\\u00a2\\u00a3\\u00a4\\u00a5\\u00a6\\u00a7\\u00a8\\u00a9\\u00aa\\u00ab"
+            "\\u00ac\\u00ad\\u00ae\\u00af\\u00b0\\u00b1\\u00b2\\u00b3\\u00b4\\u00b5\\u00b6"
+            "\\u00b7\\u00b8\\u00b9\\u00ba\\u00bb\\u00bc\\u00bd\\u00be\\u00bf\\u00c0\\u00c1"
+            "\\u00c2\\u00c3\\u00c4\\u00c5\\u00c6\\u00c7\\u00c8\\u00c9\\u00ca\\u00cb\\u00cc"
+            "\\u00cd\\u00ce\\u00cf\\u00d0\\u00d1\\u00d2\\u00d3\\u00d4\\u00d5\\u00d6\\u00d7"
+            "\\u00d8\\u00d9\\u00da\\u00db\\u00dc\\u00dd\\u00de\\u00df\\u00e0\\u00e1\\u00e2"
+            "\\u00e3\\u00e4\\u00e5\\u00e6\\u00e7\\u00e8\\u00e9\\u00ea\\u00eb\\u00ec\\u00ed"
+            "\\u00ee\\u00ef\\u00f0\\u00f1\\u00f2\\u00f3\\u00f4\\u00f5\\u00f6\\u00f7\\u00f8"
+            "\\u00f9\\u00fa\\u00fb\\u00fc\\u00fd\\u00fe\\u00ff",
             sanitize(x80_ff));
 }
 
 TEST_F(JsonSanitizerTest, InvalidUtf8) {
   // 2 byte
-  EXPECT_EQ("\\316", sanitizeInvalid(truncate(LambdaUtf8)));
-  EXPECT_EQ("\\316\\373", sanitizeInvalid(corruptByte2(LambdaUtf8)));
+  EXPECT_EQ("\\u00ce", sanitizeInvalidAndCheckEscapes(truncate(LambdaUtf8)));
+  EXPECT_EQ("\\u00ce\\u00fb", sanitizeInvalidAndCheckEscapes(corruptByte2(LambdaUtf8)));
 
   // 3 byte
-  EXPECT_EQ("\\341\\275", sanitizeInvalid(truncate(OmicronUtf8)));
-  EXPECT_EQ("\\341\\375\\271", sanitizeInvalid(corruptByte2(OmicronUtf8)));
+  EXPECT_EQ("\\u00e1\\u00bd", sanitizeInvalidAndCheckEscapes(truncate(OmicronUtf8)));
+  EXPECT_EQ("\\u00e1\\u00fd\\u00b9", sanitizeInvalidAndCheckEscapes(corruptByte2(OmicronUtf8)));
 
   // 4 byte
-  EXPECT_EQ("\\360\\235\\204", sanitizeInvalid(truncate(TrebleClefUtf8)));
-  EXPECT_EQ("\\360\\375\\204\\236", sanitizeInvalid(corruptByte2(TrebleClefUtf8)));
+  EXPECT_EQ("\\u00f0\\u009d\\u0084", sanitizeInvalidAndCheckEscapes(truncate(TrebleClefUtf8)));
+  EXPECT_EQ("\\u00f0\\u00fd\\u0084\\u009e",
+            sanitizeInvalidAndCheckEscapes(corruptByte2(TrebleClefUtf8)));
 
   // Invalid input embedded in normal text.
-  EXPECT_EQ("Hello, \\360\\235\\204, World!",
-            sanitize(absl::StrCat("Hello, ", truncate(TrebleClefUtf8), ", World!")));
+  EXPECT_EQ("Hello, \\u00f0\\u009d\\u0084, World!",
+            sanitizeInvalidAndCheckEscapes(
+                absl::StrCat("Hello, ", truncate(TrebleClefUtf8), ", World!")));
+
+  // Invalid input with leading slash.
+  EXPECT_EQ("\\u005cHello, \\u00f0\\u009d\\u0084, World!",
+            sanitizeInvalidAndCheckEscapes(
+                absl::StrCat("\\Hello, ", truncate(TrebleClefUtf8), ", World!")));
 
   // Replicate a few other cases that were discovered during initial fuzzing,
   // to ensure we see these as invalid utf8 and avoid them in comparisons.
diff --git a/test/common/json/json_sanitizer_test_util.cc b/test/common/json/json_sanitizer_test_util.cc
index e19c154160c8..b2096cdd35e9 100644
--- a/test/common/json/json_sanitizer_test_util.cc
+++ b/test/common/json/json_sanitizer_test_util.cc
@@ -105,6 +105,13 @@ bool parseUnicode(absl::string_view str, uint32_t& hex_value) {
   return false;
 }
 
+// Removes 'prefix_size' characters from the beginning of 'str'.
+void removePrefix(absl::string_view& str, uint32_t prefix_size) {
+  ASSERT(prefix_size > 0);
+  ASSERT(prefix_size <= str.size());
+  str = str.substr(prefix_size, str.size() - prefix_size);
+}
+
 // Compares a string that's possibly an escaped Unicode, e.g. \u1234, to
 // one that is utf8-encoded.
 bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_view& utf8) {
@@ -113,8 +120,8 @@ bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_vi
     // If one side of the comparison is a Unicode escape,
     auto [unicode, consumed] = Utf8::decode(utf8);
     if (consumed != 0 && unicode == escaped_unicode) {
-      utf8 = utf8.substr(consumed, utf8.size() - consumed);
-      escaped = escaped.substr(UnicodeEscapeLength, escaped.size() - UnicodeEscapeLength);
+      removePrefix(utf8, consumed);
+      removePrefix(escaped, UnicodeEscapeLength);
       return true;
     }
   }
@@ -137,8 +144,8 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs
       diffs = absl::StrFormat("`%s' and `%s` have different lengths", a, b);
       return false;
     } else if (a[0] == b[0]) {
-      a = a.substr(1, a.size() - 1);
-      b = b.substr(1, b.size() - 1);
+      removePrefix(a, 1);
+      removePrefix(b, 1);
     } else if (!compareUnicodeEscapeAgainstUtf8(a, b) && !compareUnicodeEscapeAgainstUtf8(b, a)) {
       diffs = absl::StrFormat("%s != %s, [%d]%c(0x02%x, \\%03o) != [%d] %c(0x02%x, \\%03o)", all_a,
                               all_b, a.data() - all_a.data(), a[0], a[0], a[0],
@@ -148,6 +155,25 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs
   }
 }
 
+bool decodeEscapedJson(absl::string_view sanitized, std::string& decoded, std::string& errmsg) {
+  while (!sanitized.empty()) {
+    uint32_t hex;
+    if (sanitized.size() >= UnicodeEscapeLength &&
+        parseUnicode(sanitized.substr(0, UnicodeEscapeLength), hex)) {
+      if (hex >= 256) {
+        errmsg = absl::StrFormat("Unexpected encoding >= 256: %u", hex);
+        return false;
+      }
+      decoded.append(1, hex);
+      removePrefix(sanitized, UnicodeEscapeLength);
+    } else {
+      decoded.append(1, sanitized[0]);
+      removePrefix(sanitized, 1);
+    }
+  }
+  return true;
+}
+
 } // namespace TestUtil
 } // namespace Json
 } // namespace Envoy
diff --git a/test/common/json/json_sanitizer_test_util.h b/test/common/json/json_sanitizer_test_util.h
index 3a2b950a8b75..713315d4d83f 100644
--- a/test/common/json/json_sanitizer_test_util.h
+++ b/test/common/json/json_sanitizer_test_util.h
@@ -23,6 +23,35 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& errms
     EXPECT_TRUE(TestUtil::utf8Equivalent(a, b, errmsg)) << context << "\n" << errmsg;              \
   }
 
+/**
+ * Reverses the json escaping algorithm in sanitize(), which is used when the
+ * utf-8 serialization fails. Note that `sanitized` may be in any encoding, e.g.
+ * ascii, binary, utf-8, gb2132. `sanitized` is valid JSON with `\u` escapes for
+ * any characters are not allowed in JSON strings, per
+ * https://www.json.org/json-en.html. We want to make sure our json encoding for
+ * `original` would be decoded into the same bytes.
+ *
+ * Writes the decoded version into `decoded`.
+ *
+ * Note that the `sanitized` argument does not accept arbitrary json string
+ * encodings, such as `\r`, as those are not currently generated by the
+ * exception handler in Json::sanitize(); only numeric escapes are generated
+ * so that's all this test helper accepts.
+ *
+ * @param sanitized the output of Json::sanitize(), when it is not utf-8 compliant.
+ * @param decoded the decoded form, undoing any escapes added by Json::sanitize().
+ * @param errmsg details any error encountered during decoding.
+ * @param true if the encoding was successful.
+ */
+bool decodeEscapedJson(absl::string_view sanitized, std::string& decoded, std::string& errmsg);
+#define EXPECT_JSON_STREQ(sanitized, original, context)                                            \
+  {                                                                                                \
+    std::string decoded, errmsg;                                                                   \
+    EXPECT_TRUE(TestUtil::decodeEscapedJson(sanitized, decoded, errmsg))                           \
+        << context << ": " << errmsg;                                                              \
+    EXPECT_EQ(decoded, original) << context;                                                       \
+  }
+
 } // namespace TestUtil
 } // namespace Json
 } // namespace Envoy