json: generate hex escapes when utf8 is not valid, and check that tho…

…se are correct in tests. (envoyproxy#35972) Commit Message: Currently the JSON serializer, when sanitizing a string that is not valid UTF-8, generates correct JavaScript string syntax. However, the syntax is not valid JSON; in JSON you must use 4-digit hex escapes, and 3-digit octal escapes are not in spec. This was working for the Envoy admin detailed histograms, which would load the histogram detail via JSON, but parse it with JavaScript. However it does not pass validation. Note that this would only show up if the strings (e.g. stat names, prefixes) are not valid utf-8. This PR changes the escape syntax to hex, so it will pass JSON validation. It also adds a test that checks to see the escaped JSON strings match the input. Additional Description: These extra checks were added in anticipation of helping to test envoyproxy#35936 Risk Level: low Testing: //test/common/json/... Docs Changes: n/a Release Notes: n/a Platform Specific Features: n/a --------- Signed-off-by: Joshua Marantz <[email protected]>
sendbird · Sep 6, 2024 · 9386d34 · 9386d34
1 parent 69f1b69
commit 9386d34
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 29 deletions.
diff --git a/source/common/json/json_sanitizer.cc b/source/common/json/json_sanitizer.cc
@@ -77,16 +77,26 @@ absl::string_view sanitize(std::string& buffer, absl::string_view str) {
   }
   END_TRY
   catch (std::exception&) {
-    // If Nlohmann throws an error, emit an octal escape for any character
+    // If Nlohmann throws an error, emit a hex escape for any character
     // requiring it. This can occur for invalid utf-8 sequences, and we don't
     // want to crash the server if such a sequence makes its way into a string
     // we need to serialize. For example, if admin endpoint /stats?format=json
     // is called, and a stat name was synthesized from dynamic content such as a
     // gRPC method.
+    //
+    // Note that JSON string escapes are always 4 digit hex. 3 digit octal would
+    // be more compact, and is legal JavaScript, but not legal JSON. See
+    // https://www.json.org/json-en.html for details.
+    //
+    // TODO(jmarantz): It would better to use the compact JSON escapes for
+    // quotes, slashes, backspace, form-feed, linefeed, CR, and tab, in which
+    // case we'd also need to modify jsonEquivalentStrings in
+    // test/common/json/json_sanitizer_test_util.h. We don't expect to hit this
+    // often, so it isn't a priority to use these more compact encodings.
     buffer.clear();
     for (char c : str) {
       if (needs_slow_sanitizer[static_cast<uint8_t>(c)]) {
-        buffer.append(absl::StrFormat("\\%03o", c));
+        buffer.append(absl::StrFormat("\\u%04x", c));
       } else {
         buffer.append(1, c);
       }

diff --git a/test/common/json/json_sanitizer_test.cc b/test/common/json/json_sanitizer_test.cc
@@ -32,11 +32,9 @@ class JsonSanitizerTest : public testing::Test {
   }
 
   absl::string_view sanitizeAndCheckAgainstProtobufJson(absl::string_view str) {
-    EXPECT_TRUE(TestUtil::isProtoSerializableUtf8(str)) << "str=" << str;
     absl::string_view sanitized = sanitize(str);
-    if (TestUtil::isProtoSerializableUtf8(str)) {
-      EXPECT_UTF8_EQ(protoSanitize(str), sanitized, str);
-    }
+    EXPECT_TRUE(TestUtil::isProtoSerializableUtf8(str)) << "str=" << str;
+    EXPECT_UTF8_EQ(protoSanitize(str), sanitized, str);
     return sanitized;
   }
 
@@ -53,9 +51,11 @@ class JsonSanitizerTest : public testing::Test {
     return corrupt_second_byte;
   }
 
-  absl::string_view sanitizeInvalid(absl::string_view str) {
-    EXPECT_EQ(Utf8::UnicodeSizePair(0, 0), decode(str));
-    return sanitize(str);
+  absl::string_view sanitizeInvalidAndCheckEscapes(absl::string_view str) {
+    EXPECT_FALSE(TestUtil::isProtoSerializableUtf8(str));
+    absl::string_view sanitized = sanitize(str);
+    EXPECT_JSON_STREQ(sanitized, str, str);
+    return sanitized;
   }
 
   std::pair<uint32_t, uint32_t> decode(absl::string_view str) {
@@ -165,6 +165,9 @@ TEST_F(JsonSanitizerTest, AllThreeByteUtf8) {
             EXPECT_EQ(3, consumed);
             EXPECT_UTF8_EQ(protoSanitize(utf8), sanitized,
                            absl::StrFormat("0x%x(%d,%d,%d)", unicode, byte1, byte2, byte3));
+          } else {
+            EXPECT_JSON_STREQ(sanitized, utf8,
+                              absl::StrFormat("non-utf8(%d,%d,%d)", byte1, byte2, byte3));
           }
         }
       }
@@ -200,6 +203,9 @@ TEST_F(JsonSanitizerTest, AllFourByteUtf8) {
             EXPECT_UTF8_EQ(
                 protoSanitize(utf8), sanitized,
                 absl::StrFormat("0x%x(%d,%d,%d,%d)", unicode, byte1, byte2, byte3, byte4));
+          } else {
+            EXPECT_JSON_STREQ(sanitized, utf8,
+                              absl::StrFormat("non-utf8(%d,%d,%d,%d)", byte1, byte2, byte3, byte4));
           }
         }
       }
@@ -256,33 +262,44 @@ TEST_F(JsonSanitizerTest, High8Bit) {
   // exception, which Json::sanitizer catches and just escapes the characters so
   // we don't lose information in the encoding. All bytes with the high-bit set
   // are invalid utf-8 in isolation, so we fall through to escaping these.
-  EXPECT_EQ("\\200\\201\\202\\203\\204\\205\\206\\207\\210\\211\\212\\213\\214\\215\\216\\217"
-            "\\220\\221\\222\\223\\224\\225\\226\\227\\230\\231\\232\\233\\234\\235\\236\\237"
-            "\\240\\241\\242\\243\\244\\245\\246\\247\\250\\251\\252\\253\\254\\255\\256\\257"
-            "\\260\\261\\262\\263\\264\\265\\266\\267\\270\\271\\272\\273\\274\\275\\276\\277"
-            "\\300\\301\\302\\303\\304\\305\\306\\307\\310\\311\\312\\313\\314\\315\\316\\317"
-            "\\320\\321\\322\\323\\324\\325\\326\\327\\330\\331\\332\\333\\334\\335\\336\\337"
-            "\\340\\341\\342\\343\\344\\345\\346\\347\\350\\351\\352\\353\\354\\355\\356\\357"
-            "\\360\\361\\362\\363\\364\\365\\366\\367\\370\\371\\372\\373\\374\\375\\376\\377",
+  EXPECT_EQ("\\u0080\\u0081\\u0082\\u0083\\u0084\\u0085\\u0086\\u0087\\u0088\\u0089\\u008a"
+            "\\u008b\\u008c\\u008d\\u008e\\u008f\\u0090\\u0091\\u0092\\u0093\\u0094\\u0095"
+            "\\u0096\\u0097\\u0098\\u0099\\u009a\\u009b\\u009c\\u009d\\u009e\\u009f\\u00a0"
+            "\\u00a1\\u00a2\\u00a3\\u00a4\\u00a5\\u00a6\\u00a7\\u00a8\\u00a9\\u00aa\\u00ab"
+            "\\u00ac\\u00ad\\u00ae\\u00af\\u00b0\\u00b1\\u00b2\\u00b3\\u00b4\\u00b5\\u00b6"
+            "\\u00b7\\u00b8\\u00b9\\u00ba\\u00bb\\u00bc\\u00bd\\u00be\\u00bf\\u00c0\\u00c1"
+            "\\u00c2\\u00c3\\u00c4\\u00c5\\u00c6\\u00c7\\u00c8\\u00c9\\u00ca\\u00cb\\u00cc"
+            "\\u00cd\\u00ce\\u00cf\\u00d0\\u00d1\\u00d2\\u00d3\\u00d4\\u00d5\\u00d6\\u00d7"
+            "\\u00d8\\u00d9\\u00da\\u00db\\u00dc\\u00dd\\u00de\\u00df\\u00e0\\u00e1\\u00e2"
+            "\\u00e3\\u00e4\\u00e5\\u00e6\\u00e7\\u00e8\\u00e9\\u00ea\\u00eb\\u00ec\\u00ed"
+            "\\u00ee\\u00ef\\u00f0\\u00f1\\u00f2\\u00f3\\u00f4\\u00f5\\u00f6\\u00f7\\u00f8"
+            "\\u00f9\\u00fa\\u00fb\\u00fc\\u00fd\\u00fe\\u00ff",
             sanitize(x80_ff));
 }
 
 TEST_F(JsonSanitizerTest, InvalidUtf8) {
   // 2 byte
-  EXPECT_EQ("\\316", sanitizeInvalid(truncate(LambdaUtf8)));
-  EXPECT_EQ("\\316\\373", sanitizeInvalid(corruptByte2(LambdaUtf8)));
+  EXPECT_EQ("\\u00ce", sanitizeInvalidAndCheckEscapes(truncate(LambdaUtf8)));
+  EXPECT_EQ("\\u00ce\\u00fb", sanitizeInvalidAndCheckEscapes(corruptByte2(LambdaUtf8)));
 
   // 3 byte
-  EXPECT_EQ("\\341\\275", sanitizeInvalid(truncate(OmicronUtf8)));
-  EXPECT_EQ("\\341\\375\\271", sanitizeInvalid(corruptByte2(OmicronUtf8)));
+  EXPECT_EQ("\\u00e1\\u00bd", sanitizeInvalidAndCheckEscapes(truncate(OmicronUtf8)));
+  EXPECT_EQ("\\u00e1\\u00fd\\u00b9", sanitizeInvalidAndCheckEscapes(corruptByte2(OmicronUtf8)));
 
   // 4 byte
-  EXPECT_EQ("\\360\\235\\204", sanitizeInvalid(truncate(TrebleClefUtf8)));
-  EXPECT_EQ("\\360\\375\\204\\236", sanitizeInvalid(corruptByte2(TrebleClefUtf8)));
+  EXPECT_EQ("\\u00f0\\u009d\\u0084", sanitizeInvalidAndCheckEscapes(truncate(TrebleClefUtf8)));
+  EXPECT_EQ("\\u00f0\\u00fd\\u0084\\u009e",
+            sanitizeInvalidAndCheckEscapes(corruptByte2(TrebleClefUtf8)));
 
   // Invalid input embedded in normal text.
-  EXPECT_EQ("Hello, \\360\\235\\204, World!",
-            sanitize(absl::StrCat("Hello, ", truncate(TrebleClefUtf8), ", World!")));
+  EXPECT_EQ("Hello, \\u00f0\\u009d\\u0084, World!",
+            sanitizeInvalidAndCheckEscapes(
+                absl::StrCat("Hello, ", truncate(TrebleClefUtf8), ", World!")));
+
+  // Invalid input with leading slash.
+  EXPECT_EQ("\\u005cHello, \\u00f0\\u009d\\u0084, World!",
+            sanitizeInvalidAndCheckEscapes(
+                absl::StrCat("\\Hello, ", truncate(TrebleClefUtf8), ", World!")));
 
   // Replicate a few other cases that were discovered during initial fuzzing,
   // to ensure we see these as invalid utf8 and avoid them in comparisons.

diff --git a/test/common/json/json_sanitizer_test_util.cc b/test/common/json/json_sanitizer_test_util.cc
@@ -105,6 +105,13 @@ bool parseUnicode(absl::string_view str, uint32_t& hex_value) {
   return false;
 }
 
+// Removes 'prefix_size' characters from the beginning of 'str'.
+void removePrefix(absl::string_view& str, uint32_t prefix_size) {
+  ASSERT(prefix_size > 0);
+  ASSERT(prefix_size <= str.size());
+  str = str.substr(prefix_size, str.size() - prefix_size);
+}
+
 // Compares a string that's possibly an escaped Unicode, e.g. \u1234, to
 // one that is utf8-encoded.
 bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_view& utf8) {
@@ -113,8 +120,8 @@ bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_vi
     // If one side of the comparison is a Unicode escape,
     auto [unicode, consumed] = Utf8::decode(utf8);
     if (consumed != 0 && unicode == escaped_unicode) {
-      utf8 = utf8.substr(consumed, utf8.size() - consumed);
-      escaped = escaped.substr(UnicodeEscapeLength, escaped.size() - UnicodeEscapeLength);
+      removePrefix(utf8, consumed);
+      removePrefix(escaped, UnicodeEscapeLength);
       return true;
     }
   }
@@ -137,8 +144,8 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs
       diffs = absl::StrFormat("`%s' and `%s` have different lengths", a, b);
       return false;
     } else if (a[0] == b[0]) {
-      a = a.substr(1, a.size() - 1);
-      b = b.substr(1, b.size() - 1);
+      removePrefix(a, 1);
+      removePrefix(b, 1);
     } else if (!compareUnicodeEscapeAgainstUtf8(a, b) && !compareUnicodeEscapeAgainstUtf8(b, a)) {
       diffs = absl::StrFormat("%s != %s, [%d]%c(0x02%x, \\%03o) != [%d] %c(0x02%x, \\%03o)", all_a,
                               all_b, a.data() - all_a.data(), a[0], a[0], a[0],
@@ -148,6 +155,25 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& diffs
   }
 }
 
+bool decodeEscapedJson(absl::string_view sanitized, std::string& decoded, std::string& errmsg) {
+  while (!sanitized.empty()) {
+    uint32_t hex;
+    if (sanitized.size() >= UnicodeEscapeLength &&
+        parseUnicode(sanitized.substr(0, UnicodeEscapeLength), hex)) {
+      if (hex >= 256) {
+        errmsg = absl::StrFormat("Unexpected encoding >= 256: %u", hex);
+        return false;
+      }
+      decoded.append(1, hex);
+      removePrefix(sanitized, UnicodeEscapeLength);
+    } else {
+      decoded.append(1, sanitized[0]);
+      removePrefix(sanitized, 1);
+    }
+  }
+  return true;
+}
+
 } // namespace TestUtil
 } // namespace Json
 } // namespace Envoy
diff --git a/test/common/json/json_sanitizer_test_util.h b/test/common/json/json_sanitizer_test_util.h
@@ -23,6 +23,35 @@ bool utf8Equivalent(absl::string_view a, absl::string_view b, std::string& errms
     EXPECT_TRUE(TestUtil::utf8Equivalent(a, b, errmsg)) << context << "\n" << errmsg;              \
   }
 
+/**
+ * Reverses the json escaping algorithm in sanitize(), which is used when the
+ * utf-8 serialization fails. Note that `sanitized` may be in any encoding, e.g.
+ * ascii, binary, utf-8, gb2132. `sanitized` is valid JSON with `\u` escapes for
+ * any characters are not allowed in JSON strings, per
+ * https://www.json.org/json-en.html. We want to make sure our json encoding for
+ * `original` would be decoded into the same bytes.
+ *
+ * Writes the decoded version into `decoded`.
+ *
+ * Note that the `sanitized` argument does not accept arbitrary json string
+ * encodings, such as `\r`, as those are not currently generated by the
+ * exception handler in Json::sanitize(); only numeric escapes are generated
+ * so that's all this test helper accepts.
+ *
+ * @param sanitized the output of Json::sanitize(), when it is not utf-8 compliant.
+ * @param decoded the decoded form, undoing any escapes added by Json::sanitize().
+ * @param errmsg details any error encountered during decoding.
+ * @param true if the encoding was successful.
+ */
+bool decodeEscapedJson(absl::string_view sanitized, std::string& decoded, std::string& errmsg);
+#define EXPECT_JSON_STREQ(sanitized, original, context)                                            \
+  {                                                                                                \
+    std::string decoded, errmsg;                                                                   \
+    EXPECT_TRUE(TestUtil::decodeEscapedJson(sanitized, decoded, errmsg))                           \
+        << context << ": " << errmsg;                                                              \
+    EXPECT_EQ(decoded, original) << context;                                                       \
+  }
+
 } // namespace TestUtil
 } // namespace Json
 } // namespace Envoy