Bug 1501155 - Part 1: Add AtomizeWTF8Chars. r=jwalden

arai-a · arai-a · commit 6ca00b0b0306 · 2018-11-28T14:16:29.000+09:00
diff --git a/js/public/CharacterEncoding.h b/js/public/CharacterEncoding.h
@@ -90,6 +90,26 @@ class UTF8Chars : public mozilla::Range<unsigned char>
     {}
 };
 
+/*
+ * Similar to UTF8Chars, but holds WTF-8: UTF-8 that may also encode unpaired
+ * (lone) surrogates. See https://simonsapin.github.io/wtf-8/
+ */
+class WTF8Chars : public mozilla::Range<unsigned char>
+{
+    typedef mozilla::Range<unsigned char> Base;
+
+  public:
+    using CharT = unsigned char;
+
+    WTF8Chars() : Base() {}
+    WTF8Chars(char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
+    {}
+    WTF8Chars(const char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
+    {}
+};
+
 /*
  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
  */
@@ -254,6 +274,12 @@ Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
 extern JS_PUBLIC_API TwoByteCharsZ
 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
 
+/*
+ * Like UTF8CharsToNewTwoByteCharsZ, but for WTF8Chars.
+ */
+extern JS_PUBLIC_API TwoByteCharsZ
+WTF8CharsToNewTwoByteCharsZ(JSContext* cx, const WTF8Chars wtf8, size_t* outlen);
+
 /*
  * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
  */
diff --git a/js/src/NamespaceImports.h b/js/src/NamespaceImports.h
@@ -28,6 +28,7 @@ class ConstTwoByteChars;
 class TwoByteChars;
 class TwoByteCharsZ;
 class UTF8Chars;
+class WTF8Chars;
 class UTF8CharsZ;
 
 using AutoValueVector = AutoVector<Value>;
@@ -75,6 +76,7 @@ using JS::ConstTwoByteChars;
 using JS::TwoByteChars;
 using JS::TwoByteCharsZ;
 using JS::UTF8Chars;
+using JS::WTF8Chars;
 using JS::UTF8CharsZ;
 using JS::UniqueChars;
 using JS::UniqueTwoByteChars;
diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp
@@ -209,13 +209,18 @@ JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
 static const uint32_t INVALID_UTF8 = UINT32_MAX;
 
 /*
- * Convert a utf8 character sequence into a UCS-4 character and return that
- * character.  It is assumed that the caller already checked that the sequence
- * is valid.
+ * Convert a UTF-8 or WTF-8 (depending on InputCharsT, which is either
+ * UTF8Chars or WTF8Chars) character sequence into a UCS-4 character and return
+ * that character.  It is assumed that the caller already checked that the
+ * sequence is valid.
  */
-uint32_t
-JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
+template <class InputCharsT>
+static uint32_t
+Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer, int utf8Length)
 {
+    static_assert(std::is_same<InputCharsT, UTF8Chars>::value ||
+                  std::is_same<InputCharsT, WTF8Chars>::value,
+                  "must be either UTF-8 or WTF-8");
     MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
 
     if (utf8Length == 1) {
@@ -235,13 +240,26 @@ JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
         ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
     }
 
-    if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))) {
+    if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) {
+        return INVALID_UTF8;
+    }
+
+    // WTF-8 allows lone surrogates; only strict UTF-8 rejects them.
+    if (std::is_same<InputCharsT, UTF8Chars>::value &&
+        MOZ_UNLIKELY(ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))
+    {
         return INVALID_UTF8;
     }
 
     return ucs4Char;
 }
 
+uint32_t
+JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
+{
+    return Utf8ToOneUcs4CharImpl<UTF8Chars>(utf8Buffer, utf8Length);
+}
+
 static void
 ReportInvalidCharacter(JSContext* cx, uint32_t offset)
 {
@@ -276,13 +294,13 @@ enum class OnUTF8Error {
     Crash,
 };
 
-// Scan UTF8 input and (internally, at least) convert it to a series of UTF-16
-// code units. But you can also do odd things like pass an empty lambda for
-// `dst`, in which case the output is discarded entirely--the only effect of
-// calling the template that way is error-checking.
-template <OnUTF8Error ErrorAction, typename OutputFn>
+// Scan UTF-8 or WTF-8 input and (internally, at least) convert it to a series
+// of UTF-16 code units. But you can also do odd things like pass an empty
+// lambda for `dst`, in which case the output is discarded entirely--the only
+// effect of calling the template that way is error-checking.
+template <OnUTF8Error ErrorAction, typename OutputFn, class InputCharsT>
 static bool
-InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
+InflateUTF8ToUTF16(JSContext* cx, const InputCharsT src, OutputFn dst)
 {
     size_t srclen = src.length();
     for (uint32_t i = 0; i < srclen; i++) {
@@ -339,7 +357,15 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
                 (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
                 (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
             {
-                INVALID(ReportInvalidCharacter, i, 1);
+                if (std::is_same<InputCharsT, UTF8Chars>::value) {
+                    INVALID(ReportInvalidCharacter, i, 1);
+                } else {
+                    // WTF-8 relaxes only ED (lone surrogate: ED A0~BF 80~BF), not E0/F0/F4.
+                    MOZ_ASSERT((std::is_same<InputCharsT, WTF8Chars>::value));
+                    if (v != 0xED || ((uint8_t)src[i + 1] & 0xE0) != 0xA0) { // ED A0~BF
+                        INVALID(ReportInvalidCharacter, i, 1);
+                    }
+                }
             }
 
             // Check the continuation bytes.
@@ -350,7 +376,7 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
             }
 
             // Determine the code unit's length in CharT and act accordingly.
-            v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
+            v = Utf8ToOneUcs4CharImpl<InputCharsT>((uint8_t*)&src[i], n);
             if (v < 0x10000) {
                 // The n-byte UTF8 code unit will fit in a single CharT.
                 if (dst(char16_t(v)) == LoopDisposition::Break) {
@@ -383,9 +409,10 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
     return true;
 }
 
-template <OnUTF8Error ErrorAction, typename CharT>
+template <OnUTF8Error ErrorAction, typename CharT, class InputCharsT>
 static void
-CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src, CharT *dst, size_t outlen, bool allASCII)
+CopyAndInflateUTF8IntoBuffer(JSContext* cx, const InputCharsT src, CharT* dst, size_t outlen,
+                             bool allASCII)
 {
     if (allASCII) {
         size_t srclen = src.length();
@@ -405,9 +432,9 @@ CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src, CharT *dst, siz
     dst[outlen] = CharT('\0');    // NUL char
 }
 
-template <OnUTF8Error ErrorAction, typename CharsT>
+template <OnUTF8Error ErrorAction, typename CharsT, class InputCharsT>
 static CharsT
-InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
+InflateUTF8StringHelper(JSContext* cx, const InputCharsT src, size_t* outlen)
 {
     using CharT = typename CharsT::CharT;
     static_assert(std::is_same<CharT, char16_t>::value ||
@@ -448,6 +475,12 @@ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* out
     return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(cx, utf8, outlen);
 }
 
+TwoByteCharsZ
+JS::WTF8CharsToNewTwoByteCharsZ(JSContext* cx, const WTF8Chars wtf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(cx, wtf8, outlen);
+}
+
 TwoByteCharsZ
 JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
 {
@@ -518,8 +551,9 @@ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t*
  * consumption.
  */
 
+template <class InputCharsT>
 bool
-GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
+GetUTF8AtomizationData(JSContext* cx, const InputCharsT utf8, size_t* outlen,
                        JS::SmallestEncoding* encoding, HashNumber* hashNum)
 {
     *outlen = 0;
@@ -539,6 +573,15 @@ GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
     return true;
 }
 
+template
+bool
+GetUTF8AtomizationData<JS::UTF8Chars>(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
+                                      JS::SmallestEncoding* encoding, HashNumber* hashNum);
+template
+bool
+GetUTF8AtomizationData<JS::WTF8Chars>(JSContext* cx, const JS::WTF8Chars utf8, size_t* outlen,
+                                      JS::SmallestEncoding* encoding, HashNumber* hashNum);
+
 template <typename CharT>
 bool
 UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars)
@@ -575,9 +618,9 @@ UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars)
 template bool UTF8EqualsChars<char16_t>(const JS::UTF8Chars, const char16_t*);
 template bool UTF8EqualsChars<JS::Latin1Char>(const JS::UTF8Chars, const JS::Latin1Char*);
 
-template <typename CharT>
+template <typename CharT, class InputCharsT>
 void
-InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
+InflateUTF8CharsToBufferAndTerminate(const InputCharsT src, CharT* dst, size_t dstLen,
                                      JS::SmallestEncoding encoding)
 {
     CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(/* cx = */ nullptr, src, dst, dstLen,
@@ -590,6 +633,12 @@ InflateUTF8CharsToBufferAndTerminate<char16_t>(const UTF8Chars src, char16_t* ds
 template void
 InflateUTF8CharsToBufferAndTerminate<JS::Latin1Char>(const UTF8Chars src, JS::Latin1Char* dst,
                                                      size_t dstLen, JS::SmallestEncoding encoding);
+template void
+InflateUTF8CharsToBufferAndTerminate<char16_t>(const WTF8Chars src, char16_t* dst, size_t dstLen,
+                                               JS::SmallestEncoding encoding);
+template void
+InflateUTF8CharsToBufferAndTerminate<JS::Latin1Char>(const WTF8Chars src, JS::Latin1Char* dst,
+                                                     size_t dstLen, JS::SmallestEncoding encoding);
 
 #ifdef DEBUG
 void
diff --git a/js/src/vm/JSAtom.cpp b/js/src/vm/JSAtom.cpp
@@ -42,16 +42,17 @@ using mozilla::Maybe;
 using mozilla::Nothing;
 using mozilla::RangedPtr;
 
-template <typename CharT>
-extern void InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
+template <typename CharT, typename InputCharsT>
+extern void InflateUTF8CharsToBufferAndTerminate(const InputCharsT src, CharT* dst, size_t dstLen,
                                                  JS::SmallestEncoding encoding);
 
 template <typename CharT>
 extern bool UTF8EqualsChars(const JS::UTF8Chars utf8, const CharT* chars);
 
+template <typename InputCharsT>
 extern bool
-GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen, JS::SmallestEncoding* encoding,
-                       HashNumber* hashNum);
+GetUTF8AtomizationData(JSContext* cx, const InputCharsT utf8, size_t* outlen,
+                       JS::SmallestEncoding* encoding, HashNumber* hashNum);
 
 struct js::AtomHasher::Lookup
 {
@@ -866,16 +867,19 @@ PermanentlyAtomizeAndCopyChars(JSContext* cx,
     return atom;
 }
 
-struct AtomizeUTF8CharsWrapper
+template <typename CharsT>
+struct AtomizeUTF8OrWTF8CharsWrapper
 {
-    JS::UTF8Chars utf8;
+    CharsT utf8;
     JS::SmallestEncoding encoding;
 
-    AtomizeUTF8CharsWrapper(const JS::UTF8Chars& chars, JS::SmallestEncoding minEncode)
+    AtomizeUTF8OrWTF8CharsWrapper(const CharsT& chars, JS::SmallestEncoding minEncode)
       : utf8(chars), encoding(minEncode)
     { }
 };
 
+// MakeFlatStringForAtomization has 4 variants.
+// This one is used for Latin1Char and char16_t.
 template <typename CharT>
 MOZ_ALWAYS_INLINE
 static JSFlatString*
@@ -884,10 +888,10 @@ MakeFlatStringForAtomization(JSContext* cx, const CharT* tbchars, size_t length)
     return NewStringCopyN<NoGC>(cx, tbchars, length);
 }
 
-template<typename CharT>
+template<typename CharT, typename WrapperT>
 MOZ_ALWAYS_INLINE
 static JSFlatString*
-MakeUTF8AtomHelper(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
+MakeUTF8AtomHelper(JSContext* cx, const WrapperT* chars, size_t length)
 {
     if (JSInlineString::lengthFits<CharT>(length)) {
         CharT* storage;
@@ -920,10 +924,14 @@ MakeUTF8AtomHelper(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t l
     return str;
 }
 
-template<>
+// The other 2 variants of MakeFlatStringForAtomization.
+// These are used for AtomizeUTF8OrWTF8CharsWrapper with UTF8Chars or WTF8Chars.
+template<typename InputCharsT>
 MOZ_ALWAYS_INLINE
 /* static */ JSFlatString*
-MakeFlatStringForAtomization(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
+MakeFlatStringForAtomization(JSContext* cx,
+                             const AtomizeUTF8OrWTF8CharsWrapper<InputCharsT>* chars,
+                             size_t length)
 {
     if (length == 0) {
         return cx->emptyString();
@@ -1041,8 +1049,9 @@ js::AtomizeChars(JSContext* cx, const Latin1Char* chars, size_t length, PinningB
 template JSAtom*
 js::AtomizeChars(JSContext* cx, const char16_t* chars, size_t length, PinningBehavior pin);
 
+template <typename CharsT>
 JSAtom*
-js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
+AtomizeUTF8OrWTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
 {
     // Since the static strings are all ascii, we can check them before trying anything else.
     if (JSAtom* s = cx->staticStrings().lookup(utf8Chars, utf8ByteLength)) {
@@ -1052,16 +1061,28 @@ js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength
     size_t length;
     HashNumber hash;
     JS::SmallestEncoding forCopy;
-    UTF8Chars utf8(utf8Chars, utf8ByteLength);
+    CharsT utf8(utf8Chars, utf8ByteLength);
     if (!GetUTF8AtomizationData(cx, utf8, &length, &forCopy, &hash)) {
         return nullptr;
     }
 
-    AtomizeUTF8CharsWrapper chars(utf8, forCopy);
+    AtomizeUTF8OrWTF8CharsWrapper<CharsT> chars(utf8, forCopy);
     AtomHasher::Lookup lookup(utf8Chars, utf8ByteLength, length, hash);
     return AtomizeAndCopyCharsFromLookup(cx, &chars, length, lookup, DoNotPinAtom, Nothing());
 }
 
+JSAtom*
+js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
+{
+    return AtomizeUTF8OrWTF8Chars<UTF8Chars>(cx, utf8Chars, utf8ByteLength);
+}
+
+JSAtom*
+js::AtomizeWTF8Chars(JSContext* cx, const char* wtf8Chars, size_t wtf8ByteLength)
+{
+    return AtomizeUTF8OrWTF8Chars<WTF8Chars>(cx, wtf8Chars, wtf8ByteLength);
+}
+
 bool
 js::IndexToIdSlow(JSContext* cx, uint32_t index, MutableHandleId idp)
 {
diff --git a/js/src/vm/JSAtom.h b/js/src/vm/JSAtom.h
@@ -69,6 +69,9 @@ AtomizeChars(JSContext* cx, const CharT* chars, size_t length,
 extern JSAtom*
 AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength);
 
+extern JSAtom*
+AtomizeWTF8Chars(JSContext* cx, const char* wtf8Chars, size_t wtf8ByteLength);
+
 extern JSAtom*
 AtomizeString(JSContext* cx, JSString* str, js::PinningBehavior pin = js::DoNotPinAtom);