Skip to content

Commit 6ca00b0

Browse files
committed
Bug 1501155 - Part 1: Add AtomizeWTF8Chars. r=jwalden
1 parent 03a482b commit 6ca00b0

File tree

5 files changed

+136
-35
lines changed

5 files changed

+136
-35
lines changed

js/public/CharacterEncoding.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,26 @@ class UTF8Chars : public mozilla::Range<unsigned char>
9090
{}
9191
};
9292

93+
/*
94+
* Similar to UTF8Chars, but the wrapped bytes are WTF-8 rather than UTF-8.
* In this change the decoder treats the two alike, except that an encoded
* lone surrogate is accepted for WTF-8 input.
95+
* https://simonsapin.github.io/wtf-8/
96+
*/
97+
class WTF8Chars : public mozilla::Range<unsigned char>
98+
{
99+
typedef mozilla::Range<unsigned char> Base;
100+
101+
public:
102+
using CharT = unsigned char; // element type of the underlying range
103+
104+
WTF8Chars() : Base() {} // default-constructs the base range
105+
WTF8Chars(char* aBytes, size_t aLength) // wraps aLength bytes starting at aBytes
106+
: Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
107+
{}
108+
// Same as above for a const buffer. Base is a range of non-const
// unsigned char, so constness is cast away to construct it.
// NOTE(review): presumably callers must not write through the resulting
// range -- confirm.
WTF8Chars(const char* aBytes, size_t aLength)
109+
: Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
110+
{}
111+
};
112+
93113
/*
94114
* SpiderMonkey also deals directly with UTF-8 encoded text in some places.
95115
*/
@@ -254,6 +274,12 @@ Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
254274
extern JS_PUBLIC_API TwoByteCharsZ
255275
UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
256276

277+
/*
278+
* Like UTF8CharsToNewTwoByteCharsZ, but for WTF8Chars.
279+
*/
280+
extern JS_PUBLIC_API TwoByteCharsZ
281+
WTF8CharsToNewTwoByteCharsZ(JSContext* cx, const WTF8Chars wtf8, size_t* outlen);
282+
257283
/*
258284
* Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
259285
*/

js/src/NamespaceImports.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class ConstTwoByteChars;
2828
class TwoByteChars;
2929
class TwoByteCharsZ;
3030
class UTF8Chars;
31+
class WTF8Chars;
3132
class UTF8CharsZ;
3233

3334
using AutoValueVector = AutoVector<Value>;
@@ -75,6 +76,7 @@ using JS::ConstTwoByteChars;
7576
using JS::TwoByteChars;
7677
using JS::TwoByteCharsZ;
7778
using JS::UTF8Chars;
79+
using JS::WTF8Chars;
7880
using JS::UTF8CharsZ;
7981
using JS::UniqueChars;
8082
using JS::UniqueTwoByteChars;

js/src/vm/CharacterEncoding.cpp

Lines changed: 70 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -209,13 +209,18 @@ JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
209209
static const uint32_t INVALID_UTF8 = UINT32_MAX;
210210

211211
/*
212-
* Convert a utf8 character sequence into a UCS-4 character and return that
213-
* character. It is assumed that the caller already checked that the sequence
214-
* is valid.
212+
* Convert a UTF-8 or WTF-8 (depending on InputCharsT, which is either
213+
* UTF8Chars or WTF8Chars) character sequence into a UCS-4 character and return
214+
* that character. It is assumed that the caller already checked that the
215+
* sequence is valid.
215216
*/
216-
uint32_t
217-
JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
217+
template <class InputCharsT>
218+
static uint32_t
219+
Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer, int utf8Length)
218220
{
221+
static_assert(std::is_same<InputCharsT, UTF8Chars>::value ||
222+
std::is_same<InputCharsT, WTF8Chars>::value,
223+
"must be either UTF-8 or WTF-8");
219224
MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
220225

221226
if (utf8Length == 1) {
@@ -235,13 +240,26 @@ JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
235240
ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
236241
}
237242

238-
if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))) {
243+
if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) {
244+
return INVALID_UTF8;
245+
}
246+
247+
// WTF-8 allows lone surrogates.
248+
if (std::is_same<InputCharsT, UTF8Chars>::value &&
249+
MOZ_UNLIKELY(ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))
250+
{
239251
return INVALID_UTF8;
240252
}
241253

242254
return ucs4Char;
243255
}
244256

257+
// Public UTF-8 entry point: forwards to the shared implementation
// instantiated for UTF8Chars. With that instantiation, a decoded value in the
// surrogate range 0xD800-0xDFFF yields INVALID_UTF8.
uint32_t
258+
JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
259+
{
260+
return Utf8ToOneUcs4CharImpl<UTF8Chars>(utf8Buffer, utf8Length);
261+
}
262+
245263
static void
246264
ReportInvalidCharacter(JSContext* cx, uint32_t offset)
247265
{
@@ -276,13 +294,13 @@ enum class OnUTF8Error {
276294
Crash,
277295
};
278296

279-
// Scan UTF8 input and (internally, at least) convert it to a series of UTF-16
280-
// code units. But you can also do odd things like pass an empty lambda for
281-
// `dst`, in which case the output is discarded entirely--the only effect of
282-
// calling the template that way is error-checking.
283-
template <OnUTF8Error ErrorAction, typename OutputFn>
297+
// Scan UTF-8 or WTF-8 input and (internally, at least) convert it to a series
298+
// of UTF-16 code units. But you can also do odd things like pass an empty
299+
// lambda for `dst`, in which case the output is discarded entirely--the only
300+
// effect of calling the template that way is error-checking.
301+
template <OnUTF8Error ErrorAction, typename OutputFn, class InputCharsT>
284302
static bool
285-
InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
303+
InflateUTF8ToUTF16(JSContext* cx, const InputCharsT src, OutputFn dst)
286304
{
287305
size_t srclen = src.length();
288306
for (uint32_t i = 0; i < srclen; i++) {
@@ -339,7 +357,15 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
339357
(v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF
340358
(v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F
341359
{
342-
INVALID(ReportInvalidCharacter, i, 1);
360+
if (std::is_same<InputCharsT, UTF8Chars>::value) {
361+
INVALID(ReportInvalidCharacter, i, 1);
362+
} else {
363+
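// WTF-8 allows a lone surrogate, encoded as ED A0~BF 80~BF.
// NOTE(review): in the WTF-8 case only the ED lead byte is re-checked here,
// so an F0 or F4 lead byte that satisfied the enclosing range check is not
// reported at this point -- confirm it is rejected later.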
364+
MOZ_ASSERT((std::is_same<InputCharsT, WTF8Chars>::value));
365+
if (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) { // ED A0~BF
366+
INVALID(ReportInvalidCharacter, i, 1);
367+
}
368+
}
343369
}
344370

345371
// Check the continuation bytes.
@@ -350,7 +376,7 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
350376
}
351377

352378
// Determine the code unit's length in CharT and act accordingly.
353-
v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
379+
v = Utf8ToOneUcs4CharImpl<InputCharsT>((uint8_t*)&src[i], n);
354380
if (v < 0x10000) {
355381
// The n-byte UTF8 code unit will fit in a single CharT.
356382
if (dst(char16_t(v)) == LoopDisposition::Break) {
@@ -383,9 +409,10 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
383409
return true;
384410
}
385411

386-
template <OnUTF8Error ErrorAction, typename CharT>
412+
template <OnUTF8Error ErrorAction, typename CharT, class InputCharsT>
387413
static void
388-
CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src, CharT *dst, size_t outlen, bool allASCII)
414+
CopyAndInflateUTF8IntoBuffer(JSContext* cx, const InputCharsT src, CharT* dst, size_t outlen,
415+
bool allASCII)
389416
{
390417
if (allASCII) {
391418
size_t srclen = src.length();
@@ -405,9 +432,9 @@ CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src, CharT *dst, siz
405432
dst[outlen] = CharT('\0'); // NUL char
406433
}
407434

408-
template <OnUTF8Error ErrorAction, typename CharsT>
435+
template <OnUTF8Error ErrorAction, typename CharsT, class InputCharsT>
409436
static CharsT
410-
InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
437+
InflateUTF8StringHelper(JSContext* cx, const InputCharsT src, size_t* outlen)
411438
{
412439
using CharT = typename CharsT::CharT;
413440
static_assert(std::is_same<CharT, char16_t>::value ||
@@ -448,6 +475,12 @@ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* out
448475
return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(cx, utf8, outlen);
449476
}
450477

478+
// WTF-8 counterpart of UTF8CharsToNewTwoByteCharsZ: calls the same inflation
// helper with OnUTF8Error::Throw and TwoByteCharsZ output, but passes
// WTF8Chars input so the helper is instantiated for WTF-8.
TwoByteCharsZ
479+
JS::WTF8CharsToNewTwoByteCharsZ(JSContext* cx, const WTF8Chars wtf8, size_t* outlen)
480+
{
481+
return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(cx, wtf8, outlen);
482+
}
483+
451484
TwoByteCharsZ
452485
JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
453486
{
@@ -518,8 +551,9 @@ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t*
518551
* consumption.
519552
*/
520553

554+
template <class InputCharsT>
521555
bool
522-
GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
556+
GetUTF8AtomizationData(JSContext* cx, const InputCharsT utf8, size_t* outlen,
523557
JS::SmallestEncoding* encoding, HashNumber* hashNum)
524558
{
525559
*outlen = 0;
@@ -539,6 +573,15 @@ GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
539573
return true;
540574
}
541575

576+
template
577+
bool
578+
GetUTF8AtomizationData<JS::UTF8Chars>(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
579+
JS::SmallestEncoding* encoding, HashNumber* hashNum);
580+
template
581+
bool
582+
GetUTF8AtomizationData<JS::WTF8Chars>(JSContext* cx, const JS::WTF8Chars utf8, size_t* outlen,
583+
JS::SmallestEncoding* encoding, HashNumber* hashNum);
584+
542585
template <typename CharT>
543586
bool
544587
UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars)
@@ -575,9 +618,9 @@ UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars)
575618
template bool UTF8EqualsChars<char16_t>(const JS::UTF8Chars, const char16_t*);
576619
template bool UTF8EqualsChars<JS::Latin1Char>(const JS::UTF8Chars, const JS::Latin1Char*);
577620

578-
template <typename CharT>
621+
template <typename CharT, class InputCharsT>
579622
void
580-
InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
623+
InflateUTF8CharsToBufferAndTerminate(const InputCharsT src, CharT* dst, size_t dstLen,
581624
JS::SmallestEncoding encoding)
582625
{
583626
CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(/* cx = */ nullptr, src, dst, dstLen,
@@ -590,6 +633,12 @@ InflateUTF8CharsToBufferAndTerminate<char16_t>(const UTF8Chars src, char16_t* ds
590633
template void
591634
InflateUTF8CharsToBufferAndTerminate<JS::Latin1Char>(const UTF8Chars src, JS::Latin1Char* dst,
592635
size_t dstLen, JS::SmallestEncoding encoding);
636+
template void
637+
InflateUTF8CharsToBufferAndTerminate<char16_t>(const WTF8Chars src, char16_t* dst, size_t dstLen,
638+
JS::SmallestEncoding encoding);
639+
template void
640+
InflateUTF8CharsToBufferAndTerminate<JS::Latin1Char>(const WTF8Chars src, JS::Latin1Char* dst,
641+
size_t dstLen, JS::SmallestEncoding encoding);
593642

594643
#ifdef DEBUG
595644
void

js/src/vm/JSAtom.cpp

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,17 @@ using mozilla::Maybe;
4242
using mozilla::Nothing;
4343
using mozilla::RangedPtr;
4444

45-
template <typename CharT>
46-
extern void InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
45+
template <typename CharT, typename InputCharsT>
46+
extern void InflateUTF8CharsToBufferAndTerminate(const InputCharsT src, CharT* dst, size_t dstLen,
4747
JS::SmallestEncoding encoding);
4848

4949
template <typename CharT>
5050
extern bool UTF8EqualsChars(const JS::UTF8Chars utf8, const CharT* chars);
5151

52+
template <typename InputCharsT>
5253
extern bool
53-
GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen, JS::SmallestEncoding* encoding,
54-
HashNumber* hashNum);
54+
GetUTF8AtomizationData(JSContext* cx, const InputCharsT utf8, size_t* outlen,
55+
JS::SmallestEncoding* encoding, HashNumber* hashNum);
5556

5657
struct js::AtomHasher::Lookup
5758
{
@@ -866,16 +867,19 @@ PermanentlyAtomizeAndCopyChars(JSContext* cx,
866867
return atom;
867868
}
868869

869-
struct AtomizeUTF8CharsWrapper
870+
template <typename CharsT>
871+
struct AtomizeUTF8OrWTF8CharsWrapper
870872
{
871-
JS::UTF8Chars utf8;
873+
CharsT utf8;
872874
JS::SmallestEncoding encoding;
873875

874-
AtomizeUTF8CharsWrapper(const JS::UTF8Chars& chars, JS::SmallestEncoding minEncode)
876+
AtomizeUTF8OrWTF8CharsWrapper(const CharsT& chars, JS::SmallestEncoding minEncode)
875877
: utf8(chars), encoding(minEncode)
876878
{ }
877879
};
878880

881+
// MakeFlatStringForAtomization has 4 variants.
882+
// This template covers two of them: Latin1Char and char16_t.
879883
template <typename CharT>
880884
MOZ_ALWAYS_INLINE
881885
static JSFlatString*
@@ -884,10 +888,10 @@ MakeFlatStringForAtomization(JSContext* cx, const CharT* tbchars, size_t length)
884888
return NewStringCopyN<NoGC>(cx, tbchars, length);
885889
}
886890

887-
template<typename CharT>
891+
template<typename CharT, typename WrapperT>
888892
MOZ_ALWAYS_INLINE
889893
static JSFlatString*
890-
MakeUTF8AtomHelper(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
894+
MakeUTF8AtomHelper(JSContext* cx, const WrapperT* chars, size_t length)
891895
{
892896
if (JSInlineString::lengthFits<CharT>(length)) {
893897
CharT* storage;
@@ -920,10 +924,14 @@ MakeUTF8AtomHelper(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t l
920924
return str;
921925
}
922926

923-
template<>
927+
// The other 2 variants of MakeFlatStringForAtomization.
928+
// This template is used with AtomizeUTF8OrWTF8CharsWrapper over UTF8Chars or WTF8Chars.
929+
template<typename InputCharsT>
924930
MOZ_ALWAYS_INLINE
925931
/* static */ JSFlatString*
926-
MakeFlatStringForAtomization(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
932+
MakeFlatStringForAtomization(JSContext* cx,
933+
const AtomizeUTF8OrWTF8CharsWrapper<InputCharsT>* chars,
934+
size_t length)
927935
{
928936
if (length == 0) {
929937
return cx->emptyString();
@@ -1041,8 +1049,9 @@ js::AtomizeChars(JSContext* cx, const Latin1Char* chars, size_t length, PinningB
10411049
template JSAtom*
10421050
js::AtomizeChars(JSContext* cx, const char16_t* chars, size_t length, PinningBehavior pin);
10431051

1052+
template <typename CharsT>
10441053
JSAtom*
1045-
js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
1054+
AtomizeUTF8OrWTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
10461055
{
10471056
// Since the static strings are all ascii, we can check them before trying anything else.
10481057
if (JSAtom* s = cx->staticStrings().lookup(utf8Chars, utf8ByteLength)) {
@@ -1052,16 +1061,28 @@ js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength
10521061
size_t length;
10531062
HashNumber hash;
10541063
JS::SmallestEncoding forCopy;
1055-
UTF8Chars utf8(utf8Chars, utf8ByteLength);
1064+
CharsT utf8(utf8Chars, utf8ByteLength);
10561065
if (!GetUTF8AtomizationData(cx, utf8, &length, &forCopy, &hash)) {
10571066
return nullptr;
10581067
}
10591068

1060-
AtomizeUTF8CharsWrapper chars(utf8, forCopy);
1069+
AtomizeUTF8OrWTF8CharsWrapper<CharsT> chars(utf8, forCopy);
10611070
AtomHasher::Lookup lookup(utf8Chars, utf8ByteLength, length, hash);
10621071
return AtomizeAndCopyCharsFromLookup(cx, &chars, length, lookup, DoNotPinAtom, Nothing());
10631072
}
10641073

1074+
// UTF-8 entry point: forwards utf8Chars / utf8ByteLength unchanged to the
// shared AtomizeUTF8OrWTF8Chars template, instantiated with UTF8Chars, and
// returns its result.
JSAtom*
1075+
js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
1076+
{
1077+
return AtomizeUTF8OrWTF8Chars<UTF8Chars>(cx, utf8Chars, utf8ByteLength);
1078+
}
1079+
1080+
// WTF-8 entry point: forwards wtf8Chars / wtf8ByteLength unchanged to the
// shared AtomizeUTF8OrWTF8Chars template, instantiated with WTF8Chars, and
// returns its result.
JSAtom*
1081+
js::AtomizeWTF8Chars(JSContext* cx, const char* wtf8Chars, size_t wtf8ByteLength)
1082+
{
1083+
return AtomizeUTF8OrWTF8Chars<WTF8Chars>(cx, wtf8Chars, wtf8ByteLength);
1084+
}
1085+
10651086
bool
10661087
js::IndexToIdSlow(JSContext* cx, uint32_t index, MutableHandleId idp)
10671088
{

js/src/vm/JSAtom.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ AtomizeChars(JSContext* cx, const CharT* chars, size_t length,
6969
extern JSAtom*
7070
AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength);
7171

72+
extern JSAtom*
73+
AtomizeWTF8Chars(JSContext* cx, const char* wtf8Chars, size_t wtf8ByteLength);
74+
7275
extern JSAtom*
7376
AtomizeString(JSContext* cx, JSString* str, js::PinningBehavior pin = js::DoNotPinAtom);
7477

0 commit comments

Comments
 (0)