diff --git a/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java b/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java index b20811c85..35a4cb4d8 100644 --- a/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java +++ b/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java @@ -4295,21 +4295,168 @@ private static String javaEncodingName(String nameIn) { } private static void encodeUTF16(String str, SixModelObject res, ThreadContext tc) { - short[] buffer = new short[str.length()]; - for (int i = 0; i < str.length(); i++) - buffer[i] = (short)str.charAt(i); - if (res instanceof VMArrayInstance_i16) { - VMArrayInstance_i16 arr = (VMArrayInstance_i16)res; - arr.elems = buffer.length; - arr.start = 0; - arr.slots = buffer; + int[] buffer = new int[str.length()]; /* Can be an overestimate. */ + int bufPos = 0; + for (int i = 0; i < str.length(); ) { + int cp = str.codePointAt(i); + buffer[bufPos++] = cp; + i += Character.charCount(cp); + } + + /* In the following code we ignore that: + * - 2-byte codepoints between 0xD800 and 0xDFFF are invalid + * - 4-byte codepoints greater than 0x10FFFF are invalid */ + if (res instanceof VMArrayInstance_i8 || res instanceof VMArrayInstance_u8) { + res.set_elems(tc, str.length() * 4); /* Very likely too much. */ + int elems = 0; + for (int i = 0; i < bufPos; i++) { + int cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + tc.native_i = cp & 0xFF; + res.bind_pos_native(tc, elems++); + tc.native_i = cp >> 8; + res.bind_pos_native(tc, elems++); + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. 
*/ + cp -= 0x10000; + cp &= 0xFFFFF; + int highWord = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + int lowWord = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + tc.native_i = highWord & 0xFF; + res.bind_pos_native(tc, elems++); + tc.native_i = highWord >> 8; + res.bind_pos_native(tc, elems++); + tc.native_i = lowWord & 0xFF; + res.bind_pos_native(tc, elems++); + tc.native_i = lowWord >> 8; + res.bind_pos_native(tc, elems++); + } + } + res.set_elems(tc, elems); /* Shorten VMArray as needed. */ + } + else if (res instanceof VMArrayInstance_i16 || res instanceof VMArrayInstance_u16) { + res.set_elems(tc, str.length() * 2); /* Likely too much. */ + int elems = 0; + for (int i = 0; i < bufPos; i++) { + int cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + tc.native_i = cp; + res.bind_pos_native(tc, elems++); + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. */ + cp -= 0x10000; + cp &= 0xFFFFF; + tc.native_i = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + res.bind_pos_native(tc, elems++); + tc.native_i = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + res.bind_pos_native(tc, elems++); + } + } + res.set_elems(tc, elems); /* Shorten VMArray as needed. */ + } + else if (res instanceof VMArrayInstance_i32 || res instanceof VMArrayInstance_u32) { + res.set_elems(tc, str.length()); /* Likely too much. */ + int elems = 0; + long cpTemp = 0; + byte elemsTemp = 0; + for (int i = 0; i < bufPos; i++) { + long cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + if (elemsTemp == 0) { + cpTemp = cp; + elemsTemp = 1; + } + else { + tc.native_i = cpTemp + (cp << 16); + res.bind_pos_native(tc, elems++); + cpTemp = 0; + elemsTemp = 0; + } + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. 
*/ + cp -= 0x10000; + cp &= 0xFFFFF; + long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + if (elemsTemp == 0) { + tc.native_i = highSurrogate + (lowSurrogate << 16); + res.bind_pos_native(tc, elems++); + } + else { + tc.native_i = cpTemp + (highSurrogate << 16); + res.bind_pos_native(tc, elems++); + cpTemp = lowSurrogate; + } + } + } + if (elemsTemp != 0) { + tc.native_i = cpTemp; + res.bind_pos_native(tc, elems++); + } + res.set_elems(tc, elems); /* Shorten VMArray as needed. */ } else { - res.set_elems(tc, buffer.length); - for (int i = 0; i < buffer.length; i++) { - tc.native_i = buffer[i]; - res.bind_pos_native(tc, i); + res.set_elems(tc, (str.length() + 1) / 2); /* Likely too much. */ + int elems = 0; + long cpTemp = 0; + byte elemsTemp = 0; + for (int i = 0; i < bufPos; i++) { + long cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + if (elemsTemp == 0) { + cpTemp = cp; + elemsTemp = 1; + } + else { + if (elemsTemp == 3) { // buffer elem full + tc.native_i = cpTemp + (cp << 48); + res.bind_pos_native(tc, elems++); + cpTemp = 0; + elemsTemp = 0; + } + else { + cpTemp += cp << (elemsTemp++ * 16); + } + } + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. 
*/ + cp -= 0x10000; + cp &= 0xFFFFF; + long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + if (elemsTemp == 0) { + cpTemp = highSurrogate + (lowSurrogate << 16); + elemsTemp = 2; + } + else { + if (elemsTemp == 3) { // buffer elem full + tc.native_i = cpTemp + (highSurrogate << 48); + res.bind_pos_native(tc, elems++); + cpTemp = lowSurrogate; + elemsTemp = 1; + } + else if (elemsTemp == 2) { + tc.native_i = cpTemp + (highSurrogate << 32) + (lowSurrogate << 48); + res.bind_pos_native(tc, elems++); + cpTemp = 0; + elemsTemp = 0; + } + else { // elemsTemp == 1 + cpTemp += (highSurrogate << 16) + (lowSurrogate << 32); + elemsTemp = 3; + } + } + } + } + if (elemsTemp != 0) { + tc.native_i = cpTemp; + res.bind_pos_native(tc, elems++); } + res.set_elems(tc, elems); /* Shorten VMArray as needed. */ } } diff --git a/t/nqp/082-decode.t b/t/nqp/082-decode.t index a24907c7b..72090282e 100644 --- a/t/nqp/082-decode.t +++ b/t/nqp/082-decode.t @@ -1,6 +1,6 @@ use nqpmo; -plan(30); +plan(46); my sub create_buf($type) { my $buf := nqp::newtype(nqp::null(), 'VMArray'); @@ -10,6 +10,8 @@ my sub create_buf($type) { my $buf8 := create_buf(uint8); my $buf16 := create_buf(uint16); +my $buf32 := create_buf(uint32); +my $buf64 := create_buf(uint64); my $buf := nqp::encode('', 'utf8', nqp::create($buf8)); @@ -153,3 +155,72 @@ else { }, 'encode dies with missing character'); } } + +$buf := nqp::encode('abc', 'utf16', nqp::create($buf8)); +is(buf_dump($buf), '97,0,98,0,99,0', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint8'); +is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back'); + +$buf := nqp::encode('abc', 'utf16', nqp::create($buf16)); +is(buf_dump($buf), '97,98,99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint16'); +is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back'); + +$buf := nqp::encode('abc', 
'utf16', nqp::create($buf32)); +if nqp::getcomp('nqp').backend.name eq 'moar' { + todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1); + ok(0); +} +else { + is(buf_dump($buf), ~(97 + 98 * 2**16) ~ ',99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint32'); + ## TODO This returns "abc\0". Maybe that's just correct? + #is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back'); +} + +$buf := nqp::encode('abcd', 'utf16', nqp::create($buf64)); +is(buf_dump($buf), (97 + 98 * 2**16 + 99 * 2**32 + 100 * 2**48), 'nqp::encode 4-chars ascii string with utf16 using buffer of type uint64'); +if nqp::getcomp('nqp').backend.name eq 'jvm' { + todo('Unknown buf type in nqp::decode', 1); + ok(0); +} +else { + is(nqp::decode($buf, "utf16"), 'abcd', 'nqp::decode gives original string back'); +} + +## "\x1F63E" 'POUTING CAT FACE' is a code point from the supplementary planes and requires two 16-bit code units (0xD83D,0xDE3E) +$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf8)); +is(buf_dump($buf), '61,216,62,222', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint8'); +is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back'); + +$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf16)); +is(buf_dump($buf), '55357,56894', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint16'); +is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back'); + +$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf32)); +is(buf_dump($buf), ~(55357 + 56894 * 2**16), 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint32'); +if nqp::getcomp('nqp').backend.name eq 'jvm' { + todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFDE3E', 1); + ok(0); +} +else { + is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back'); +} + +$buf := nqp::encode('a' ~ "\x1F63E" ~ 
'bcd', 'utf16', nqp::create($buf32)); +is(buf_dump($buf), (97 + 55357 * 2**16) ~ ',' ~ (56894 + 98 * 2**16) ~ ',' ~ (99 + 100 * 2**16), + 'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint32'); +if nqp::getcomp('nqp').backend.name eq 'jvm' { + todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFD83D', 1); + ok(0); +} +else { + is(nqp::decode($buf, "utf16"), 'a' ~ "\x1F63E" ~ 'bcd', 'nqp::decode gives original string back'); +} + +$buf := nqp::encode('a' ~ "\x1F63E" ~ 'bcd', 'utf16', nqp::create($buf64)); +if nqp::getcomp('nqp').backend.name eq 'moar' { + todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1); + ok(0); +} +else { + is(buf_dump($buf), (97 + 55357 * 2**16 + 56894 * 2**32 + 98 * 2**48) ~ ',' ~ (99 + 100 * 2**16), + 'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint64'); +}