Raku · usev6 · Nov 26, 2021
diff --git a/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java b/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java
@@ -4295,21 +4295,168 @@ private static String javaEncodingName(String nameIn) {
     }
 
     private static void encodeUTF16(String str, SixModelObject res, ThreadContext tc) {
-        short[] buffer = new short[str.length()];
-        for (int i = 0; i < str.length(); i++)
-            buffer[i] = (short)str.charAt(i);
-        if (res instanceof VMArrayInstance_i16) {
-            VMArrayInstance_i16 arr = (VMArrayInstance_i16)res;
-            arr.elems = buffer.length;
-            arr.start = 0;
-            arr.slots = buffer;
+        int[] buffer = new int[str.length()]; /* Code points of str. Sized by the UTF-16 length, so it can be an overestimate. */
+        int bufPos = 0; // number of code points stored in buffer
+        for (int i = 0; i < str.length(); ) { // i is advanced inside the loop, by one or two chars
+            int cp = str.codePointAt(i);
+            buffer[bufPos++] = cp;
+            i += Character.charCount(cp); // 2 chars for a supplementary code point, otherwise 1
+        }
+
+        /* Write the code points to res as UTF-16 code units; the layout depends on the type of res. In the following code we ignore that:
+         * - 2-byte codepoints between 0xD800 and 0xDFFF are invalid
+         * - 4-byte codepoints greater than 0x10FFFF are invalid */
+        if (res instanceof VMArrayInstance_i8 || res instanceof VMArrayInstance_u8) { // each 16-bit unit becomes two elements, low byte first
+            res.set_elems(tc, str.length() * 4); /* Very likely too much. */
+            int elems = 0; // number of elements written so far
+            for (int i = 0; i < bufPos; i++) {
+                int cp = buffer[i];
+                if (cp < 0x10000) { // 2-byte UTF-16
+                    tc.native_i = cp & 0xFF; // low byte
+                    res.bind_pos_native(tc, elems++); // presumably stores tc.native_i at that index -- confirm in the VMArray classes
+                    tc.native_i = cp >> 8; // high byte
+                    res.bind_pos_native(tc, elems++);
+                }
+                else { // 4-byte UTF-16
+                    /* Get a 20-bit value between 0x00000 and 0xFFFFF. */
+                    cp -= 0x10000;
+                    cp &= 0xFFFFF;
+                    int highWord = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
+                    int lowWord = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
+                    tc.native_i = highWord & 0xFF; // high surrogate, low byte
+                    res.bind_pos_native(tc, elems++);
+                    tc.native_i = highWord >> 8; // high surrogate, high byte
+                    res.bind_pos_native(tc, elems++);
+                    tc.native_i = lowWord & 0xFF; // low surrogate, low byte
+                    res.bind_pos_native(tc, elems++);
+                    tc.native_i = lowWord >> 8; // low surrogate, high byte
+                    res.bind_pos_native(tc, elems++);
+                }
+            }
+            res.set_elems(tc, elems); /* Shorten VMArray as needed. */
+        }
+        else if (res instanceof VMArrayInstance_i16 || res instanceof VMArrayInstance_u16) { // one 16-bit unit per element
+            res.set_elems(tc, str.length() * 2); /* Likely too much. */
+            int elems = 0; // number of elements written so far
+            for (int i = 0; i < bufPos; i++) {
+                int cp = buffer[i];
+                if (cp < 0x10000) { // 2-byte UTF-16
+                    tc.native_i = cp;
+                    res.bind_pos_native(tc, elems++);
+                }
+                else { // 4-byte UTF-16
+                    /* Get a 20-bit value between 0x00000 and 0xFFFFF. */
+                    cp -= 0x10000;
+                    cp &= 0xFFFFF;
+                    tc.native_i = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
+                    res.bind_pos_native(tc, elems++);
+                    tc.native_i = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
+                    res.bind_pos_native(tc, elems++);
+                }
+            }
+            res.set_elems(tc, elems); /* Shorten VMArray as needed. */
+        }
+        else if (res instanceof VMArrayInstance_i32 || res instanceof VMArrayInstance_u32) { // two 16-bit units per element, first unit in the low 16 bits
+            res.set_elems(tc, str.length()); /* Likely too much. */
+            int elems = 0; // number of elements written so far
+            long cpTemp = 0; // 16-bit unit waiting for its partner
+            byte elemsTemp = 0; // number of units pending in cpTemp (0 or 1 in this branch)
+            for (int i = 0; i < bufPos; i++) {
+                long cp = buffer[i];
+                if (cp < 0x10000) { // 2-byte UTF-16
+                    if (elemsTemp == 0) {
+                        cpTemp = cp; // nothing pending: hold this unit back
+                        elemsTemp = 1;
+                    }
+                    else {
+                        tc.native_i = cpTemp + (cp << 16); // pending unit in the low half, this unit in the high half
+                        res.bind_pos_native(tc, elems++);
+                        cpTemp = 0;
+                        elemsTemp = 0;
+                    }
+                }
+                else { // 4-byte UTF-16
+                    /* Get a 20-bit value between 0x00000 and 0xFFFFF. */
+                    cp -= 0x10000;
+                    cp &= 0xFFFFF;
+                    long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
+                    long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
+                    if (elemsTemp == 0) {
+                        tc.native_i = highSurrogate + (lowSurrogate << 16); // the pair fills one whole element
+                        res.bind_pos_native(tc, elems++);
+                    }
+                    else {
+                        tc.native_i = cpTemp + (highSurrogate << 16); // high surrogate completes the pending element
+                        res.bind_pos_native(tc, elems++);
+                        cpTemp = lowSurrogate; // low surrogate becomes the pending unit; elemsTemp stays 1
+                    }
+                }
+            }
+            if (elemsTemp != 0) { // flush a trailing, half-filled element
+                tc.native_i = cpTemp;
+                res.bind_pos_native(tc, elems++);
+            }
+            res.set_elems(tc, elems); /* Shorten VMArray as needed. */
         }
         else {
-            res.set_elems(tc, buffer.length);
-            for (int i = 0; i < buffer.length; i++) {
-                tc.native_i = buffer[i];
-                res.bind_pos_native(tc, i);
+            res.set_elems(tc, (str.length() + 1) / 2); /* Any other type of res: up to four 16-bit units are packed per element. Likely too much. */
+            int elems = 0; // number of elements written so far
+            long cpTemp = 0; // units collected for the element being assembled, first unit in the lowest 16 bits
+            byte elemsTemp = 0; // number of units pending in cpTemp (0 to 3)
+            for (int i = 0; i < bufPos; i++) {
+                long cp = buffer[i];
+                if (cp < 0x10000) { // 2-byte UTF-16
+                    if (elemsTemp == 0) {
+                        cpTemp = cp; // start a new element
+                        elemsTemp = 1;
+                    }
+                    else {
+                        if (elemsTemp == 3) { // buffer elem full
+                            tc.native_i = cpTemp + (cp << 48); // this unit takes the top 16 bits
+                            res.bind_pos_native(tc, elems++);
+                            cpTemp = 0;
+                            elemsTemp = 0;
+                        }
+                        else {
+                            cpTemp += cp << (elemsTemp++ * 16); // place the unit in the next free 16-bit slot, then count it
+                        }
+                    }
+                }
+                else { // 4-byte UTF-16
+                    /* Get a 20-bit value between 0x00000 and 0xFFFFF. */
+                    cp -= 0x10000;
+                    cp &= 0xFFFFF;
+                    long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
+                    long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
+                    if (elemsTemp == 0) {
+                        cpTemp = highSurrogate + (lowSurrogate << 16); // pair occupies the first two slots
+                        elemsTemp = 2;
+                    }
+                    else {
+                        if (elemsTemp == 3) { // buffer elem full
+                            tc.native_i = cpTemp + (highSurrogate << 48); // only the high surrogate still fits
+                            res.bind_pos_native(tc, elems++);
+                            cpTemp = lowSurrogate; // low surrogate starts the next element
+                            elemsTemp = 1;
+                        }
+                        else if (elemsTemp == 2) {
+                            tc.native_i = cpTemp + (highSurrogate << 32) + (lowSurrogate << 48); // pair fills the upper two slots
+                            res.bind_pos_native(tc, elems++);
+                            cpTemp = 0;
+                            elemsTemp = 0;
+                        }
+                        else { // elemsTemp == 1
+                            cpTemp += (highSurrogate << 16) + (lowSurrogate << 32); // pair goes into slots two and three
+                            elemsTemp = 3;
+                        }
+                    }
+                }
+            }
+            if (elemsTemp != 0) { // flush a trailing, partially filled element
+                tc.native_i = cpTemp;
+                res.bind_pos_native(tc, elems++);
             }
+            res.set_elems(tc, elems); /* Shorten VMArray as needed. */
         }
     }
 

diff --git a/t/nqp/082-decode.t b/t/nqp/082-decode.t
@@ -1,6 +1,6 @@
 use nqpmo;
 
-plan(30);
+plan(46); # 16 more than before: the utf16 encode/decode tests added at the end of this file
 
 my sub create_buf($type) {
     my $buf := nqp::newtype(nqp::null(), 'VMArray');
@@ -10,6 +10,8 @@ my sub create_buf($type) {
 
 my $buf8 := create_buf(uint8);
 my $buf16 := create_buf(uint16);
+my $buf32 := create_buf(uint32); # used by the utf16 tests that expect two 16-bit code units per element
+my $buf64 := create_buf(uint64); # used by the utf16 tests that expect four 16-bit code units per element
 
 my $buf := nqp::encode('', 'utf8', nqp::create($buf8));
 
@@ -153,3 +155,72 @@ else {
       }, 'encode dies with missing character');
     }
 }
+
+$buf := nqp::encode('abc', 'utf16', nqp::create($buf8)); # expected dump: per character, the low byte followed by the high byte
+is(buf_dump($buf), '97,0,98,0,99,0', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint8');
+is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back');
+
+$buf := nqp::encode('abc', 'utf16', nqp::create($buf16)); # expected dump: one element per character
+is(buf_dump($buf), '97,98,99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint16');
+is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back');
+
+$buf := nqp::encode('abc', 'utf16', nqp::create($buf32)); # expected dump: 'a' and 'b' share one element ('a' in the low 16 bits), 'c' sits alone in a second one
+if nqp::getcomp('nqp').backend.name eq 'moar' { # known failure on the MoarVM backend, see the todo message
+    todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1);
+    ok(0);
+}
+else {
+    is(buf_dump($buf), ~(97 + 98 * 2**16) ~ ',99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint32');
+    ## TODO This returns "abc\0". Maybe that's just correct?
+    #is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back');
+}
+
+$buf := nqp::encode('abcd', 'utf16', nqp::create($buf64)); # expected dump: all four characters in a single element, 'a' in the lowest 16 bits
+is(buf_dump($buf), (97 + 98 * 2**16 + 99 * 2**32 + 100 * 2**48), 'nqp::encode 4-chars ascii string with utf16 using buffer of type uint64'); # NOTE(review): expected value is passed as a bare number, without the `~` used by the sibling tests -- confirm is() compares it as intended
+if nqp::getcomp('nqp').backend.name eq 'jvm' { # known failure on the JVM backend, see the todo message
+    todo('Unknown buf type in nqp::decode', 1);
+    ok(0);
+}
+else {
+    is(nqp::decode($buf, "utf16"), 'abcd', 'nqp::decode gives original string back');
+}
+
+## "\x1F63E" 'POUTING CAT FACE' is a code point from the supplementary planes and requires two 16-bit code units (0xD83D,0xDE3E)
+$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf8)); # expected bytes: 61,216 = 0x3D,0xD8 (0xD83D) and 62,222 = 0x3E,0xDE (0xDE3E)
+is(buf_dump($buf), '61,216,62,222', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint8');
+is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back');
+
+$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf16)); # expected elements: 55357 = 0xD83D, 56894 = 0xDE3E
+is(buf_dump($buf), '55357,56894', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint16');
+is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back');
+
+$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf32)); # expected: both surrogates in one element, 0xD83D in the low 16 bits
+is(buf_dump($buf), ~(55357 + 56894 * 2**16), 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint32');
+if nqp::getcomp('nqp').backend.name eq 'jvm' { # known decode failure on the JVM backend, see the todo message
+    todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFDE3E', 1);
+    ok(0);
+}
+else {
+    is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back');
+}
+
+$buf := nqp::encode('a' ~ "\x1F63E" ~ 'bcd', 'utf16', nqp::create($buf32)); # expected: the surrogate pair is split across the first and second element
+is(buf_dump($buf), (97 + 55357 * 2**16) ~ ',' ~ (56894 + 98 * 2**16) ~ ',' ~ (99 + 100 * 2**16),
+    'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint32');
+if nqp::getcomp('nqp').backend.name eq 'jvm' { # known decode failure on the JVM backend, see the todo message
+    todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFD83D', 1);
+    ok(0);
+}
+else {
+    is(nqp::decode($buf, "utf16"), 'a' ~ "\x1F63E" ~ 'bcd', 'nqp::decode gives original string back');
+}
+
+$buf := nqp::encode('a' ~ "\x1F63E" ~ 'bcd', 'utf16', nqp::create($buf64)); # six 16-bit code units: expected four in the first element, two in the second
+if nqp::getcomp('nqp').backend.name eq 'moar' { # known failure on the MoarVM backend, see the todo message
+    todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1);
+    ok(0);
+}
+else {
+    is(buf_dump($buf), (97 + 55357 * 2**16 + 56894 * 2**32 + 98 * 2**48) ~ ',' ~ (99 + 100 * 2**16),
+        'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint64');
+}