Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JVM] Reimplement nqp::encode for utf-16 #749

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 159 additions & 12 deletions src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java
Original file line number Diff line number Diff line change
Expand Up @@ -4295,21 +4295,168 @@ private static String javaEncodingName(String nameIn) {
}

/**
 * Encodes {@code str} as UTF-16 code units and stores them into the native
 * array {@code res}, packing the 16-bit units low-order first into elements
 * of whatever width the array has:
 * 8-bit arrays get two bytes per code unit (low byte first), 16-bit arrays
 * get one code unit per element, 32-bit arrays get two code units per element,
 * and every other array type gets four code units per element.
 *
 * Every value is handed to the array by assigning {@code tc.native_i} and then
 * calling {@code res.bind_pos_native(tc, index)}.
 *
 * NOTE(review): this listing is a rendered diff whose +/- markers were
 * stripped, so lines of the removed implementation appear interleaved with the
 * new one (see the NOTE(review) comments below) -- confirm against the real file.
 *
 * @param str the string to encode
 * @param res the target array; it is resized via {@code set_elems} and filled in place
 * @param tc  the current thread context, used to pass native values to {@code res}
 */
private static void encodeUTF16(String str, SixModelObject res, ThreadContext tc) {
/* NOTE(review): the lines from here through `arr.slots = buffer;` look like the
 * removed pre-patch code, which copied each Java char into a short[] and
 * installed it directly as the backing store of an i16 array. */
short[] buffer = new short[str.length()];
for (int i = 0; i < str.length(); i++)
buffer[i] = (short)str.charAt(i);
if (res instanceof VMArrayInstance_i16) {
VMArrayInstance_i16 arr = (VMArrayInstance_i16)res;
arr.elems = buffer.length;
arr.start = 0;
arr.slots = buffer;
/* New implementation: first collect the code points of the string. A surrogate
 * pair in str yields a single code point, so bufPos can end up smaller than
 * str.length(). */
int[] buffer = new int[str.length()]; /* Can be an overestimate. */
int bufPos = 0;
for (int i = 0; i < str.length(); ) {
int cp = str.codePointAt(i);
buffer[bufPos++] = cp;
i += Character.charCount(cp); // advance by 1 or 2 chars depending on the code point
}

/* In the following code we ignore that:
 * - 2-byte codepoints between 0xD800 and 0xDFFF are invalid
 * - 4-byte codepoints greater than 0x10FFFF are invalid */
if (res instanceof VMArrayInstance_i8 || res instanceof VMArrayInstance_u8) {
/* The loop below writes 2 bytes per code point below 0x10000 and 4 bytes per
 * code point above, i.e. 2 bytes per Java char, so str.length() * 2 would
 * already be enough. */
res.set_elems(tc, str.length() * 4); /* Very likely too much. */
int elems = 0; // number of bytes written so far
for (int i = 0; i < bufPos; i++) {
int cp = buffer[i];
if (cp < 0x10000) { // 2-byte UTF-16
// Low byte first, then high byte.
tc.native_i = cp & 0xFF;
res.bind_pos_native(tc, elems++);
tc.native_i = cp >> 8;
res.bind_pos_native(tc, elems++);
}
else { // 4-byte UTF-16
/* Get a 20-bit value between 0x00000 and 0xFFFFF. */
cp -= 0x10000;
cp &= 0xFFFFF;
int highWord = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
int lowWord = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
// High surrogate first, each 16-bit word again written low byte first.
tc.native_i = highWord & 0xFF;
res.bind_pos_native(tc, elems++);
tc.native_i = highWord >> 8;
res.bind_pos_native(tc, elems++);
tc.native_i = lowWord & 0xFF;
res.bind_pos_native(tc, elems++);
tc.native_i = lowWord >> 8;
res.bind_pos_native(tc, elems++);
}
}
res.set_elems(tc, elems); /* Shorten VMArray as needed. */
}
else if (res instanceof VMArrayInstance_i16 || res instanceof VMArrayInstance_u16) {
/* One element per UTF-16 code unit is written below, so str.length()
 * elements would already be enough. */
res.set_elems(tc, str.length() * 2); /* Likely too much. */
int elems = 0; // number of 16-bit elements written so far
for (int i = 0; i < bufPos; i++) {
int cp = buffer[i];
if (cp < 0x10000) { // 2-byte UTF-16
tc.native_i = cp;
res.bind_pos_native(tc, elems++);
}
else { // 4-byte UTF-16
/* Get a 20-bit value between 0x00000 and 0xFFFFF. */
cp -= 0x10000;
cp &= 0xFFFFF;
tc.native_i = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
res.bind_pos_native(tc, elems++);
tc.native_i = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
res.bind_pos_native(tc, elems++);
}
}
res.set_elems(tc, elems); /* Shorten VMArray as needed. */
}
else if (res instanceof VMArrayInstance_i32 || res instanceof VMArrayInstance_u32) {
/* Two 16-bit code units are packed into each element: the first unit goes
 * into bits 0-15, the second into bits 16-31. */
res.set_elems(tc, str.length()); /* Likely too much. */
int elems = 0; // number of 32-bit elements written so far
long cpTemp = 0; // pending code unit waiting for its partner
byte elemsTemp = 0; // 1 while cpTemp holds a pending unit, otherwise 0
for (int i = 0; i < bufPos; i++) {
long cp = buffer[i];
if (cp < 0x10000) { // 2-byte UTF-16
if (elemsTemp == 0) {
// Nothing pending: remember this unit as the low half.
cpTemp = cp;
elemsTemp = 1;
}
else {
// Complete the element with this unit as the high half.
tc.native_i = cpTemp + (cp << 16);
res.bind_pos_native(tc, elems++);
cpTemp = 0;
elemsTemp = 0;
}
}
else { // 4-byte UTF-16
/* Get a 20-bit value between 0x00000 and 0xFFFFF. */
cp -= 0x10000;
cp &= 0xFFFFF;
long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
if (elemsTemp == 0) {
// The pair fills exactly one element.
tc.native_i = highSurrogate + (lowSurrogate << 16);
res.bind_pos_native(tc, elems++);
}
else {
// The high surrogate completes the pending element; the low surrogate
// becomes the new pending unit, so elemsTemp deliberately stays 1.
tc.native_i = cpTemp + (highSurrogate << 16);
res.bind_pos_native(tc, elems++);
cpTemp = lowSurrogate;
}
}
}
if (elemsTemp != 0) {
// Flush a trailing single unit; the upper 16 bits of that element stay zero.
tc.native_i = cpTemp;
res.bind_pos_native(tc, elems++);
}
res.set_elems(tc, elems); /* Shorten VMArray as needed. */
}
else {
/* NOTE(review): the next four lines look like the removed pre-patch fallback,
 * which stored one buffer entry per element; in a diff view they would carry
 * '-' markers. */
res.set_elems(tc, buffer.length);
for (int i = 0; i < buffer.length; i++) {
tc.native_i = buffer[i];
res.bind_pos_native(tc, i);
/* New fallback: four 16-bit code units are packed into each element, at bit
 * offsets 0, 16, 32 and 48. Presumably this branch is meant for 64-bit
 * arrays -- the code does not check the array type here. */
res.set_elems(tc, (str.length() + 1) / 2); /* Likely too much. */
int elems = 0; // number of elements written so far
long cpTemp = 0; // partially filled element
byte elemsTemp = 0; // number of code units currently held in cpTemp (0..3)
for (int i = 0; i < bufPos; i++) {
long cp = buffer[i];
if (cp < 0x10000) { // 2-byte UTF-16
if (elemsTemp == 0) {
cpTemp = cp;
elemsTemp = 1;
}
else {
if (elemsTemp == 3) { // buffer elem full
// This unit is the fourth one: write the element and start over.
tc.native_i = cpTemp + (cp << 48);
res.bind_pos_native(tc, elems++);
cpTemp = 0;
elemsTemp = 0;
}
else {
// Place the unit in the next free 16-bit slot, then bump the count.
cpTemp += cp << (elemsTemp++ * 16);
}
}
}
else { // 4-byte UTF-16
/* Get a 20-bit value between 0x00000 and 0xFFFFF. */
cp -= 0x10000;
cp &= 0xFFFFF;
long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits
long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits
if (elemsTemp == 0) {
// The pair occupies slots 0 and 1 of a fresh element.
cpTemp = highSurrogate + (lowSurrogate << 16);
elemsTemp = 2;
}
else {
if (elemsTemp == 3) { // buffer elem full
// Only the high surrogate fits; the low surrogate starts the next element.
tc.native_i = cpTemp + (highSurrogate << 48);
res.bind_pos_native(tc, elems++);
cpTemp = lowSurrogate;
elemsTemp = 1;
}
else if (elemsTemp == 2) {
// The pair fills slots 2 and 3 and completes the element.
tc.native_i = cpTemp + (highSurrogate << 32) + (lowSurrogate << 48);
res.bind_pos_native(tc, elems++);
cpTemp = 0;
elemsTemp = 0;
}
else { // elemsTemp == 1
// The pair fills slots 1 and 2; one slot remains free.
cpTemp += (highSurrogate << 16) + (lowSurrogate << 32);
elemsTemp = 3;
}
}
}
}
if (elemsTemp != 0) {
// Flush a partially filled element; unused upper slots stay zero.
tc.native_i = cpTemp;
res.bind_pos_native(tc, elems++);
}
res.set_elems(tc, elems); /* Shorten VMArray as needed. */
}
}

Expand Down
73 changes: 72 additions & 1 deletion t/nqp/082-decode.t
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use nqpmo;

plan(30);
plan(46);

my sub create_buf($type) {
my $buf := nqp::newtype(nqp::null(), 'VMArray');
Expand All @@ -10,6 +10,8 @@ my sub create_buf($type) {

my $buf8 := create_buf(uint8);
my $buf16 := create_buf(uint16);
my $buf32 := create_buf(uint32);
my $buf64 := create_buf(uint64);

my $buf := nqp::encode('', 'utf8', nqp::create($buf8));

Expand Down Expand Up @@ -153,3 +155,72 @@ else {
}, 'encode dies with missing character');
}
}

$buf := nqp::encode('abc', 'utf16', nqp::create($buf8));
is(buf_dump($buf), '97,0,98,0,99,0', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint8');
is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back');

$buf := nqp::encode('abc', 'utf16', nqp::create($buf16));
is(buf_dump($buf), '97,98,99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint16');
is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back');

$buf := nqp::encode('abc', 'utf16', nqp::create($buf32));
if nqp::getcomp('nqp').backend.name eq 'moar' {
todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1);
ok(0);
}
else {
is(buf_dump($buf), ~(97 + 98 * 2**16) ~ ',99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint32');
## TODO This returns "abc\0". Maybe that's just correct?
#is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back');
}

$buf := nqp::encode('abcd', 'utf16', nqp::create($buf64));
is(buf_dump($buf), (97 + 98 * 2**16 + 99 * 2**32 + 100 * 2**48), 'nqp::encode 4-chars ascii string with utf16 using buffer of type uint64');
if nqp::getcomp('nqp').backend.name eq 'jvm' {
todo('Unknown buf type in nqp::decode', 1);
ok(0);
}
else {
is(nqp::decode($buf, "utf16"), 'abcd', 'nqp::decode gives original string back');
}

## "\x1F63E" 'POUTING CAT FACE' is a code point from the supplementary planes and requires two 16-bit code units (0xD83D,0xDE3E)
$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf8));
is(buf_dump($buf), '61,216,62,222', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint8');
is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back');

$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf16));
is(buf_dump($buf), '55357,56894', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint16');
is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back');

$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf32));
is(buf_dump($buf), ~(55357 + 56894 * 2**16), 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint32');
if nqp::getcomp('nqp').backend.name eq 'jvm' {
todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFDE3E', 1);
ok(0);
}
else {
is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back');
}

$buf := nqp::encode('a' ~ "\x1F63E" ~ 'bcd', 'utf16', nqp::create($buf32));
is(buf_dump($buf), (97 + 55357 * 2**16) ~ ',' ~ (56894 + 98 * 2**16) ~ ',' ~ (99 + 100 * 2**16),
'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint32');
if nqp::getcomp('nqp').backend.name eq 'jvm' {
todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFD83D', 1);
ok(0);
}
else {
is(nqp::decode($buf, "utf16"), 'a' ~ "\x1F63E" ~ 'bcd', 'nqp::decode gives original string back');
}

$buf := nqp::encode('a' ~ "\x1F63E" ~ 'bcd', 'utf16', nqp::create($buf64));
if nqp::getcomp('nqp').backend.name eq 'moar' {
todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1);
ok(0);
}
else {
is(buf_dump($buf), (97 + 55357 * 2**16 + 56894 * 2**32 + 98 * 2**48) ~ ',' ~ (99 + 100 * 2**16),
'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint64');
}