Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write 4-byte characters (surrogate pairs) instead of escapes #1335

Merged
merged 7 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions release-notes/CREDITS-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,15 @@ Antonin Janec (@xtonic)
* Contributed #1218: Simplify Unicode surrogate pair conversion for generation
(2.17.0)

Ian Roberts (@ianroberts)
* Reported #223: `UTF8JsonGenerator` writes supplementary characters as a
surrogate pair: should use 4-byte encoding
(2.18.0)

Radovan Netuka (@rnetuka)
* Contributed fix for #223: `UTF8JsonGenerator` writes supplementary characters as a
surrogate pair: should use 4-byte encoding
(2.18.0)

Jared Stehler (@jaredstehler)
* Reported, contributed fix for #1274: `NUL`-corrupted keys, values on JSON serialization
(2.18.0)
Expand Down
4 changes: 4 additions & 0 deletions release-notes/VERSION-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ a pure JSON library.

2.18.0 (not yet released)

#223: `UTF8JsonGenerator` writes supplementary characters as a surrogate pair:
should use 4-byte encoding
(reported by Ian R)
(fix contributed by Radovan N)
#1230: Improve performance of `float` and `double` parsing from `TextBuffer`
(implemented by @pjfanning)
#1251: `InternCache` replace synchronized with `ReentrantLock` - the cache
Expand Down
15 changes: 11 additions & 4 deletions src/main/java/com/fasterxml/jackson/core/JsonGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -269,13 +269,20 @@ public enum Feature {
WRITE_HEX_UPPER_CASE(true),

/**
* Feature that specifies whether {@link JsonGenerator} should escape forward slashes.
* <p>
* Feature is disabled by default for Jackson 2.x version, and enabled by default in Jackson 3.0.
* See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#ESCAPE_FORWARD_SLASHES}.
*
* @since 2.17
*/
ESCAPE_FORWARD_SLASHES(false);
ESCAPE_FORWARD_SLASHES(false),

/**
* See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#COMBINE_UNICODE_SURROGATES_IN_UTF8}.
*
* @since 2.18
*/
COMBINE_UNICODE_SURROGATES_IN_UTF8(false),

;

private final boolean _defaultState;
private final int _mask;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public enum JsonWriteFeature
{
// // // Support for non-standard data format constructs: comments

// // Quoting/ecsaping-related features
// // Quoting/escaping-related features

/**
* Feature that determines whether JSON Object field names are
Expand Down Expand Up @@ -117,6 +117,28 @@ public enum JsonWriteFeature
*/
ESCAPE_FORWARD_SLASHES(false, JsonGenerator.Feature.ESCAPE_FORWARD_SLASHES),

/**
* Feature that specifies how characters outside "Basic Multilingual Plane" (BMP) -- ones encoded
* as 4-byte UTF-8 sequences but represented in JVM memory as 2 16-bit "surrogate" {@code chars} --
* should be encoded as UTF-8 by {@link JsonGenerator}.
* If enabled, surrogate pairs are combined and flushed as a
* single, 4-byte UTF-8 character.
* If disabled, each {@code char} of the pair is written separately as a 6-byte JSON Unicode
* escape (backslash, {@code u} and 4 hex digits), so the pair takes 12 bytes of output;
* the escaped values are in the Surrogate character ranges
* ({@code 0xD800} - {@code 0xDBFF} and {@code 0xDC00} - {@code 0xDFFF}).
* <p>
* Note that this feature only has effect for {@link JsonGenerator}s that directly encode
* {@code byte}-based output, as UTF-8 (target {@link java.io.OutputStream}, {@code byte[]}
* and so on); it will not (can not) change handling of
* {@code char}-based output (like {@link java.io.Writer} or {@link java.lang.String}).
* <p>
* Feature is disabled by default in 2.x for backwards-compatibility (will be enabled
* in 3.0).
*
* @since 2.18
*/
COMBINE_UNICODE_SURROGATES_IN_UTF8(false, JsonGenerator.Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8),

;

final private boolean _defaultState;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1510,6 +1510,16 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
// 3- or 4-byte character
if (_isSurrogateChar(ch)) {
final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
if (combineSurrogates && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = cbuf[offset++];
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
continue;
}
}
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
Expand Down Expand Up @@ -1548,6 +1558,16 @@ private final void _writeStringSegment2(final String text, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
// 3- or 4-byte character
if (_isSurrogateChar(ch)) {
final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
if (combineSurrogates && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = text.charAt(offset++);
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
continue;
}
}
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
Expand Down Expand Up @@ -2133,6 +2153,19 @@ protected final void _outputSurrogates(int surr1, int surr2) throws IOException
bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
}

// @since 2.18
// Combines a UTF-16 surrogate pair into one code point and writes it into
// `_outputBuffer` as a single 4-byte UTF-8 sequence starting at `outputPtr`.
// Only the low 10 bits of each argument are used; neither is validated here.
// Returns the buffer index just past the last byte written.
private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
    final int upperBits = (highSurrogate & 0x03FF) << 10;
    final int lowerBits = lowSurrogate & 0x03FF;
    // The two 10-bit halves never overlap, so OR-ing them equals adding them
    final int codePoint = 0x10000 + (upperBits | lowerBits);
    final byte[] buf = _outputBuffer;

    // Lead byte 11110xxx followed by three continuation bytes 10xxxxxx
    buf[outputPtr] = (byte) (0xF0 | ((codePoint >> 18) & 0x07));
    buf[outputPtr + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
    buf[outputPtr + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
    buf[outputPtr + 3] = (byte) (0x80 | (codePoint & 0x3F));

    return outputPtr + 4;
}

/**
*
* @param ch
Expand Down Expand Up @@ -2214,5 +2247,10 @@ protected final void _flushBuffer() throws IOException
private byte[] getHexBytes() {
return _cfgWriteHexUppercase ? HEX_BYTES_UPPER : HEX_BYTES_LOWER;
}

// @since 2.18
// Tells whether `ch` is a UTF-16 surrogate code unit, i.e. lies in the range
// 0xD800 - 0xDFFF (high and low surrogates alike).
// All surrogates share the top five bits 11011, so all five must be tested:
// mask 0xF800, expected value 0xD800. Masking with 0xD800 instead would leave
// bit 13 unchecked and wrongly accept 0xF800 - 0xFFFF (for example U+FFFD),
// letting ordinary BMP characters be merged with the character that follows.
private boolean _isSurrogateChar(int ch) {
    return (ch & 0xF800) == 0xD800;
}
}

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.fasterxml.jackson.failing;
package com.fasterxml.jackson.core.json;

import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
Expand All @@ -9,10 +9,18 @@
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;

class Surrogate223Test extends JUnit5TestBase
{
private final JsonFactory JSON_F = new JsonFactory();
private final JsonFactory DEFAULT_JSON_F = newStreamFactory();

// for [core#223]
@Test
void surrogatesDefaultSetting() throws Exception {
// default in 2.x should be disabled:
assertFalse(DEFAULT_JSON_F.isEnabled(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.mappedFeature()));
}

// for [core#223]
@Test
Expand All @@ -23,36 +31,41 @@ void surrogatesByteBacked() throws Exception
final String toQuote = new String(Character.toChars(0x1F602));
assertEquals(2, toQuote.length()); // just sanity check

// default should be disabled:
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));

out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(out);

JsonFactory f = JsonFactory.builder()
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();
g = f.createGenerator(out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 4, out.size()); // brackets, quotes, 4-byte encoding

// Also parse back to ensure correctness
JsonParser p = JSON_F.createParser(out.toByteArray());
JsonParser p = f.createParser(out.toByteArray());
assertToken(JsonToken.START_ARRAY, p.nextToken());
assertToken(JsonToken.VALUE_STRING, p.nextToken());
assertEquals(toQuote, p.getText());
assertToken(JsonToken.END_ARRAY, p.nextToken());
p.close();

// but may revert back to original behavior
out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
f = JsonFactory.builder()
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();

g = f.createGenerator(out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 12, out.size()); // brackets, quotes, 2 x 6 byte JSON escape
}

// for [core#223]
// for [core#223]: no change for character-backed (cannot do anything)
@Test
void surrogatesCharBacked() throws Exception
{
Expand All @@ -61,32 +74,20 @@ void surrogatesCharBacked() throws Exception
final String toQuote = new String(Character.toChars(0x1F602));
assertEquals(2, toQuote.length()); // just sanity check

// default should be disabled:
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));

out = new StringWriter();
g = JSON_F.createGenerator(out);
g = DEFAULT_JSON_F.createGenerator(out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 2, out.toString().length()); // brackets, quotes, 2 chars as is

// Also parse back to ensure correctness
JsonParser p = JSON_F.createParser(out.toString());
JsonParser p = DEFAULT_JSON_F.createParser(out.toString());
assertToken(JsonToken.START_ARRAY, p.nextToken());
assertToken(JsonToken.VALUE_STRING, p.nextToken());
assertEquals(toQuote, p.getText());
assertToken(JsonToken.END_ARRAY, p.nextToken());
p.close();

// but may revert back to original behavior
out = new StringWriter();
g = JSON_F.createGenerator(out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 12, out.toString().length()); // brackets, quotes, 2 x 6 byte JSON escape
}
}