From 8a735e58e6804be1e6a125678d1a8d116ad54651 Mon Sep 17 00:00:00 2001 From: peterdm Date: Fri, 14 Sep 2018 21:15:32 +0200 Subject: [PATCH 1/2] XALANJ-2617 Fixed serializer such that it correctly deals with high-surrogate UTF-16 characters --- src/org/apache/xml/serializer/ToStream.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/org/apache/xml/serializer/ToStream.java b/src/org/apache/xml/serializer/ToStream.java index 1134eb7ee..5c1f74172 100644 --- a/src/org/apache/xml/serializer/ToStream.java +++ b/src/org/apache/xml/serializer/ToStream.java @@ -1594,6 +1594,13 @@ else if (ch == CharInfo.S_LINE_SEPARATOR) { writer.write("&#8232;"); lastDirtyCharProcessed = i; } + else if (Encodings.isHighUTF16Surrogate(ch)) { + // As of Java 1.5, we could use Character.isHighSurrogate(ch), + // but this codebase needs to be Java 1.3 compliant (even though that is seriously outdated), + // which is why we settle for Encodings.isHighUTF16Surrogate(ch). + lastDirtyCharProcessed = processDirty(chars, end, i, ch, lastDirtyCharProcessed, true); + i = lastDirtyCharProcessed; + } else if (m_encodingInfo.isInEncoding(ch)) { // If the character is in the encoding, and // not in the normal ASCII range, we also From 0edbce9f3f2708ecc697dab043ba8fd82076a7ad Mon Sep 17 00:00:00 2001 From: peterdm Date: Fri, 19 Oct 2018 21:50:01 +0200 Subject: [PATCH 2/2] XALANJ-2617 Fixed serializer for high-surrogate UTF-16 characters also for attribute values --- src/org/apache/xml/serializer/ToStream.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/org/apache/xml/serializer/ToStream.java b/src/org/apache/xml/serializer/ToStream.java index 5c1f74172..322fef2bb 100644 --- a/src/org/apache/xml/serializer/ToStream.java +++ b/src/org/apache/xml/serializer/ToStream.java @@ -2109,6 +2109,7 @@ public void writeAttrString( } string.getChars(0,len, m_attrBuff, 0); final char[] stringChars = m_attrBuff; + int lastDirtyCharProcessed = -1; for (int i = 0; i < len; i++) { @@ -2118,7 +2119,7 @@ public void writeAttrString( // The character is supposed to be replaced by a String // e.g. '&' --> "&amp;" // e.g. 
'<' --> "&lt;" - accumDefaultEscape(writer, ch, i, stringChars, len, false, true); + lastDirtyCharProcessed = accumDefaultEscape(writer, ch, i, stringChars, len, false, true); } else { if (0x0 <= ch && ch <= 0x1F) { @@ -2140,17 +2141,21 @@ public void writeAttrString( case CharInfo.S_HORIZONAL_TAB: writer.write("&#9;"); + lastDirtyCharProcessed = i; break; case CharInfo.S_LINEFEED: writer.write("&#10;"); + lastDirtyCharProcessed = i; break; case CharInfo.S_CARRIAGERETURN: writer.write("&#13;"); + lastDirtyCharProcessed = i; break; default: writer.write("&#"); writer.write(Integer.toString(ch)); writer.write(';'); + lastDirtyCharProcessed = i; break; } @@ -2159,6 +2164,7 @@ else if (ch < 0x7F) { // Range 0x20 through 0x7E inclusive // Normal ASCII chars writer.write(ch); + lastDirtyCharProcessed = i; } else if (ch <= 0x9F){ // Range 0x7F through 0x9F inclusive @@ -2166,16 +2172,23 @@ else if (ch <= 0x9F){ writer.write("&#"); writer.write(Integer.toString(ch)); writer.write(';'); + lastDirtyCharProcessed = i; } else if (ch == CharInfo.S_LINE_SEPARATOR) { // LINE SEPARATOR writer.write("&#8232;"); + lastDirtyCharProcessed = i; + } + else if (Encodings.isHighUTF16Surrogate(ch)) { + lastDirtyCharProcessed = processDirty(stringChars, len, i, ch, lastDirtyCharProcessed, false); + i = lastDirtyCharProcessed; } else if (m_encodingInfo.isInEncoding(ch)) { // If the character is in the encoding, and // not in the normal ASCII range, we also // just write it out writer.write(ch); + lastDirtyCharProcessed = i; } else { // This is a fallback plan, we should never get here @@ -2185,6 +2198,7 @@ else if (m_encodingInfo.isInEncoding(ch)) { writer.write("&#"); writer.write(Integer.toString(ch)); writer.write(';'); + lastDirtyCharProcessed = i; } }