Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip UTF-8 BOM mark in EncodingDetectingInputStream and default to UTF-8 in RewriteTest #4546

Merged
merged 10 commits into from
Oct 3, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,27 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class EncodingDetectingInputStream extends InputStream {
private static final Charset WINDOWS_1252 = Charset.forName("Windows-1252");
private static final byte[] UTF8_BOM = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};

private final InputStream inputStream;

@Nullable
private Charset charset;

private boolean bomChecked;
private boolean charsetBomMarked;

/**
* Last byte read
*/
private int prev;
private int prev2;
private int prev3;

boolean maybeTwoByteSequence = false;
boolean maybeThreeByteSequence = false;
Expand All @@ -64,71 +66,68 @@ public boolean isCharsetBomMarked() {

@Override
public int read() throws IOException {
int aByte = inputStream.read();
int read;
if (!bomChecked) {
read = checkAndSkipUtf8Bom();
if (charsetBomMarked) {
read = inputStream.read();
}
} else {
read = inputStream.read();
}


// if we haven't yet determined a charset...
if (charset == null) {
guessCharset(aByte);
if (read == -1) {
if (charset == null) {
if (maybeTwoByteSequence || maybeThreeByteSequence || maybeFourByteSequence) {
charset = WINDOWS_1252;
} else {
charset = StandardCharsets.UTF_8;
}
}
} else if (charset == null) {
guessCharset(read);
}
return aByte;
return read;
}

private void guessCharset(int aByte) {
if (prev3 == 0xEF && prev2 == 0xBB && prev == 0xBF) {
charsetBomMarked = true;
charset = StandardCharsets.UTF_8;
} else {
if (aByte == -1 || !(prev2 == 0 && prev == 0xEF || prev3 == 0 && prev2 == 0xEF)) {
if (maybeTwoByteSequence) {
if (aByte == -1 && !utf8SequenceEnd(prev) || aByte != -1 && !(utf8SequenceEnd(aByte))) {
charset = WINDOWS_1252;
} else {
maybeTwoByteSequence = false;
prev2 = -1;
prev = -1;
}
} else if (maybeThreeByteSequence) {
if (aByte == -1 ||
utf8SequenceEnd(prev) && !(utf8SequenceEnd(aByte)) ||
!utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

if (utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeThreeByteSequence = false;
prev2 = -1;
prev = -1;
}
} else if (maybeFourByteSequence) {
if (aByte == -1 ||
utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) ||
utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) ||
!(utf8SequenceEnd(aByte))) {
charset = WINDOWS_1252;
}

if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeFourByteSequence = false;
prev2 = -1;
prev = -1;
}
} else if (utf8TwoByteSequence(aByte)) {
maybeTwoByteSequence = true;
} else if (utf8ThreeByteSequence(aByte)) {
maybeThreeByteSequence = true;
} else if (utf8FourByteSequence(aByte)) {
maybeFourByteSequence = true;
} else if (!utf8TwoByteSequence(prev) && utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}
if (utf8TwoByteSequence(aByte)) {
maybeTwoByteSequence = true;
} else if (utf8ThreeByteSequence(aByte)) {
maybeThreeByteSequence = true;
} else if (utf8FourByteSequence(aByte)) {
maybeFourByteSequence = true;
} else if (maybeTwoByteSequence) {
if (!utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
} else {
maybeTwoByteSequence = false;
prev = -1;
}
} else if (maybeThreeByteSequence) {
if (!utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

if (aByte == -1 && charset == null) {
charset = StandardCharsets.UTF_8;
if (utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeThreeByteSequence = false;
prev = -1;
}
} else if (maybeFourByteSequence) {
if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || !utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
maybeFourByteSequence = false;
prev = -1;
}
} else if (utf8SequenceEnd(aByte)) {
charset = WINDOWS_1252;
}

prev3 = prev2;
prev2 = prev;
prev = aByte;
}
Expand All @@ -143,14 +142,36 @@ public synchronized String toString() {
};
byte[] buffer = new byte[4096];
int n;
// Note that `is` is this, so the BOM will be checked in `read()`
while ((n = is.read(buffer)) != -1) {
bos.write(buffer, 0, n);
}

return bos.toString();
} catch (IOException e) {
throw new UnsupportedOperationException(e);
throw new UncheckedIOException(e);
}
}

/**
 * Checks, once, whether the wrapped stream starts with the UTF-8 byte order mark
 * ({@code EF BB BF}) and consumes it when present.
 * <p>
 * Sets {@code bomChecked} unconditionally. When a complete BOM is found, also sets
 * {@code charsetBomMarked} and fixes {@code charset} to UTF-8.
 * <p>
 * When the wrapped stream supports mark/reset, a partial match (for example content that
 * starts with another three-byte UTF-8 sequence led by {@code 0xEF}) is rewound so that no
 * content byte is lost. Streams without mark/reset support fall back to reading one byte at
 * a time; see the note in that branch.
 *
 * @return the first content byte (or {@code -1} at end of stream) when no BOM is present;
 *         the sentinel {@code -2} when a BOM was consumed, in which case the caller must
 *         read the next byte itself
 * @throws IOException if reading from, or resetting, the wrapped stream fails
 */
private int checkAndSkipUtf8Bom() throws IOException {
    bomChecked = true;

    if (inputStream.markSupported()) {
        // Remember the start so that a partial match can be undone without losing bytes.
        inputStream.mark(UTF8_BOM.length);
        boolean bomFound = true;
        for (byte expected : UTF8_BOM) {
            int candidate = inputStream.read();
            if (candidate == -1 || (byte) candidate != expected) {
                bomFound = false;
                break;
            }
        }
        if (!bomFound) {
            // Rewind and hand back the real first byte (or -1 for an empty stream).
            inputStream.reset();
            return inputStream.read();
        }
    } else {
        // `Files#newInputStream()` does not need to support mark/reset, so one at a time...
        // NOTE(review): on this path a partial match (0xEF, or 0xEF 0xBB, followed by a
        // different byte) still drops the bytes already consumed. Fixing that needs
        // push-back state outside this method.
        int read = inputStream.read();
        if ((byte) read != UTF8_BOM[0]) {
            return read;
        }
        read = inputStream.read();
        if ((byte) read != UTF8_BOM[1]) {
            return read;
        }
        read = inputStream.read();
        if ((byte) read != UTF8_BOM[2]) {
            return read;
        }
    }

    charsetBomMarked = true;
    charset = StandardCharsets.UTF_8;
    // return anything other than -1
    return -2;
}

// The first byte of a UTF-8 two byte sequence is between 0xC0 - 0xDF.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ void detectUTF8Bom() throws IOException {
}
}

/**
 * A leading UTF-8 BOM must be stripped from the content that is read back,
 * while still being reported through {@code isCharsetBomMarked()}.
 */
@Test
void skipUTF8Bom() throws IOException {
    // U+FEFF is encoded in UTF-8 as the three BOM bytes EF BB BF.
    byte[] bomPrefixedBytes = "\uFEFFhello".getBytes(UTF_8);
    try (EncodingDetectingInputStream detectingStream =
                 new EncodingDetectingInputStream(new ByteArrayInputStream(bomPrefixedBytes))) {
        assertThat(detectingStream.readFully()).isEqualTo("hello");
        assertThat(detectingStream.isCharsetBomMarked()).isTrue();
    }
}

@Test
void isUtf8() throws IOException {
List<String> accents = Arrays.asList("Café", "Lýðræðisríki");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import org.openrewrite.test.RewriteTest;
import org.openrewrite.tree.ParseError;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Base64;
Expand Down Expand Up @@ -938,8 +937,7 @@ public MockResponse dispatch(RecordedRequest request) {
var ctx = MavenExecutionContextView.view(new InMemoryExecutionContext(t -> {
throw new RuntimeException(t);
}));
var settings = MavenSettings.parse(new Parser.Input(Paths.get("settings.xml"), () ->
new ByteArrayInputStream(
var settings = MavenSettings.parse(Parser.Input.fromString(Paths.get("settings.xml"),
//language=xml
"""
<settings>
Expand All @@ -959,8 +957,8 @@ public MockResponse dispatch(RecordedRequest request) {
</server>
</servers>
</settings>
""".formatted(mockRepo.getHostName(), mockRepo.getPort(), username, password).getBytes()
)), ctx);
""".formatted(mockRepo.getHostName(), mockRepo.getPort(), username, password)
), ctx);
ctx.setMavenSettings(settings);

var maven = MavenParser.builder().build().parse(
Expand Down
Loading