diff --git a/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java b/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java
index 6d567236..89014851 100644
--- a/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java
+++ b/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java
@@ -27,6 +27,7 @@
import org.antlr.v4.runtime.TokenStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
@@ -54,12 +55,20 @@ public BSLParser.FileContext parseFile(File file) {
}
private void prepareParser(Path path) {
+
CharStream input;
+
try {
- input = CharStreams.fromPath(path, StandardCharsets.UTF_8);
+ FileInputStream fis = new FileInputStream(path.toAbsolutePath().toString());
+
+ UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);
+ ubis.skipBOM();
+
+ input = CharStreams.fromStream(ubis, StandardCharsets.UTF_8);
} catch (IOException e) {
throw new RuntimeException(e);
}
+
lexer.setInputStream(input);
CommonTokenStream tokenStream = new CommonTokenStream(lexer);
diff --git a/src/main/java/org/github/_1c_syntax/bsl/parser/UnicodeBOMInputStream.java b/src/main/java/org/github/_1c_syntax/bsl/parser/UnicodeBOMInputStream.java
new file mode 100644
index 00000000..93ff9c91
--- /dev/null
+++ b/src/main/java/org/github/_1c_syntax/bsl/parser/UnicodeBOMInputStream.java
@@ -0,0 +1,318 @@
+/*
+ * This file is a part of BSL Parser.
+ *
+ * Copyright © 2018-2019
+ * Alexey Sosnoviy
+ *
+ * The UnicodeBOMInputStream class wraps any InputStream and detects the
+ * presence of any Unicode BOM (Byte Order Mark) at its beginning, as defined by
+ * RFC 3629 - UTF-8, a transformation format of ISO 10646.
+ *
+ * The Unicode FAQ defines 5 types of BOMs:
+ *
+ *   00 00 FE FF = UTF-32, big-endian
+ *   FF FE 00 00 = UTF-32, little-endian
+ *   FE FF       = UTF-16, big-endian
+ *   FF FE       = UTF-16, little-endian
+ *   EF BB BF    = UTF-8
+ *
+ * Use the {@link #getBOM()} method to know whether a BOM has been detected
+ * or not.
+ *
+ * Use the {@link #skipBOM()} method to remove the detected BOM from the
+ * wrapped InputStream object.
+ */
+ /**
+  * Returns a String representation of this BOM value.
+  */
+ @Override
+ public final String toString() {
+   // The description stored by the constructor is the textual form of this BOM.
+   return description;
+ }
+
+ /**
+  * Returns the bytes that make up this BOM value.
+  *
+  * @return a newly allocated copy of the BOM bytes; modifying the returned
+  *         array does not affect this BOM.
+  */
+ public final byte[] getBytes() {
+   // clone() produces the same defensive copy as allocate-then-arraycopy.
+   return bytes.clone();
+ }
+
+ /**
+  * Creates a BOM value from its byte sequence and its textual description.
+  * The arguments are validated with {@code assert} only, so the checks are
+  * active only when assertions are enabled.
+  *
+  * @param bom         the bytes of the mark; the array is stored as-is, not copied.
+  * @param description the text stored as this BOM's description.
+  */
+ private BOM(final byte[] bom, final String description) {
+   assert bom != null : "invalid BOM: null is not allowed";
+   assert description != null : "invalid description: null is not allowed";
+   assert !description.isEmpty() : "invalid description: empty string is not allowed";
+
+   this.description = description;
+   this.bytes = bom;
+ }
+
+ final byte bytes[]; // raw BOM bytes; skipBOM() reads its length directly, getBytes() hands out a copy
+ private final String description; // text returned by toString()
+
+ } // BOM
+
+ /**
+  * Constructs a new UnicodeBOMInputStream that wraps the specified
+  * InputStream.
+  *
+  * <p>Up to four leading bytes are read to detect a BOM and are then pushed
+  * back, so the BOM itself is still readable from this stream until
+  * {@link #skipBOM()} is called.</p>
+  *
+  * @param inputStream an InputStream.
+  *
+  * @throws NullPointerException when inputStream is null.
+  * @throws IOException on reading from the specified InputStream when
+  *                     trying to detect the Unicode BOM.
+  */
+ public UnicodeBOMInputStream(final InputStream inputStream)
+     throws NullPointerException, IOException {
+   if (inputStream == null) {
+     throw new NullPointerException("invalid input stream: null is not allowed");
+   }
+
+   in = new PushbackInputStream(inputStream, 4);
+
+   final byte[] head = new byte[4];
+
+   // A single read() may return fewer bytes than requested even when more
+   // follow, which would hide a BOM; keep reading until the buffer is full
+   // or the stream ends.
+   int read = 0;
+   while (read < head.length) {
+     final int count = in.read(head, read, head.length - read);
+     if (count <= 0) {
+       break;
+     }
+     read += count;
+   }
+
+   // Longest signatures are tested first: the UTF-32 LE mark (FF FE 00 00)
+   // starts with the UTF-16 LE mark (FF FE).
+   final BOM detected;
+   if (read >= 4
+       && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE
+       && head[2] == (byte) 0x00 && head[3] == (byte) 0x00) {
+     detected = BOM.UTF_32_LE;
+   } else if (read >= 4
+       && head[0] == (byte) 0x00 && head[1] == (byte) 0x00
+       && head[2] == (byte) 0xFE && head[3] == (byte) 0xFF) {
+     detected = BOM.UTF_32_BE;
+   } else if (read >= 3
+       && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
+     detected = BOM.UTF_8;
+   } else if (read >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
+     detected = BOM.UTF_16_LE;
+   } else if (read >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
+     detected = BOM.UTF_16_BE;
+   } else {
+     detected = BOM.NONE;
+   }
+   this.bom = detected;
+
+   // Return every inspected byte to the stream so nothing is lost to readers.
+   if (read > 0) {
+     in.unread(head, 0, read);
+   }
+ }
+
+ /**
+  * Returns the BOM that the constructor detected at the start of the wrapped
+  * InputStream.
+  *
+  * @return the detected BOM value.
+  */
+ public final BOM getBOM() {
+   // The field is final and assigned once in the constructor.
+   return this.bom;
+ }
+
+ /**
+  * Skips the BOM that was found in the wrapped InputStream object.
+  *
+  * <p>The bytes are discarded from the current read position, so call this
+  * before reading anything from the stream. Only the first call has an
+  * effect; later calls return immediately.</p>
+  *
+  * @return this UnicodeBOMInputStream.
+  *
+  * @throws IOException when trying to skip the BOM from the wrapped
+  *                     InputStream object.
+  */
+ public final synchronized UnicodeBOMInputStream skipBOM() throws IOException {
+   if (!skipped) {
+     long remaining = bom.bytes.length;
+     remaining -= in.skip(remaining);
+
+     // skip() may skip fewer bytes than requested; consume any shortfall one
+     // byte at a time, stopping early only at end of stream.
+     while (remaining > 0 && in.read() >= 0) {
+       remaining--;
+     }
+
+     skipped = true;
+   }
+   return this;
+ }
+
+ /**
+  * Reads the next byte by delegating to the internal pushback stream.
+  *
+  * @return the value returned by the pushback stream's {@code read()}.
+  * @throws IOException if the pushback stream fails to read.
+  */
+ @Override
+ public int read() throws IOException {
+   final int next = in.read();
+   return next;
+ }
+
+ /**
+  * Reads into the whole of {@code b} by delegating to the internal pushback
+  * stream.
+  *
+  * @param b the destination buffer.
+  * @return the value returned by the pushback stream's bulk read.
+  * @throws IOException          if the pushback stream fails to read.
+  * @throws NullPointerException if {@code b} is null.
+  */
+ @Override
+ public int read(final byte[] b) throws IOException, NullPointerException {
+   final int length = b.length;
+   return in.read(b, 0, length);
+ }
+
+ /**
+  * Reads up to {@code len} bytes into {@code b} starting at {@code off} by
+  * delegating to the internal pushback stream.
+  *
+  * @param b   the destination buffer.
+  * @param off the offset in {@code b} at which to start storing bytes.
+  * @param len the maximum number of bytes to read.
+  * @return the value returned by the pushback stream's bulk read.
+  * @throws IOException          if the pushback stream fails to read.
+  * @throws NullPointerException declared by the original signature and kept as-is.
+  */
+ @Override
+ public int read(final byte[] b, final int off, final int len)
+     throws IOException, NullPointerException {
+   final int count = in.read(b, off, len);
+   return count;
+ }
+
+ /**
+  * Skips bytes by delegating to the internal pushback stream.
+  *
+  * @param n the number of bytes to skip.
+  * @return the number of bytes reported as skipped by the pushback stream.
+  * @throws IOException if the pushback stream fails to skip.
+  */
+ @Override
+ public long skip(final long n) throws IOException {
+   final long skippedCount = in.skip(n);
+   return skippedCount;
+ }
+
+ /**
+  * Reports the available byte count by delegating to the internal pushback
+  * stream.
+  *
+  * @return the value returned by the pushback stream's {@code available()}.
+  * @throws IOException if the pushback stream fails to report it.
+  */
+ @Override
+ public int available() throws IOException {
+   final int count = in.available();
+   return count;
+ }
+
+ /**
+  * Closes the internal pushback stream.
+  *
+  * @throws IOException if closing the pushback stream fails.
+  */
+ @Override
+ public void close() throws IOException {
+   this.in.close();
+ }
+
+ /**
+  * Forwards the mark request to the internal pushback stream.
+  * NOTE(review): java.io.PushbackInputStream documents {@code mark} as doing
+  * nothing, so this call is effectively a no-op.
+  *
+  * @param readlimit the read limit passed through unchanged.
+  */
+ @Override
+ public synchronized void mark(final int readlimit) {
+   this.in.mark(readlimit);
+ }
+
+ /**
+  * Forwards the reset request to the internal pushback stream.
+  * NOTE(review): java.io.PushbackInputStream documents {@code reset} as always
+  * throwing IOException because mark/reset is unsupported.
+  *
+  * @throws IOException as raised by the pushback stream.
+  */
+ @Override
+ public synchronized void reset() throws IOException {
+   this.in.reset();
+ }
+
+ /**
+  * Reports mark/reset support by asking the internal pushback stream.
+  *
+  * @return the value returned by the pushback stream's {@code markSupported()}.
+  */
+ @Override
+ public boolean markSupported() {
+   final boolean supported = in.markSupported();
+   return supported;
+ }
+
+ private final PushbackInputStream in; // wrapper created in the constructor with a 4-byte pushback buffer; all reads go through it
+ private final BOM bom; // detection result, assigned once by the constructor
+ private boolean skipped = false; // set by skipBOM() so the BOM bytes are discarded at most once
+
+} // UnicodeBOMInputStream
diff --git a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java
index 19a8c9a8..bd873d84 100644
--- a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java
+++ b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java
@@ -30,10 +30,10 @@
import java.io.IOException;
import java.io.InputStream;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.List;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
class BSLLexerTest {
@@ -44,13 +44,19 @@ private void assertMatch(String inputString, Integer... expectedTokens) {
}
private void assertMatch(int mode, String inputString, Integer... expectedTokens) {
- InputStream inputStream = IOUtils.toInputStream(inputString, Charset.forName("UTF-8"));
CharStream input;
+
try {
- input = CharStreams.fromStream(inputStream, Charset.forName("UTF-8"));
+ InputStream inputStream = IOUtils.toInputStream(inputString, StandardCharsets.UTF_8);
+
+ UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(inputStream);
+ ubis.skipBOM();
+
+ input = CharStreams.fromStream(ubis, StandardCharsets.UTF_8);
} catch (IOException e) {
throw new RuntimeException(e);
}
+
lexer.setInputStream(input);
lexer.mode(mode);
@@ -65,6 +71,11 @@ private void assertMatch(int mode, String inputString, Integer... expectedTokens
assertArrayEquals(expectedTokens, tokenTypes);
}
+ @Test
+ void testBOM() {
+   // Source text that starts with U+FEFF; the only expected token is the keyword.
+   String source = '\uFEFF' + "Процедура";
+   assertMatch(source, BSLLexer.PROCEDURE_KEYWORD);
+ }
+
@Test
void testUse() {
assertMatch(BSLLexer.PREPROCESSOR_MODE, "Использовать lib", BSLLexer.PREPROC_USE_KEYWORD, BSLLexer.PREPROC_IDENTIFIER);
diff --git a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java
index 24bb7d82..36f50ed7 100644
--- a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java
+++ b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java
@@ -32,7 +32,7 @@
import java.io.IOException;
import java.io.InputStream;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import static org.junit.jupiter.api.Assertions.assertThrows;
@@ -45,13 +45,19 @@ private void setInput(String inputString) {
}
private void setInput(String inputString, int mode) {
- InputStream inputStream = IOUtils.toInputStream(inputString, Charset.forName("UTF-8"));
CharStream input;
+
try {
- input = CharStreams.fromStream(inputStream, Charset.forName("UTF-8"));
+ InputStream inputStream = IOUtils.toInputStream(inputString, StandardCharsets.UTF_8);
+
+ UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(inputStream);
+ ubis.skipBOM();
+
+ input = CharStreams.fromStream(ubis, StandardCharsets.UTF_8);
} catch (IOException e) {
throw new RuntimeException(e);
}
+
lexer.setInputStream(input);
lexer.mode(mode);