diff --git a/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java b/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java index 6d567236..89014851 100644 --- a/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java +++ b/src/main/java/org/github/_1c_syntax/bsl/parser/BSLExtendedParser.java @@ -27,6 +27,7 @@ import org.antlr.v4.runtime.TokenStream; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; @@ -54,12 +55,20 @@ public BSLParser.FileContext parseFile(File file) { } private void prepareParser(Path path) { + CharStream input; + try { - input = CharStreams.fromPath(path, StandardCharsets.UTF_8); + FileInputStream fis = new FileInputStream(path.toAbsolutePath().toString()); + + UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis); + ubis.skipBOM(); + + input = CharStreams.fromStream(ubis, StandardCharsets.UTF_8); } catch (IOException e) { throw new RuntimeException(e); } + lexer.setInputStream(input); CommonTokenStream tokenStream = new CommonTokenStream(lexer); diff --git a/src/main/java/org/github/_1c_syntax/bsl/parser/UnicodeBOMInputStream.java b/src/main/java/org/github/_1c_syntax/bsl/parser/UnicodeBOMInputStream.java new file mode 100644 index 00000000..93ff9c91 --- /dev/null +++ b/src/main/java/org/github/_1c_syntax/bsl/parser/UnicodeBOMInputStream.java @@ -0,0 +1,318 @@ +/* + * This file is a part of BSL Parser. + * + * Copyright © 2018-2019 + * Alexey Sosnoviy , Nikita Gryzlov , Sergey Batanov + * + * SPDX-License-Identifier: LGPL-3.0-or-later + * + * BSL Parser is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3.0 of the License, or (at your option) any later version. 
+ * + * BSL Parser is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with BSL Parser. + */ + +// Originally released by: +// (‑●‑●)> released under the WTFPL v2 license, by Gregory Pakosz (@gpakosz) + +package org.github._1c_syntax.bsl.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; + +/** + * The UnicodeBOMInputStream class wraps any + * InputStream and detects the presence of any Unicode BOM + * (Byte Order Mark) at its beginning, as defined by + * RFC 3629 - UTF-8, a + * transformation format of ISO 10646 + * + *

+ * <p>The Unicode FAQ defines 5 types of BOMs:</p>
+ * <ul>
+ * <li><pre>00 00 FE FF  = UTF-32, big-endian</pre></li>
+ * <li><pre>FF FE 00 00  = UTF-32, little-endian</pre></li>
+ * <li><pre>FE FF        = UTF-16, big-endian</pre></li>
+ * <li><pre>FF FE        = UTF-16, little-endian</pre></li>
+ * <li><pre>EF BB BF     = UTF-8</pre></li>
+ * </ul>
+ *
+ * <p>Use the {@link #getBOM()} method to know whether a BOM has been detected
+ * or not.</p>
+ *
+ * <p>Use the {@link #skipBOM()} method to remove the detected BOM from the
+ * wrapped <code>InputStream</code> object.</p>

+ * + * @author Gregory Pakosz + * @version 1.0 + */ +public class UnicodeBOMInputStream extends InputStream +{ + /** + * Type safe enumeration class that describes the different types of Unicode + * BOMs. + */ + public static final class BOM + { + /** + * NONE. + */ + public static final BOM NONE = new BOM(new byte[]{}, "NONE"); + + /** + * UTF-8 BOM (EF BB BF). + */ + public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF, + (byte)0xBB, + (byte)0xBF}, + "UTF-8"); + + /** + * UTF-16, little-endian (FF FE). + */ + public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF, + (byte)0xFE}, + "UTF-16 little-endian"); + + /** + * UTF-16, big-endian (FE FF). + */ + public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE, + (byte)0xFF}, + "UTF-16 big-endian"); + + /** + * UTF-32, little-endian (FF FE 00 00). + */ + public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF, + (byte)0xFE, + (byte)0x00, + (byte)0x00}, + "UTF-32 little-endian"); + + /** + * UTF-32, big-endian (00 00 FE FF). + */ + public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00, + (byte)0x00, + (byte)0xFE, + (byte)0xFF}, + "UTF-32 big-endian"); + + /** + * Returns a String representation of this BOM + * value. + */ + public final String toString() + { + return description; + } + + /** + * Returns the bytes corresponding to this BOM value. 
+ */ + public final byte[] getBytes() + { + final int length = bytes.length; + final byte[] result = new byte[length]; + + // make a defensive copy + System.arraycopy(bytes, 0, result, 0, length); + + return result; + } + + private BOM(final byte bom[], final String description) + { + assert(bom != null) : "invalid BOM: null is not allowed"; + assert(description != null) : "invalid description: null is not allowed"; + assert(description.length() != 0) : "invalid description: empty string is not allowed"; + + this.bytes = bom; + this.description = description; + } + + final byte bytes[]; + private final String description; + + } // BOM + + /** + * Constructs a new UnicodeBOMInputStream that wraps the + * specified InputStream. + * + * @param inputStream an InputStream. + * + * @throws NullPointerException when inputStream is + * null. + * @throws IOException on reading from the specified InputStream + * when trying to detect the Unicode BOM. + */ + public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException, + IOException + { + if (inputStream == null) + throw new NullPointerException("invalid input stream: null is not allowed"); + + in = new PushbackInputStream(inputStream, 4); + + final byte bom[] = new byte[4]; + final int read = in.read(bom); + + switch(read) + { + case 4: + if ((bom[0] == (byte)0xFF) && + (bom[1] == (byte)0xFE) && + (bom[2] == (byte)0x00) && + (bom[3] == (byte)0x00)) + { + this.bom = BOM.UTF_32_LE; + break; + } + else + if ((bom[0] == (byte)0x00) && + (bom[1] == (byte)0x00) && + (bom[2] == (byte)0xFE) && + (bom[3] == (byte)0xFF)) + { + this.bom = BOM.UTF_32_BE; + break; + } + + case 3: + if ((bom[0] == (byte)0xEF) && + (bom[1] == (byte)0xBB) && + (bom[2] == (byte)0xBF)) + { + this.bom = BOM.UTF_8; + break; + } + + case 2: + if ((bom[0] == (byte)0xFF) && + (bom[1] == (byte)0xFE)) + { + this.bom = BOM.UTF_16_LE; + break; + } + else + if ((bom[0] == (byte)0xFE) && + (bom[1] == (byte)0xFF)) + { + this.bom = 
BOM.UTF_16_BE; + break; + } + + default: + this.bom = BOM.NONE; + break; + } + + if (read > 0) + in.unread(bom, 0, read); + } + + /** + * Returns the BOM that was detected in the wrapped + * InputStream object. + * + * @return a BOM value. + */ + public final BOM getBOM() + { + // BOM type is immutable. + return bom; + } + + /** + * Skips the BOM that was found in the wrapped + * InputStream object. + * + * @return this UnicodeBOMInputStream. + * + * @throws IOException when trying to skip the BOM from the wrapped + * InputStream object. + */ + public final synchronized UnicodeBOMInputStream skipBOM() throws IOException + { + if (!skipped) + { + in.skip(bom.bytes.length); + skipped = true; + } + return this; + } + + @Override + public int read() throws IOException + { + return in.read(); + } + + @Override + public int read(final byte b[]) throws IOException, + NullPointerException + { + return in.read(b, 0, b.length); + } + + @Override + public int read(final byte b[], + final int off, + final int len) throws IOException, + NullPointerException + { + return in.read(b, off, len); + } + + @Override + public long skip(final long n) throws IOException + { + return in.skip(n); + } + + @Override + public int available() throws IOException + { + return in.available(); + } + + @Override + public void close() throws IOException + { + in.close(); + } + + @Override + public synchronized void mark(final int readlimit) + { + in.mark(readlimit); + } + + @Override + public synchronized void reset() throws IOException + { + in.reset(); + } + + @Override + public boolean markSupported() + { + return in.markSupported(); + } + + private final PushbackInputStream in; + private final BOM bom; + private boolean skipped = false; + +} // UnicodeBOMInputStream diff --git a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java index 19a8c9a8..bd873d84 100644 --- 
a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java +++ b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLLexerTest.java @@ -30,10 +30,10 @@ import java.io.IOException; import java.io.InputStream; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.List; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; class BSLLexerTest { @@ -44,13 +44,19 @@ private void assertMatch(String inputString, Integer... expectedTokens) { } private void assertMatch(int mode, String inputString, Integer... expectedTokens) { - InputStream inputStream = IOUtils.toInputStream(inputString, Charset.forName("UTF-8")); CharStream input; + try { - input = CharStreams.fromStream(inputStream, Charset.forName("UTF-8")); + InputStream inputStream = IOUtils.toInputStream(inputString, StandardCharsets.UTF_8); + + UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(inputStream); + ubis.skipBOM(); + + input = CharStreams.fromStream(ubis, StandardCharsets.UTF_8); } catch (IOException e) { throw new RuntimeException(e); } + lexer.setInputStream(input); lexer.mode(mode); @@ -65,6 +71,11 @@ private void assertMatch(int mode, String inputString, Integer... 
expectedTokens assertArrayEquals(expectedTokens, tokenTypes); } + @Test + void testBOM() { + assertMatch('\uFEFF' + "Процедура", BSLLexer.PROCEDURE_KEYWORD); + } + @Test void testUse() { assertMatch(BSLLexer.PREPROCESSOR_MODE, "Использовать lib", BSLLexer.PREPROC_USE_KEYWORD, BSLLexer.PREPROC_IDENTIFIER); diff --git a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java index 24bb7d82..36f50ed7 100644 --- a/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java +++ b/src/test/java/org/github/_1c_syntax/bsl/parser/BSLParserTest.java @@ -32,7 +32,7 @@ import java.io.IOException; import java.io.InputStream; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -45,13 +45,19 @@ private void setInput(String inputString) { } private void setInput(String inputString, int mode) { - InputStream inputStream = IOUtils.toInputStream(inputString, Charset.forName("UTF-8")); CharStream input; + try { - input = CharStreams.fromStream(inputStream, Charset.forName("UTF-8")); + InputStream inputStream = IOUtils.toInputStream(inputString, StandardCharsets.UTF_8); + + UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(inputStream); + ubis.skipBOM(); + + input = CharStreams.fromStream(ubis, StandardCharsets.UTF_8); } catch (IOException e) { throw new RuntimeException(e); } + lexer.setInputStream(input); lexer.mode(mode);