diff --git a/src/main/java/io/sf/carte/uparser/CommentRemovalHandler.java b/src/main/java/io/sf/carte/uparser/CommentRemovalHandler.java index 8ba2301..84a05f6 100644 --- a/src/main/java/io/sf/carte/uparser/CommentRemovalHandler.java +++ b/src/main/java/io/sf/carte/uparser/CommentRemovalHandler.java @@ -11,15 +11,45 @@ package io.sf.carte.uparser; +/** + * A handler that removes comments. + *

+ * Example: + *

+ * + *

+ * String removeComments(String text) {
+ *     String[] opening = { "/{@literal *}", "<!--" };
+ *     String[] closing = { "{@literal *}/", "-->" };
+ *     CommentRemovalHandler handler = new CommentRemovalHandler(text.length());
+ *     TokenProducer tp = new TokenProducer(handler);
+ *     try {
+ *         tp.parseMultiComment(new StringReader(text), opening, closing);
+ *     } catch (IOException e) { // not expected when reading from a StringReader
+ *     }
+ *     return handler.getBuffer().toString();
+ * }
+ * 
+ */ public class CommentRemovalHandler implements TokenHandler2 { private final StringBuilder buffer; + /** + * Construct the handler with the given initial buffer size. + * + * @param bufSize the initial buffer size. + */ public CommentRemovalHandler(int bufSize) { super(); buffer = new StringBuilder(bufSize); } + /** + * Get the buffer. + * + * @return the buffer. + */ public StringBuilder getBuffer() { return buffer; } diff --git a/src/main/java/io/sf/carte/uparser/TokenControl.java b/src/main/java/io/sf/carte/uparser/TokenControl.java index 551a420..32a41fb 100644 --- a/src/main/java/io/sf/carte/uparser/TokenControl.java +++ b/src/main/java/io/sf/carte/uparser/TokenControl.java @@ -63,7 +63,7 @@ public interface TokenControl { * * @return the {@code TokenHandler}. */ - TokenHandler2 getTokenHandler(); + TokenHandler3 getTokenHandler(); /** * Set a new {@code TokenHandler}. @@ -71,7 +71,7 @@ public interface TokenControl { * @param handler * the new {@code TokenHandler}. */ - void setTokenHandler(TokenHandler2 handler); + void setTokenHandler(TokenHandler3 handler); /** * Disable the handling of all types of comments. diff --git a/src/main/java/io/sf/carte/uparser/TokenHandler.java b/src/main/java/io/sf/carte/uparser/TokenHandler.java index 7abd512..b26974f 100644 --- a/src/main/java/io/sf/carte/uparser/TokenHandler.java +++ b/src/main/java/io/sf/carte/uparser/TokenHandler.java @@ -15,39 +15,39 @@ * A {@link TokenHandler2} that is backwards-compatible with * {@code TokenProducer} 1.x. *

- * You may consider using {@link TokenHandler2} instead. + * You may consider using {@link TokenHandler2} or {@link TokenHandler3} instead. *

*/ public interface TokenHandler extends TokenHandler2 { @Override default void leftParenthesis(int index) { - openGroup(index, TokenProducer.CHAR_LEFT_PAREN); + openGroup(index, TokenProducer3.CHAR_LEFT_PAREN); } @Override default void leftSquareBracket(int index) { - openGroup(index, TokenProducer.CHAR_LEFT_SQ_BRACKET); + openGroup(index, TokenProducer3.CHAR_LEFT_SQ_BRACKET); } @Override default void leftCurlyBracket(int index) { - openGroup(index, TokenProducer.CHAR_LEFT_CURLY_BRACKET); + openGroup(index, TokenProducer3.CHAR_LEFT_CURLY_BRACKET); } @Override default void rightParenthesis(int index) { - closeGroup(index, TokenProducer.CHAR_RIGHT_PAREN); + closeGroup(index, TokenProducer3.CHAR_RIGHT_PAREN); } @Override default void rightSquareBracket(int index) { - closeGroup(index, TokenProducer.CHAR_RIGHT_SQ_BRACKET); + closeGroup(index, TokenProducer3.CHAR_RIGHT_SQ_BRACKET); } @Override default void rightCurlyBracket(int index) { - closeGroup(index, TokenProducer.CHAR_RIGHT_CURLY_BRACKET); + closeGroup(index, TokenProducer3.CHAR_RIGHT_CURLY_BRACKET); } /** diff --git a/src/main/java/io/sf/carte/uparser/TokenHandler2.java b/src/main/java/io/sf/carte/uparser/TokenHandler2.java index d3124e5..f088575 100644 --- a/src/main/java/io/sf/carte/uparser/TokenHandler2.java +++ b/src/main/java/io/sf/carte/uparser/TokenHandler2.java @@ -12,10 +12,17 @@ package io.sf.carte.uparser; /** - * To be implemented by listeners that handle the different events generated by - * the TokenProducer. + * A {@link TokenHandler3} that has no checked exceptions, backwards-compatible + * with {@code TokenProducer} 2.x. + *

+ * Most token handlers will report problems through error handlers and produce + * no checked exceptions, in which case you should use this handler together + * with {@link TokenProducer}. If your handler does need to throw checked + * exceptions, you must use {@link TokenProducer3} together with + * {@link TokenHandler3} instead. + *

*/ -public interface TokenHandler2 { +public interface TokenHandler2 extends TokenHandler3 { /** * At the beginning of parsing, this method is called, passing the {@link TokenControl} @@ -24,6 +31,7 @@ public interface TokenHandler2 { * @param control * the TokenControl object in charge of parsing. */ + @Override void tokenStart(TokenControl control); /** @@ -34,6 +42,7 @@ public interface TokenHandler2 { * @param word * the word. */ + @Override void word(int index, CharSequence word); /** @@ -44,6 +53,7 @@ public interface TokenHandler2 { * @param codePoint * the codepoint of the found separator. */ + @Override void separator(int index, int codePoint); /** @@ -56,6 +66,7 @@ public interface TokenHandler2 { * @param quote * the quote character. */ + @Override void quoted(int index, CharSequence quoted, int quote); /** @@ -68,6 +79,7 @@ public interface TokenHandler2 { * @param quoteCp * the quote character codepoint. */ + @Override void quotedWithControl(int index, CharSequence quoted, int quoteCp); /** @@ -78,6 +90,7 @@ public interface TokenHandler2 { * @param codePoint * the FF/LF/CR codepoint. */ + @Override void quotedNewlineChar(int index, int codePoint); /** @@ -86,6 +99,7 @@ public interface TokenHandler2 { * @param index * the index at which the codepoint was found. */ + @Override void leftParenthesis(int index); /** @@ -94,6 +108,7 @@ public interface TokenHandler2 { * @param index * the index at which the codepoint was found. */ + @Override void leftSquareBracket(int index); /** @@ -102,6 +117,7 @@ public interface TokenHandler2 { * @param index * the index at which the codepoint was found. */ + @Override void leftCurlyBracket(int index); /** @@ -110,6 +126,7 @@ public interface TokenHandler2 { * @param index * the index at which the codepoint was found. */ + @Override void rightParenthesis(int index); /** @@ -118,6 +135,7 @@ public interface TokenHandler2 { * @param index * the index at which the codepoint was found. 
*/ + @Override void rightSquareBracket(int index); /** @@ -126,6 +144,7 @@ public interface TokenHandler2 { * @param index * the index at which the codepoint was found. */ + @Override void rightCurlyBracket(int index); /** @@ -138,6 +157,7 @@ public interface TokenHandler2 { * @param codePoint * the found codepoint. */ + @Override default void startPunctuation(int index, int codePoint) { character(index, codePoint); } @@ -152,6 +172,7 @@ default void startPunctuation(int index, int codePoint) { * @param codePoint * the found codepoint. */ + @Override default void endPunctuation(int index, int codePoint) { character(index, codePoint); } @@ -169,6 +190,7 @@ default void endPunctuation(int index, int codePoint) { * @param codePoint * the codepoint of the found punctuation. */ + @Override void character(int index, int codePoint); /** @@ -179,6 +201,7 @@ default void endPunctuation(int index, int codePoint) { * @param codePoint * the escaped codepoint. */ + @Override void escaped(int index, int codePoint); /** @@ -189,6 +212,7 @@ default void endPunctuation(int index, int codePoint) { * @param codePoint * the control codepoint. */ + @Override void control(int index, int codePoint); /** @@ -201,6 +225,7 @@ default void endPunctuation(int index, int codePoint) { * @param comment * the commented string. */ + @Override void commented(int index, int commentType, String comment); /** @@ -209,6 +234,7 @@ default void endPunctuation(int index, int codePoint) { * @param len * the length of the processed stream. */ + @Override void endOfStream(int len); /** @@ -225,5 +251,6 @@ default void endPunctuation(int index, int codePoint) { * a context sequence. If a string was parsed, it will contain up to 16 * characters before and after the error. 
*/ + @Override void error(int index, byte errCode, CharSequence context); } diff --git a/src/main/java/io/sf/carte/uparser/TokenHandler3.java b/src/main/java/io/sf/carte/uparser/TokenHandler3.java new file mode 100644 index 0000000..8c7fb33 --- /dev/null +++ b/src/main/java/io/sf/carte/uparser/TokenHandler3.java @@ -0,0 +1,251 @@ +/* + + Copyright (c) 2017-2023, Carlos Amengual. + + Licensed under a BSD-style License. You can find the license here: + https://css4j.github.io/LICENSE.txt + + */ + +// SPDX-License-Identifier: BSD-3-Clause + +package io.sf.carte.uparser; + +/** + * To be implemented by listeners that handle the different events generated by + * the {@link TokenProducer3}. + * + * @param E the exception that the methods may throw. + */ +public interface TokenHandler3 { + + /** + * At the beginning of parsing, this method is called, passing the {@link TokenControl} + * object that can be used to fine-control the parsing. + * + * @param control + * the TokenControl object in charge of parsing. + * @throws E in case of an error when processing the tokens. + */ + void tokenStart(TokenControl control); + + /** + * A word was found by the parser (includes connector punctuation). + * + * @param index + * the index at which the word was found. + * @param word + * the word. + * @throws E in case of an error when processing the tokens. + */ + void word(int index, CharSequence word) throws E; + + /** + * A separator (Zs, Zl and Zp unicode categories) was found. + * + * @param index + * the index at which the separator was found. + * @param codePoint + * the codepoint of the found separator. + * @throws E in case of an error when processing the tokens. + */ + void separator(int index, int codePoint) throws E; + + /** + * A quoted string was found by the parser. + * + * @param index + * the index at which the quoted string was found. + * @param quoted + * the quoted sequence of characters, without the quotes. + * @param quote + * the quote character. 
+ * @throws E in case of an error when processing the tokens. + */ + void quoted(int index, CharSequence quoted, int quote) throws E; + + /** + * A quoted string was found by the parser, and contains control characters. + * + * @param index + * the index at which the quoted string was found. + * @param quoted + * the quoted sequence of characters, without the quotes. + * @param quoteCp + * the quote character codepoint. + * @throws E in case of an error when processing the tokens. + */ + void quotedWithControl(int index, CharSequence quoted, int quoteCp) throws E; + + /** + * An unescaped FF/LF/CR control was found while assembling a quoted string. + * + * @param index + * the index at which the control was found. + * @param codePoint + * the FF/LF/CR codepoint. + * @throws E in case of an error when processing the tokens. + */ + void quotedNewlineChar(int index, int codePoint) throws E; + + /** + * Called when the {@code (} codepoint is found. + * + * @param index + * the index at which the codepoint was found. + * @throws E in case of an error when processing the tokens. + */ + void leftParenthesis(int index) throws E; + + /** + * Called when the {@code [} codepoint is found. + * + * @param index + * the index at which the codepoint was found. + * @throws E in case of an error when processing the tokens. + */ + void leftSquareBracket(int index) throws E; + + /** + * Called when the { codepoint is found. + * + * @param index + * the index at which the codepoint was found. + * @throws E in case of an error when processing the tokens. + */ + void leftCurlyBracket(int index) throws E; + + /** + * Called when the {@code )} codepoint is found. + * + * @param index + * the index at which the codepoint was found. + * @throws E in case of an error when processing the tokens. + */ + void rightParenthesis(int index) throws E; + + /** + * Called when the {@code ]} codepoint is found. + * + * @param index + * the index at which the codepoint was found. 
+ * @throws E in case of an error when processing the tokens. + */ + void rightSquareBracket(int index) throws E; + + /** + * Called when the } codepoint is found. + * + * @param index + * the index at which the codepoint was found. + * @throws E in case of an error when processing the tokens. + */ + void rightCurlyBracket(int index) throws E; + + /** + * Called when start punctuation (Ps) codepoints are found (except characters + * handled by {@link #leftCurlyBracket(int)}, {@link #leftParenthesis(int)} and + * {@link #leftSquareBracket(int)}). + * + * @param index + * the index at which the codepoint was found. + * @param codePoint + * the found codepoint. + * @throws E in case of an error when processing the tokens. + */ + default void startPunctuation(int index, int codePoint) throws E { + character(index, codePoint); + } + + /** + * Called when end punctuation (Pe) codepoints are found (except characters + * handled by {@link #rightCurlyBracket(int)}, {@link #rightParenthesis(int)} + * and {@link #rightSquareBracket(int)}). + * + * @param index + * the index at which the codepoint was found. + * @param codePoint + * the found codepoint. + * @throws E in case of an error when processing the tokens. + */ + default void endPunctuation(int index, int codePoint) throws E { + character(index, codePoint); + } + + /** + * Other characters including punctuation (excluding connector punctuation) and symbols + * (Sc, Sm and Sk unicode categories) was found, that was not one of the non-alphanumeric + * characters allowed in words. + *

+ * Symbols in So category are considered part of words and won't be handled by this + * method. + * + * @param index + * the index at which the punctuation was found. + * @param codePoint + * the codepoint of the found punctuation. + * @throws E in case of an error when processing the tokens. + */ + void character(int index, int codePoint) throws E; + + /** + * A codepoint preceded with a backslash was found outside of quoted text. + * + * @param index + * the index at which the escaped codepoint was found. + * @param codePoint + * the escaped codepoint. + * @throws E in case of an error when processing the tokens. + */ + void escaped(int index, int codePoint) throws E; + + /** + * A control character codepoint was found. + * + * @param index + * the index at which the control codepoint was found. + * @param codePoint + * the control codepoint. + * @throws E in case of an error when processing the tokens. + */ + void control(int index, int codePoint) throws E; + + /** + * A commented string was found by the parser. + * + * @param index + * the index at which the commented string was found. + * @param commentType + * the type of comment. + * @param comment + * the commented string. + * @throws E in case of an error when processing the tokens. + */ + void commented(int index, int commentType, String comment) throws E; + + /** + * The stream that was being parsed reached its end. + * + * @param len + * the length of the processed stream. + * @throws E in case of an error when processing the tokens. + */ + void endOfStream(int len) throws E; + + /** + * An error was found while parsing. + *

+ * Something was found that broke the assumptions made by the parser, like an escape + * character at the end of the stream or an unmatched quote. + * + * @param index + * the index at which the error was found. + * @param errCode + * the error code. + * @param context + * a context sequence. If a string was parsed, it will contain up to 16 + * characters before and after the error. + * @throws E in case that the error handler decides to throw an exception. + */ + void error(int index, byte errCode, CharSequence context) throws E; +} diff --git a/src/main/java/io/sf/carte/uparser/TokenProducer.java b/src/main/java/io/sf/carte/uparser/TokenProducer.java index dad0757..5d4cfc7 100644 --- a/src/main/java/io/sf/carte/uparser/TokenProducer.java +++ b/src/main/java/io/sf/carte/uparser/TokenProducer.java @@ -11,12 +11,17 @@ package io.sf.carte.uparser; -import java.io.IOException; -import java.io.Reader; -import java.util.Arrays; - /** - * A simple parser that produces tokens from a String or Reader. + * A simple parser that produces tokens from a String or Reader, and processes + * them through a user-provided handler. + *

+ * This parser is intended to deal with handlers that produce only runtime + * (unchecked) exceptions. If your use case requires dealing with checked + * exceptions, please use {@link TokenProducer3} instead. + *

+ *

+ * Tokenization Overview + *

*

* The tokens produced are: *