From 85c3b8b727ad6dd00623fce74d2dea260675b29f Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sun, 19 Nov 2023 13:25:27 +1100 Subject: [PATCH 1/5] Revised tree builder method names Using more explicit names vs overrides; removed some duplication. --- .../org/jsoup/parser/HtmlTreeBuilder.java | 105 +++++------ .../jsoup/parser/HtmlTreeBuilderState.java | 174 +++++++++--------- .../java/org/jsoup/parser/TreeBuilder.java | 46 ++--- 3 files changed, 156 insertions(+), 169 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 9b0a2c06b2..bdbd69d5b8 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -3,6 +3,7 @@ import org.jsoup.helper.Validate; import org.jsoup.internal.Normalizer; import org.jsoup.internal.StringUtil; +import org.jsoup.nodes.Attributes; import org.jsoup.nodes.CDataNode; import org.jsoup.nodes.Comment; import org.jsoup.nodes.DataNode; @@ -167,14 +168,11 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { protected boolean process(Token token) { currentToken = token; - if (shouldDispatchToCurrentInsertionMode(token)) { - return this.state.process(token, this); - } else { - return ForeignContent.process(token, this); - } + HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent; + return dispatch.process(token, this); } - boolean shouldDispatchToCurrentInsertionMode(Token token) { + boolean useCurrentOrForeignInsert(Token token) { // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction // If the stack of open elements is empty if (stack.isEmpty()) @@ -307,80 +305,79 @@ void error(HtmlTreeBuilderState state) { currentToken.tokenType(), currentToken, state)); } - /** Inserts an HTML element for the given tag) */ - Element insert(final Token.StartTag startTag) { - dedupeAttributes(startTag); + Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { + // dedupe and normalize the attributes: + Attributes attributes = startTag.attributes; + if (!forcePreserveCase) + attributes = settings.normalizeAttributes(attributes); + if (attributes != null && !attributes.isEmpty()) { + int dupes = attributes.deduplicate(settings); + if (dupes > 0) { + error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); + } + } + Tag tag = tagFor(startTag.tagName, namespace, + forcePreserveCase ? ParseSettings.preserveCase : settings); + + return (tag.normalName().equals("form")) ? + new FormElement(tag, null, attributes) : + new Element(tag, null, attributes); + } + + /** Inserts an HTML element for the given tag) */ + Element insertElementFor(final Token.StartTag startTag) { // handle empty unknown tags // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. if (startTag.isSelfClosing()) { - Element el = insertEmpty(startTag); + Element el = insertEmptyElementFor(startTag); stack.add(el); tokeniser.transition(TokeniserState.Data); // handles