From 2d517c769763e749ce0c3242b40be66c36af9918 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 21 Nov 2023 16:41:25 +1100 Subject: [PATCH] Tracks the source range of attributes (name and value) when source tracking is on (#2057) When source tracking is enabled, the source position for attribute names and values is now available. `Attribute#sourceRange()` provides the ranges. --- CHANGES | 4 + .../org/jsoup/internal/SharedConstants.java | 16 ++ src/main/java/org/jsoup/nodes/Attribute.java | 28 ++- src/main/java/org/jsoup/nodes/Attributes.java | 53 ++++-- src/main/java/org/jsoup/nodes/Element.java | 4 +- src/main/java/org/jsoup/nodes/Node.java | 12 +- src/main/java/org/jsoup/nodes/Range.java | 64 ++++++- .../org/jsoup/parser/CharacterReader.java | 4 +- .../java/org/jsoup/parser/ParseError.java | 4 +- src/main/java/org/jsoup/parser/Parser.java | 2 +- src/main/java/org/jsoup/parser/Token.java | 164 +++++++++++------- src/main/java/org/jsoup/parser/Tokeniser.java | 7 +- .../java/org/jsoup/parser/TokeniserState.java | 56 +++--- .../java/org/jsoup/parser/TreeBuilder.java | 12 +- .../java/org/jsoup/nodes/PositionTest.java | 76 ++++++++ .../org/jsoup/parser/CharacterReaderTest.java | 18 +- 16 files changed, 392 insertions(+), 132 deletions(-) create mode 100644 src/main/java/org/jsoup/internal/SharedConstants.java diff --git a/CHANGES b/CHANGES index 87c2d1aee8..fb68020931 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,10 @@ Release 1.17.1 [PENDING] are tracked and detectable via Range.isImplicit(). + * Improvement: when source tracking is enabled, the source position for attribute names and values is now available. + Attribute#sourceRange() provides the ranges. 
+ + * Bugfix: when outputting with XML syntax, HTML elements that were parsed as data nodes ( - final Token.StartTag startPending = new Token.StartTag(); + final Token.StartTag startPending; final Token.EndTag endPending = new Token.EndTag(); - Token.Tag tagPending = startPending; // tag we are building up: start or end pending + Token.Tag tagPending; // tag we are building up: start or end pending final Token.Character charPending = new Token.Character(); final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up final Token.Comment commentPending = new Token.Comment(); // comment building up @@ -52,7 +52,8 @@ final class Tokeniser { private static final int Unset = -1; private int markupStartPos, charStartPos = Unset; // reader pos at the start of markup / characters. updated on state transition - Tokeniser(CharacterReader reader, ParseErrorList errors) { + Tokeniser(CharacterReader reader, ParseErrorList errors, boolean trackSource) { + tagPending = startPending = new Token.StartTag(trackSource, reader); this.reader = reader; this.errors = errors; } diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java index a9ba53f084..081b1525fb 100644 --- a/src/main/java/org/jsoup/parser/TokeniserState.java +++ b/src/main/java/org/jsoup/parser/TokeniserState.java @@ -595,7 +595,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '=': t.error(this); t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); + t.tagPending.appendAttributeName(c, r.pos()-1, r.pos()); t.transition(AttributeName); break; default: // A-Z, anything else @@ -608,9 +608,11 @@ private void anythingElse(Tokeniser t, CharacterReader r) { AttributeName { // from before attribute name @Override void read(Tokeniser t, CharacterReader r) { + int pos = r.pos(); String name = r.consumeToAnySorted(attributeNameCharsSorted); // spec deviate - consume and emit nulls in one hit vs stepping - 
t.tagPending.appendAttributeName(name); + t.tagPending.appendAttributeName(name, pos, r.pos()); + pos = r.pos(); char c = r.consume(); switch (c) { case '\t': @@ -638,10 +640,10 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '\'': case '<': t.error(this); - t.tagPending.appendAttributeName(c); + t.tagPending.appendAttributeName(c, pos, r.pos()); break; default: // buffer underrun - t.tagPending.appendAttributeName(c); + t.tagPending.appendAttributeName(c, pos, r.pos()); } } }, @@ -668,7 +670,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) { break; case nullChar: t.error(this); - t.tagPending.appendAttributeName(replacementChar); + t.tagPending.appendAttributeName(replacementChar, r.pos()-1, r.pos()); t.transition(AttributeName); break; case eof: @@ -680,7 +682,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '<': t.error(this); t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); + t.tagPending.appendAttributeName(c, r.pos()-1, r.pos()); t.transition(AttributeName); break; default: // A-Z, anything else @@ -713,7 +715,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) { break; case nullChar: t.error(this); - t.tagPending.appendAttributeValue(replacementChar); + t.tagPending.appendAttributeValue(replacementChar, r.pos()-1, r.pos()); t.transition(AttributeValue_unquoted); break; case eof: @@ -730,7 +732,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '=': case '`': t.error(this); - t.tagPending.appendAttributeValue(c); + t.tagPending.appendAttributeValue(c, r.pos()-1, r.pos()); t.transition(AttributeValue_unquoted); break; default: @@ -741,12 +743,14 @@ private void anythingElse(Tokeniser t, CharacterReader r) { }, AttributeValue_doubleQuoted { @Override void read(Tokeniser t, CharacterReader r) { + int pos = r.pos(); String value = r.consumeAttributeQuoted(false); if (value.length() > 0) - t.tagPending.appendAttributeValue(value); + 
t.tagPending.appendAttributeValue(value, pos, r.pos()); else t.tagPending.setEmptyAttributeValue(); + pos = r.pos(); char c = r.consume(); switch (c) { case '"': @@ -755,31 +759,33 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '&': int[] ref = t.consumeCharacterReference('"', true); if (ref != null) - t.tagPending.appendAttributeValue(ref); + t.tagPending.appendAttributeValue(ref, pos, r.pos()); else - t.tagPending.appendAttributeValue('&'); + t.tagPending.appendAttributeValue('&', pos, r.pos()); break; case nullChar: t.error(this); - t.tagPending.appendAttributeValue(replacementChar); + t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); break; case eof: t.eofError(this); t.transition(Data); break; default: // hit end of buffer in first read, still in attribute - t.tagPending.appendAttributeValue(c); + t.tagPending.appendAttributeValue(c, pos, r.pos()); } } }, AttributeValue_singleQuoted { @Override void read(Tokeniser t, CharacterReader r) { + int pos = r.pos(); String value = r.consumeAttributeQuoted(true); if (value.length() > 0) - t.tagPending.appendAttributeValue(value); + t.tagPending.appendAttributeValue(value, pos, r.pos()); else t.tagPending.setEmptyAttributeValue(); + pos = r.pos(); char c = r.consume(); switch (c) { case '\'': @@ -788,29 +794,31 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '&': int[] ref = t.consumeCharacterReference('\'', true); if (ref != null) - t.tagPending.appendAttributeValue(ref); + t.tagPending.appendAttributeValue(ref, pos, r.pos()); else - t.tagPending.appendAttributeValue('&'); + t.tagPending.appendAttributeValue('&', pos, r.pos()); break; case nullChar: t.error(this); - t.tagPending.appendAttributeValue(replacementChar); + t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); break; case eof: t.eofError(this); t.transition(Data); break; default: // hit end of buffer in first read, still in attribute - t.tagPending.appendAttributeValue(c); + 
t.tagPending.appendAttributeValue(c, pos, r.pos()); } } }, AttributeValue_unquoted { @Override void read(Tokeniser t, CharacterReader r) { + int pos = r.pos(); String value = r.consumeToAnySorted(attributeValueUnquoted); if (value.length() > 0) - t.tagPending.appendAttributeValue(value); + t.tagPending.appendAttributeValue(value, pos, r.pos()); + pos = r.pos(); char c = r.consume(); switch (c) { case '\t': @@ -823,9 +831,9 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '&': int[] ref = t.consumeCharacterReference('>', true); if (ref != null) - t.tagPending.appendAttributeValue(ref); + t.tagPending.appendAttributeValue(ref, pos, r.pos()); else - t.tagPending.appendAttributeValue('&'); + t.tagPending.appendAttributeValue('&', pos, r.pos()); break; case '>': t.emitTagPending(); @@ -833,7 +841,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) { break; case nullChar: t.error(this); - t.tagPending.appendAttributeValue(replacementChar); + t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); break; case eof: t.eofError(this); @@ -845,10 +853,10 @@ private void anythingElse(Tokeniser t, CharacterReader r) { case '=': case '`': t.error(this); - t.tagPending.appendAttributeValue(c); + t.tagPending.appendAttributeValue(c, pos, r.pos()); break; default: // hit end of buffer in first read, still in attribute - t.tagPending.appendAttributeValue(c); + t.tagPending.appendAttributeValue(c, pos, r.pos()); } } diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 8f15addfb2..e20a641de2 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -1,10 +1,11 @@ package org.jsoup.parser; import org.jsoup.helper.Validate; +import org.jsoup.internal.SharedConstants; +import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import 
org.jsoup.nodes.LeafNode; import org.jsoup.nodes.Node; import org.jsoup.nodes.Range; @@ -30,7 +31,7 @@ abstract class TreeBuilder { ParseSettings settings; Map seenTags; // tags we've used in this parse; saves tag GC for custom tags. - private final Token.StartTag start = new Token.StartTag(); // start tag to process + private Token.StartTag start; // start tag to process private final Token.EndTag end = new Token.EndTag(); abstract ParseSettings defaultSettings(); @@ -49,9 +50,10 @@ void initialiseParse(Reader input, String baseUri, Parser parser) { trackSourceRange = parser.isTrackPosition(); reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility currentToken = null; - tokeniser = new Tokeniser(reader, parser.getErrors()); + tokeniser = new Tokeniser(reader, parser.getErrors(), trackSourceRange); stack = new ArrayList<>(32); seenTags = new HashMap<>(); + start = new Token.StartTag(trackSourceRange, reader); this.baseUri = baseUri; } @@ -100,7 +102,7 @@ boolean processStartTag(String name) { // these are "virtual" start tags (auto-created by the treebuilder), so not tracking the start position final Token.StartTag start = this.start; if (currentToken == start) { // don't recycle an in-use token - return process(new Token.StartTag().name(name)); + return process(new Token.StartTag(trackSourceRange, reader).name(name)); } return process(start.reset().name(name)); } @@ -108,7 +110,7 @@ boolean processStartTag(String name) { boolean processStartTag(String name, Attributes attrs) { final Token.StartTag start = this.start; if (currentToken == start) { // don't recycle an in-use token - return process(new Token.StartTag().nameAttr(name, attrs)); + return process(new Token.StartTag(trackSourceRange, reader).nameAttr(name, attrs)); } start.reset(); start.nameAttr(name, attrs); diff --git a/src/test/java/org/jsoup/nodes/PositionTest.java 
b/src/test/java/org/jsoup/nodes/PositionTest.java index 5d5c80331f..a59f6520d8 100644 --- a/src/test/java/org/jsoup/nodes/PositionTest.java +++ b/src/test/java/org/jsoup/nodes/PositionTest.java @@ -303,4 +303,80 @@ private void printRange(Node node) { assertEquals("1,20:19-1,25:24", h2.endSourceRange().toString()); } + @Test void tracksAttributes() { + String html = "
Text"; + Document doc = Jsoup.parse(html, TrackingParser); + + Element div = doc.expectFirst("div"); + + StringBuilder track = new StringBuilder(); + for (Attribute attr : div.attributes()) { + if (attr.isInternal()) continue; + + Range.AttributeRange attrRange = attr.sourceRange(); + assertTrue(attrRange.nameRange().isTracked()); + assertTrue(attrRange.valueRange().isTracked()); + assertSame(attrRange, div.attributes().sourceRange(attr.getKey())); + + assertFalse(attrRange.nameRange().isImplicit()); + if (attr.getValue().isEmpty()) + assertTrue(attrRange.valueRange().isImplicit()); + else + assertFalse(attrRange.valueRange().isImplicit()); + + + accumulatePositions(attr, track); + } + + System.out.println(track); + assertEquals("one:5-8=10-21; id:23-25=26-27; class:28-33=34-37; attr1:38-43=47-60; attr2:62-67=69-78; attr3:80-85=85-85; attr4:89-94=94-94; attr5:95-100=100-100; ", track.toString()); + } + + @Test void tracksAttributesAcrossLines() { + String html = "
Text"; + Document doc = Jsoup.parse(html, TrackingParser); + + Element div = doc.expectFirst("div"); + + StringBuilder track = new StringBuilder(); + for (Attribute attr : div.attributes()) { + if (attr.isInternal()) continue; + Range.AttributeRange attrRange = attr.sourceRange(); + assertTrue(attrRange.nameRange().isTracked()); + assertTrue(attrRange.valueRange().isTracked()); + assertSame(attrRange, div.attributes().sourceRange(attr.getKey())); + assertFalse(attrRange.nameRange().isImplicit()); + if (attr.getValue().isEmpty()) + assertTrue(attrRange.valueRange().isImplicit()); + else + assertFalse(attrRange.valueRange().isImplicit()); + accumulatePositions(attr, track); + } + + String value = div.attributes().get("class"); + assertEquals("foo", value); + Range.AttributeRange foo = div.attributes().sourceRange("class"); + assertEquals("4,1:30-4,6:35=5,1:37-5,4:40", foo.toString()); + + System.out.println(track); + assertEquals("one:5-8=10-21; id:24-26=27-28; class:30-35=37-40; attr5:41-46=46-46; ", track.toString()); + } + + static void accumulatePositions(Attribute attr, StringBuilder sb) { + Range.AttributeRange range = attr.sourceRange(); + + sb + .append(attr.getKey()) + .append(':') + .append(range.nameRange().startPos()) + .append('-') + .append(range.nameRange().endPos()) + + .append('=') + .append(range.valueRange().startPos()) + .append('-') + .append(range.valueRange().endPos()); + + sb.append("; "); + } } \ No newline at end of file diff --git a/src/test/java/org/jsoup/parser/CharacterReaderTest.java b/src/test/java/org/jsoup/parser/CharacterReaderTest.java index 121b07afd3..7071bfe51d 100644 --- a/src/test/java/org/jsoup/parser/CharacterReaderTest.java +++ b/src/test/java/org/jsoup/parser/CharacterReaderTest.java @@ -433,14 +433,14 @@ public void notEmptyAtBufferSplitPoint() { assertEquals(12, noTrack.pos()); assertEquals(1, noTrack.lineNumber()); assertEquals(13, noTrack.columnNumber()); - assertEquals("1:13", noTrack.cursorPos()); + 
assertEquals("1:13", noTrack.posLineCol()); // get over the buffer while (!noTrack.matches("[foo]")) noTrack.consumeTo("[foo]"); assertEquals(32778, noTrack.pos()); assertEquals(1, noTrack.lineNumber()); assertEquals(noTrack.pos()+1, noTrack.columnNumber()); - assertEquals("1:32779", noTrack.cursorPos()); + assertEquals("1:32779", noTrack.posLineCol()); // and the line numbers: "\n\n\n" assertEquals(0, track.pos()); @@ -462,24 +462,24 @@ public void notEmptyAtBufferSplitPoint() { assertEquals(12, track.pos()); assertEquals(3, track.lineNumber()); assertEquals(1, track.columnNumber()); - assertEquals("3:1", track.cursorPos()); + assertEquals("3:1", track.posLineCol()); assertEquals("", track.consumeTo('\n')); - assertEquals("3:6", track.cursorPos()); + assertEquals("3:6", track.posLineCol()); // get over the buffer while (!track.matches("[foo]")) track.consumeTo("[foo]"); assertEquals(32778, track.pos()); assertEquals(4, track.lineNumber()); assertEquals(32761, track.columnNumber()); - assertEquals("4:32761", track.cursorPos()); + assertEquals("4:32761", track.posLineCol()); track.consumeTo('\n'); - assertEquals("4:32766", track.cursorPos()); + assertEquals("4:32766", track.posLineCol()); track.consumeTo("[bar]"); assertEquals(5, track.lineNumber()); - assertEquals("5:1", track.cursorPos()); + assertEquals("5:1", track.posLineCol()); track.consumeToEnd(); - assertEquals("5:6", track.cursorPos()); + assertEquals("5:6", track.posLineCol()); } @Test public void countsColumnsOverBufferWhenNoNewlines() { @@ -490,7 +490,7 @@ public void notEmptyAtBufferSplitPoint() { CharacterReader reader = new CharacterReader(content); reader.trackNewlines(true); - assertEquals("1:1", reader.cursorPos()); + assertEquals("1:1", reader.posLineCol()); while (!reader.isEmpty()) reader.consume(); assertEquals(131096, reader.pos());