Skip to content

Commit

Permalink
Tracks the source range of attributes (name and value) when source tr…
Browse files Browse the repository at this point in the history
…acking is on (#2057)

When source tracking is enabled, the source position for attribute names and values is now available. `Attribute#sourceRange()` provides the ranges.
  • Loading branch information
jhy authored Nov 21, 2023
1 parent 2cf9e90 commit 2d517c7
Show file tree
Hide file tree
Showing 16 changed files with 392 additions and 132 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ Release 1.17.1 [PENDING]
are tracked and detectable via Range.isImplicit().
<https://github.com/jhy/jsoup/pull/2056>

* Improvement: when source tracking is enabled, the source position for attribute names and values is now available.
Attribute#sourceRange() provides the ranges.
<https://github.com/jhy/jsoup/pull/2057>

* Bugfix: when outputting with XML syntax, HTML elements that were parsed as data nodes (<script> and <style>) should
be emitted as CDATA nodes, so that they can be parsed correctly by an XML parser.
<https://github.com/jhy/jsoup/pull/1720>
Expand Down
16 changes: 16 additions & 0 deletions src/main/java/org/jsoup/internal/SharedConstants.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.jsoup.internal;

/**
jsoup constants used between packages. Do not use as they may change without warning. Users will not be able to see
this package when modules are enabled.
*/
public final class SharedConstants {
// Prefix character that marks an attribute key as jsoup-internal. Such keys can't be set via HTML. (They could be set
// via an accessor, but that is not a concern.) Internal keys are suppressed from attribute listing and iteration.
public static final char InternalPrefix = '/';
// Prefix for jsoup's own private keys; note that it begins with the InternalPrefix character.
public static final String PrivatePrefix = "/jsoup.";

// Key prefix under which an attribute's source range is stored; the attribute's name is appended to form the full key.
public static final String AttrRange = PrivatePrefix + "attrRange.";

// Not instantiable: this class only holds constants.
private SharedConstants() {}
}
28 changes: 27 additions & 1 deletion src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.jsoup.SerializationException;
import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document.OutputSettings.Syntax;
import org.jspecify.annotations.Nullable;
Expand Down Expand Up @@ -99,7 +100,7 @@ public boolean hasDeclaredValue() {
@param val the new attribute value; may be null (to set an enabled boolean attribute)
@return the previous value (an empty string if it was null)
*/
public String setValue(@Nullable String val) {
@Override public String setValue(@Nullable String val) {
String oldVal = this.val;
if (parent != null) {
int i = parent.indexOfKey(this.key);
Expand Down Expand Up @@ -127,6 +128,23 @@ public String html() {
return StringUtil.releaseBuilder(sb);
}

/**
 Get the positions (start to end) in the original input source from which this attribute's <b>name</b> and
 <b>value</b> were parsed.
 <p>Position tracking must be enabled prior to parsing the content.</p>
 @return the ranges for the attribute's name and value, or {@code untracked} if this attribute has no parent
 attributes, or its range was not tracked.
 @see org.jsoup.parser.Parser#setTrackPosition(boolean)
 @see Attributes#sourceRange(String)
 @see Node#sourceRange()
 @see Element#endSourceRange()
 @since 1.17.1
 */
public Range.AttributeRange sourceRange() {
    // the range is held by the owning Attributes; a detached attribute has nowhere to look it up
    return parent != null ? parent.sourceRange(key) : Range.AttributeRange.Untracked;
}

// Delegates to the four-argument html overload, passing this attribute's own key and value along with the supplied
// accumulator and output settings.
protected void html(Appendable accum, Document.OutputSettings out) throws IOException {
html(key, val, accum, out);
}
Expand Down Expand Up @@ -193,6 +211,14 @@ protected static boolean isDataAttribute(String key) {
return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
}

/**
Is this an internal attribute? Internal attributes can be fetched by key, but are not serialized.
@return {@code true} if this attribute's key is an internal key, as determined by {@code Attributes.isInternalKey}.
*/
public boolean isInternal() {
return Attributes.isInternalKey(key);
}

/**
* Collapsible if it's a boolean attribute and value is empty or same as name
*
Expand Down
53 changes: 40 additions & 13 deletions src/main/java/org/jsoup/nodes/Attributes.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.jsoup.SerializationException;
import org.jsoup.helper.Validate;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jspecify.annotations.Nullable;
Expand All @@ -19,11 +20,14 @@
import java.util.Set;

import static org.jsoup.internal.Normalizer.lowerCase;
import static org.jsoup.nodes.Range.AttributeRange.Untracked;

/**
* The attributes of an Element.
* <p>
* Attributes are treated as a map: there can be only one value associated with an attribute key/name.
* During parsing, attributes with the same name in an element are deduplicated, according to the configured parser's
* attribute case-sensitive setting. It is possible to have duplicate attributes subsequently if
* {@link #add(String, String)} vs {@link #put(String, String)} is used.
* </p>
* <p>
* Attribute name and value comparisons are generally <b>case sensitive</b>. By default for HTML, attribute names are
Expand All @@ -37,9 +41,6 @@ public class Attributes implements Iterable<Attribute>, Cloneable {
// The Attributes object is only created on the first use of an attribute; the Element will just have a null
// Attribute slot otherwise
protected static final String dataPrefix = "data-";
// Indicates a jsoup internal key. Can't be set via HTML. (It could be set via accessor, but not too worried about
// that. Suppressed from list, iter.
static final char InternalPrefix = '/';
private static final int InitialCapacity = 3; // sampling found mean count when attrs present = 1.49; 1.08 overall. 2.6:1 don't have any attrs.

// manages the key/val arrays
Expand All @@ -51,6 +52,7 @@ public class Attributes implements Iterable<Attribute>, Cloneable {
private int size = 0; // number of slots used (not total capacity, which is keys.length)
String[] keys = new String[InitialCapacity];
Object[] vals = new Object[InitialCapacity]; // Genericish: all non-internal attribute values must be Strings and are cast on access.
// todo - make keys iterable without creating Attribute objects

// check there's room for more
private void checkCapacity(int minNewSize) {
Expand Down Expand Up @@ -115,12 +117,14 @@ public String getIgnoreCase(String key) {
Get an arbitrary user data object by key.
* @param key case-sensitive key to the object.
* @return the object associated to this key, or {@code null} if not found.
* @see #userData(String key, Object val)
* @since 1.17.1
*/
@Nullable
Object userData(String key) {
public Object userData(String key) {
Validate.notNull(key);
if (!isInternalKey(key)) key = internalKey(key);
int i = indexOfKeyIgnoreCase(key);
int i = indexOfKey(key);
return i == NotFound ? null : vals[i];
}

Expand Down Expand Up @@ -161,9 +165,10 @@ public Attributes put(String key, @Nullable String value) {
* @param key case-sensitive key
* @param value object value
* @return these attributes
* @see #userData(String)
* @see #userData(String key)
* @since 1.17.1
*/
Attributes userData(String key, Object value) {
public Attributes userData(String key, Object value) {
Validate.notNull(key);
if (!isInternalKey(key)) key = internalKey(key);
Validate.notNull(value);
Expand Down Expand Up @@ -291,6 +296,7 @@ public boolean hasDeclaredValueForKeyIgnoreCase(String key) {
*/
public int size() {
// todo - exclude internal attributes from this count - maintain size, count of internals
// Until then, this is the number of used slots, which includes any internal attributes.
return size;
}

/**
Expand Down Expand Up @@ -319,6 +325,26 @@ public void addAll(Attributes incoming) {
}
}

/**
 Get the positions (start to end) in the original input source from which the <b>name</b> and <b>value</b> of the
 attribute with the given key were parsed.
 <p>Position tracking must be enabled prior to parsing the content.</p>
 @param key the attribute name
 @return the ranges for the attribute's name and value, or {@code untracked} if the attribute does not exist or its range
 was not tracked.
 @see org.jsoup.parser.Parser#setTrackPosition(boolean)
 @see Attribute#sourceRange()
 @see Node#sourceRange()
 @see Element#endSourceRange()
 @since 1.17.1
 */
public Range.AttributeRange sourceRange(String key) {
    if (hasKey(key)) {
        // the range is stored as user data, under the attribute's name prefixed with the private range key
        final String trackedKey = SharedConstants.AttrRange + key;
        if (hasDeclaredValueForKey(trackedKey)) {
            return (Range.AttributeRange) Validate.ensureNotNull(userData(trackedKey));
        }
    }
    return Untracked;
}

public Iterator<Attribute> iterator() {
return new Iterator<Attribute>() {
int expectedSize = size;
Expand Down Expand Up @@ -467,11 +493,12 @@ public Attributes clone() {
}

/**
* Internal method. Lowercases all keys.
* Internal method. Lowercases all (non-internal) keys.
*/
public void normalize() {
for (int i = 0; i < size; i++) {
keys[i] = lowerCase(keys[i]);
if (!isInternalKey(keys[i]))
keys[i] = lowerCase(keys[i]);
}
}

Expand Down Expand Up @@ -562,10 +589,10 @@ private static String dataKey(String key) {
}

static String internalKey(String key) {
return InternalPrefix + key;
return SharedConstants.InternalPrefix + key;
}

private boolean isInternalKey(String key) {
return key != null && key.length() > 1 && key.charAt(0) == InternalPrefix;
static boolean isInternalKey(String key) {
return key != null && key.length() > 1 && key.charAt(0) == SharedConstants.InternalPrefix;
}
}
4 changes: 2 additions & 2 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -1728,10 +1728,10 @@ public Element val(String value) {
/**
Get the source range (start and end positions) of the end (closing) tag for this Element. Position tracking must be
enabled prior to parsing the content.
@return the range of the closing tag for this element, if it was explicitly closed in the source. {@code Untracked}
otherwise.
@return the range of the closing tag for this element, or {@code untracked} if its range was not tracked.
@see org.jsoup.parser.Parser#setTrackPosition(boolean)
@see Node#sourceRange()
@see Range#isImplicit()
@since 1.15.2
*/
public Range endSourceRange() {
Expand Down
12 changes: 8 additions & 4 deletions src/main/java/org/jsoup/nodes/Node.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import java.util.stream.Stream;

/**
The base, abstract Node model. Elements, Documents, Comments etc are all Node instances.
The base, abstract Node model. {@link Element}, {@link Document}, {@link Comment}, {@link TextNode}, et al.,
are instances of Node.
@author Jonathan Hedley, [email protected] */
public abstract class Node implements Cloneable {
Expand Down Expand Up @@ -757,11 +758,14 @@ public <T extends Appendable> T html(T appendable) {
}

/**
Get the source range (start and end positions) in the original input source that this node was parsed from. Position
tracking must be enabled prior to parsing the content. For an Element, this will be the positions of the start tag.
@return the range for the start of the node.
Get the source range (start and end positions) in the original input source from which this node was parsed.
Position tracking must be enabled prior to parsing the content. For an Element, this will be the positions of the
start tag.
@return the range for the start of the node, or {@code untracked} if its range was not tracked.
@see org.jsoup.parser.Parser#setTrackPosition(boolean)
@see Range#isImplicit()
@see Element#endSourceRange()
@see Attributes#sourceRange(String name)
@since 1.15.2
*/
public Range sourceRange() {
Expand Down
64 changes: 57 additions & 7 deletions src/main/java/org/jsoup/nodes/Range.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package org.jsoup.nodes;

import org.jsoup.helper.Validate;
import org.jsoup.internal.SharedConstants;

import java.util.Objects;

import static org.jsoup.internal.SharedConstants.*;

/**
A Range object tracks the character positions in the original input source where a Node starts or ends. If you want to
Expand All @@ -12,10 +17,12 @@
public class Range {
private final Position start, end;

private static final String RangeKey = Attributes.internalKey("jsoup.sourceRange");
private static final String EndRangeKey = Attributes.internalKey("jsoup.endSourceRange");
private static final String RangeKey = PrivatePrefix + "sourceRange";
private static final String EndRangeKey = PrivatePrefix + "endSourceRange";
private static final Position UntrackedPos = new Position(-1, -1, -1);
private static final Range Untracked = new Range(UntrackedPos, UntrackedPos);

/** An untracked source range. */
static final Range Untracked = new Range(UntrackedPos, UntrackedPos);

/**
Creates a new Range with start and end Positions. Called by TreeBuilder when position tracking is on.
Expand Down Expand Up @@ -91,10 +98,8 @@ public boolean isImplicit() {
*/
static Range of(Node node, boolean start) {
final String key = start ? RangeKey : EndRangeKey;
if (!node.hasAttr(key))
return Untracked;
else
return (Range) Validate.ensureNotNull(node.attributes().userData(key));
if (!node.hasAttr(key)) return Untracked;
return (Range) Validate.ensureNotNull(node.attributes().userData(key));
}

/**
Expand Down Expand Up @@ -214,6 +219,51 @@ public int hashCode() {
result = 31 * result + columnNumber;
return result;
}
}

/**
 The source ranges of an attribute: one Range for its name and one for its value.
 */
public static class AttributeRange {
    /** An AttributeRange whose name and value ranges are both untracked. */
    static final AttributeRange Untracked = new AttributeRange(Range.Untracked, Range.Untracked);

    private final Range nameRange;
    private final Range valueRange;

    /** Creates a new AttributeRange. Called during parsing by Token.StartTag. */
    public AttributeRange(Range nameRange, Range valueRange) {
        this.nameRange = nameRange;
        this.valueRange = valueRange;
    }

    /** Get the source range for the attribute's name. */
    public Range nameRange() {
        return nameRange;
    }

    /** Get the source range for the attribute's value. */
    public Range valueRange() {
        return valueRange;
    }

    /**
     Get a String presentation of this Attribute range, in the form
     {@code line,column:pos-line,column:pos=line,column:pos-line,column:pos} (name start - name end = val start - val end).
     */
    @Override public String toString() {
        final String name = nameRange().toString();
        final String value = valueRange().toString();
        return name + "=" + value;
    }

    /** Two AttributeRanges are equal when they are of the same class and both their name and value ranges are equal. */
    @Override public boolean equals(Object o) {
        if (o == this) return true;
        if (o == null || o.getClass() != getClass()) return false;

        final AttributeRange other = (AttributeRange) o;
        return nameRange.equals(other.nameRange) && valueRange.equals(other.valueRange);
    }

    /** Combines the hash codes of the name and value ranges. */
    @Override public int hashCode() {
        return 31 * nameRange.hashCode() + valueRange.hashCode();
    }
}
}
4 changes: 2 additions & 2 deletions src/main/java/org/jsoup/parser/CharacterReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,13 @@ int columnNumber(int pos) {
}

/**
Get a formatted string representing the current line and cursor positions. E.g. <code>5:10</code> indicating line
Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
number 5 and column number 10.
@return line:col position
@since 1.14.3
@see #trackNewlines(boolean)
*/
String cursorPos() {
String posLineCol() {
return lineNumber() + ":" + columnNumber();
}

Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/jsoup/parser/ParseError.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ public class ParseError {

ParseError(CharacterReader reader, String errorMsg) {
pos = reader.pos();
cursorPos = reader.cursorPos();
cursorPos = reader.posLineCol();
this.errorMsg = errorMsg;
}

ParseError(CharacterReader reader, String errorFormat, Object... args) {
pos = reader.pos();
cursorPos = reader.cursorPos();
cursorPos = reader.posLineCol();
this.errorMsg = String.format(errorFormat, args);
}

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
* @return an unescaped string
*/
public static String unescapeEntities(String string, boolean inAttribute) {
Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking());
Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking(), false);
return tokeniser.unescapeEntities(inAttribute);
}

Expand Down
Loading

0 comments on commit 2d517c7

Please sign in to comment.