Skip to content

Commit

Permalink
WAT extractor: do not extract page title from embedded SVG images
Browse files Browse the repository at this point in the history
- do not use <title> elements embedded in <svg> as page/document title
- use the first non-empty <title> element to set the page/document
  title. This is required for documents where the <title> is not
  enclosed in the <head> element. Note: HTML5 allows the <head> element
  to be omitted, see
   https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#optional-tags
- overwrite the page/document title by the content of a <title> element
  inside the <head> element
- for text extraction: define the title element as block element
  • Loading branch information
sebastian-nagel committed Oct 14, 2024
1 parent fc11441 commit e36c876
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ public class ExtractingParseObserver implements ParseObserver {
Stack<StringBuilder> openAnchorTexts;
StringBuilder textExtract;
String title = null;
boolean inHead = false;
boolean inTitle = false;
boolean inPre = false;
boolean inSVG = false;

protected static String cssUrlPatString =
"url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)";
Expand Down Expand Up @@ -59,7 +61,7 @@ public class ExtractingParseObserver implements ParseObserver {
"button", "canvas", "caption", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
"figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
"li", "map", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody",
"textarea", "tfoot", "th", "thead", "tr", "ul", "video" };
"textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
private static final Set<String> blockElements;
/* inline elements which content is not melted with surrounding words */
private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",
Expand Down Expand Up @@ -144,11 +146,17 @@ public void handleTagEmpty(TagNode tag) {
@Override
public void handleTagOpen(TagNode tag) {
String name = tag.getTagName();
if(name.equals("TITLE")) {
if (name.equals("HEAD")) {
inHead = true;
} else if (name.equals("TITLE")) {
inTitle = !tag.isEmptyXmlTag();
return;
} else if (name.equals("PRE")) {
inPre = true;
} else if (name.equals("SVG")) {
inSVG = true;
} else if (name.equals("BODY")) {
inHead = false;
}

if (blockElements.contains(name)) {
Expand Down Expand Up @@ -183,9 +191,11 @@ public void handleTagOpen(TagNode tag) {
public void handleTagClose(TagNode tag) {
String name = tag.getTagName();

if(inTitle) {
if (inTitle) {
inTitle = false;
data.setTitle(title);
if (!inSVG && (inHead || !data.hasTitle())) {
data.setTitle(title);
}
title = null;
}

Expand Down Expand Up @@ -222,8 +232,12 @@ public void handleTagClose(TagNode tag) {
data.addHref(vals);
}
}
} else if (tag.getTagName().equals("HEAD")) {
inHead = false;
} else if (tag.getTagName().equals("PRE")) {
inPre = false;
} else if (tag.getTagName().equals("SVG")) {
inSVG = false;
}
}

Expand Down
7 changes: 7 additions & 0 deletions src/main/java/org/archive/resource/html/HTMLMetaData.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,15 @@ private JSONObject getHeader() {
/**
 * Records the document's base href in the header object under the
 * {@code HTML_BASE} key. Storage is delegated to {@code putUnlessNull}.
 *
 * @param href base href value to record
 */
public void setBaseHref(String href) {
    this.putUnlessNull(
            this.getHeader(),
            HTML_BASE,
            href);
}

/**
 * Records the document title in the header object under the
 * {@code HTML_TITLE} key. Storage is delegated to {@code putUnlessNull}.
 *
 * @param title title text to record
 */
public void setTitle(String title) {
    this.putUnlessNull(
            this.getHeader(),
            HTML_TITLE,
            title);
}

/**
 * Reports whether a title has already been recorded.
 *
 * @return {@code true} if the header object exists and holds an entry
 *         for {@code HTML_TITLE}; {@code false} otherwise
 */
public boolean hasTitle() {
    // No header object yet means nothing can have been stored in it.
    if (header == null) {
        return false;
    }
    return header.has(HTML_TITLE);
}

private void putUnlessNull(JSONObject o, String k, String v) {
if(o != null) {
try {
Expand All @@ -43,6 +49,7 @@ private void putUnlessNull(JSONObject o, String k, String v) {
}
}
}

public String[] LtoA(List<String> l) {
String[] a = new String[l.size()];
l.toArray(a);
Expand Down

0 comments on commit e36c876

Please sign in to comment.