Skip to content

Commit

Permalink
WAT extractor: add attributes of the <html> element as metadata,
Browse files Browse the repository at this point in the history
fixes #35
- add the language attributes (lang, xml:lang) of the <html> root element as metadata, e.g.
  { "name": "HTML@/lang", "content": "es-MX" }
  • Loading branch information
sebastian-nagel committed Dec 5, 2024
1 parent 456635c commit 581b43a
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -110,6 +111,8 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("AUDIO", new EmbedTagExtractor());
extractors.put("TRACK", new EmbedTagExtractor());
extractors.put("SOURCE", new EmbedTagExtractor());
// language from HTML root element
extractors.put("HTML", new HTMLTagExtractor());

globalHrefAttributes = new HashSet<String>();
globalHrefAttributes.add("background");
Expand Down Expand Up @@ -604,6 +607,23 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}

/**
 * Tag extractor for the {@code <html>} root element. It asks
 * {@code getAttrList} for the {@code lang} and {@code xml:lang} attributes and
 * records each (name, value) pair found as a metadata entry of the form
 * {@code "name": makePath("HTML", <name>), "content": <value>}.
 */
private static class HTMLTagExtractor implements TagExtractor {
    @Override
    public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
        // NOTE(review): presumably a flat list of alternating attribute names
        // and values - confirm against getAttrList.
        ArrayList<String> attrs = getAttrList(node, "lang", "xml:lang");
        if (attrs == null) {
            return;
        }
        // Walk the list two entries at a time; a trailing entry without a
        // partner is skipped.
        for (int i = 0; i + 1 < attrs.size(); i += 2) {
            String attrName = attrs.get(i);
            String attrValue = attrs.get(i + 1);
            data.addMeta("name", makePath("HTML", attrName), "content", attrValue);
        }
    }
}

private static class IFrameTagExtractor implements TagExtractor {
@Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import org.archive.extract.ExtractingResourceFactoryMapper;
Expand Down Expand Up @@ -240,6 +241,19 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
}
}

/**
 * Asserts that the first entry of the resource's {@code Head.Metas} array
 * holds every expected key/value pair.
 *
 * @param resource       extracted resource; must be non-null and an
 *                       {@link HTMLResource}
 * @param langAttributes expected keys and values of the first meta entry
 * @throws JSONException if the metadata lacks the {@code Head} object, the
 *                       {@code Metas} array or its first entry
 */
private void checkExtractHtmlLangAttribute(Resource resource, Map<String, String> langAttributes)
        throws JSONException {
    assertNotNull(resource);
    assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
    JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
    assertNotNull(metas);
    // Only the first meta entry is inspected.
    JSONObject meta = metas.getJSONObject(0);
    for (Map.Entry<String, String> expected : langAttributes.entrySet()) {
        String key = expected.getKey();
        Object actual = meta.get(key);
        assertNotNull("Missing value for meta key '" + key + "'", actual);
        // Expected value goes first so that a failure message reports
        // expected and actual the right way round.
        assertEquals("Unexpected value for meta key '" + key + "'", expected.getValue(), actual);
    }
}

public void testLinkExtraction() throws ResourceParseException, IOException {
String testFileName = "link-extraction-test.warc";
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
Expand Down Expand Up @@ -414,6 +428,18 @@ public void testTitleExtraction() throws ResourceParseException, IOException {
checkTitle(resource, "Testing title extraction with embedded SVG");
}

/**
 * Reads the records of {@code html-lang-attribute.warc} one after another
 * and checks, for each, the language metadata expected in its first meta
 * entry.
 */
public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, IOException {
    String testFileName = "html-lang-attribute.warc";
    ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
    ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
    ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
    // One { meta name, language value } pair per record, in reading order.
    String[][] expectedLangMetas = { //
            { "HTML@/lang", "en" }, //
            { "HTML@/lang", "zh-CN" }, //
            { "HTML@/lang", "cs-cz" }, //
            { "HTML@/lang", "en" }, //
            { "HTML@/xml:lang", "es-MX" } //
    };
    for (String[] expected : expectedLangMetas) {
        checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", expected[0], "content", expected[1]));
    }
}

public void testHtmlParserEntityDecoding() {
String[][] entities = { //
/* ampersand */
Expand Down
106 changes: 106 additions & 0 deletions src/test/resources/org/archive/resource/html/html-lang-attribute.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 169
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/1
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 185
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/2
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html lang="zh-CN" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 158
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/3
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html dir="ltr" lang="cs-cz">
<head>
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 319
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/4
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr" style="overflow-x: hidden !important;">
<head>
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 189
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/5
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="es-MX">
<head>
<title>Test</title>
</head>
<body/>
</html>



0 comments on commit 581b43a

Please sign in to comment.