diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index ad3ad463..b0b37f4a 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -3,6 +3,7 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@@ -110,6 +111,8 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("AUDIO", new EmbedTagExtractor());
extractors.put("TRACK", new EmbedTagExtractor());
extractors.put("SOURCE", new EmbedTagExtractor());
+ // language from HTML root element
+ extractors.put("HTML", new HTMLTagExtractor());
globalHrefAttributes = new HashSet();
globalHrefAttributes.add("background");
@@ -604,6 +607,23 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ /**
+  * Extractor for the HTML root element: records the values of the
+  * attributes {@code lang} and {@code xml:lang} as meta entries. Each entry
+  * gets the attribute path built by {@code makePath("HTML", attributeName)}
+  * as its "name" and the attribute value as its "content".
+  */
+ private static class HTMLTagExtractor implements TagExtractor {
+     @Override
+     public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+         // Typed as a list of strings: the loop below reads String elements,
+         // which does not compile against a raw list / raw iterator.
+         ArrayList<String> l = getAttrList(node, "lang", "xml:lang");
+         if (l != null) {
+             // The list is consumed as alternating (attribute name, value) entries.
+             Iterator<String> it = l.iterator();
+             while (it.hasNext()) {
+                 String name = it.next();
+                 // A trailing name without a following value is skipped.
+                 if (it.hasNext()) {
+                     String lang = it.next();
+                     data.addMeta("name", makePath("HTML", name), "content", lang);
+                 }
+             }
+         }
+     }
+ }
+
private static class IFrameTagExtractor implements TagExtractor {
@Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 15098011..18f35767 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
import java.util.logging.Logger;
import org.archive.extract.ExtractingResourceFactoryMapper;
@@ -240,6 +241,19 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
}
}
+ /**
+  * Asserts that {@code resource} is an {@code HTMLResource} and that the
+  * first entry of the "Metas" array below "Head" in its metadata holds,
+  * for every key of {@code langAttributes}, the value mapped to that key.
+  *
+  * @param resource       the extracted resource to inspect
+  * @param langAttributes expected key/value pairs of the first meta entry
+  * @throws JSONException if the metadata lacks one of the accessed elements
+  */
+ private void checkExtractHtmlLangAttribute(Resource resource, Map<String, String> langAttributes)
+         throws JSONException {
+     assertNotNull(resource);
+     assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+     JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
+     assertNotNull(metas);
+     // Only the first meta entry is verified.
+     JSONObject meta = metas.getJSONObject(0);
+     for (Map.Entry<String, String> expected : langAttributes.entrySet()) {
+         String key = expected.getKey();
+         Object actual = meta.get(key);
+         assertNotNull(actual);
+         // Expected value goes first so that failure messages are reported the right way round.
+         assertEquals("Wrong value of meta attribute " + key, expected.getValue(), actual);
+     }
+ }
+
public void testLinkExtraction() throws ResourceParseException, IOException {
String testFileName = "link-extraction-test.warc";
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
@@ -414,6 +428,18 @@ public void testTitleExtraction() throws ResourceParseException, IOException {
checkTitle(resource, "Testing title extraction with embedded SVG");
}
+ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, IOException {
+ String testFileName = "html-lang-attribute.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "zh-CN"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "cs-cz"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/xml:lang", "content", "es-MX"));
+ }
+
public void testHtmlParserEntityDecoding() {
String[][] entities = { //
/* ampersand */
diff --git a/src/test/resources/org/archive/resource/html/html-lang-attribute.warc b/src/test/resources/org/archive/resource/html/html-lang-attribute.warc
new file mode 100644
index 00000000..b74e5c18
--- /dev/null
+++ b/src/test/resources/org/archive/resource/html/html-lang-attribute.warc
@@ -0,0 +1,106 @@
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 169
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/1
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 185
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/2
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 158
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/3
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 319
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/4
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 189
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/5
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+