diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index ad3ad463..b0b37f4a 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -3,6 +3,7 @@
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
@@ -110,6 +111,8 @@ public class ExtractingParseObserver implements ParseObserver {
 		extractors.put("AUDIO", new EmbedTagExtractor());
 		extractors.put("TRACK", new EmbedTagExtractor());
 		extractors.put("SOURCE", new EmbedTagExtractor());
+		// language from HTML root element
+		extractors.put("HTML", new HTMLTagExtractor());

 		globalHrefAttributes = new HashSet();
 		globalHrefAttributes.add("background");
@@ -604,6 +607,32 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
 		}
 	}

+	/**
+	 * Records the language declared on the {@code <html>} root element: each
+	 * {@code lang} / {@code xml:lang} attribute returned by {@code getAttrList}
+	 * is added as a meta entry whose name is {@code makePath("HTML", attribute)}
+	 * and whose content is the attribute value.
+	 */
+	private static class HTMLTagExtractor implements TagExtractor {
+		@Override
+		public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+			// The list is consumed as alternating attribute-name / value entries.
+			ArrayList<String> l = getAttrList(node, "lang", "xml:lang");
+			if (l == null) {
+				return;
+			}
+			Iterator<String> it = l.iterator();
+			while (it.hasNext()) {
+				String name = it.next();
+				// Skip a trailing name that has no value after it.
+				if (it.hasNext()) {
+					String lang = it.next();
+					data.addMeta("name", makePath("HTML", name), "content", lang);
+				}
+			}
+		}
+	}
+
 	private static class IFrameTagExtractor implements TagExtractor {
 		@Override
 		public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 15098011..18f35767 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -3,6 +3,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 import java.util.logging.Logger;

 import org.archive.extract.ExtractingResourceFactoryMapper;
@@ -240,6 +241,24 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
 		}
 	}

+	/**
+	 * Asserts that the first entry of the resource's {@code Head.Metas} array
+	 * contains every key of {@code langAttributes} with the expected value.
+	 */
+	private void checkExtractHtmlLangAttribute(Resource resource, Map<String, String> langAttributes)
+			throws JSONException {
+		assertNotNull(resource);
+		assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+		JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
+		assertNotNull(metas);
+		JSONObject meta = metas.getJSONObject(0);
+		for (Map.Entry<String, String> expected : langAttributes.entrySet()) {
+			assertNotNull(meta.get(expected.getKey()));
+			// JUnit convention: expected value first, actual value second.
+			assertEquals(expected.getValue(), meta.get(expected.getKey()));
+		}
+	}
+
 	public void testLinkExtraction() throws ResourceParseException, IOException {
 		String testFileName = "link-extraction-test.warc";
 		ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
@@ -414,6 +433,22 @@ public void testTitleExtraction() throws ResourceParseException, IOException {
 		checkTitle(resource, "Testing title extraction with embedded SVG");
 	}

+	/**
+	 * Verifies that the {@code lang} / {@code xml:lang} attribute of the HTML
+	 * element is extracted for each of the five records read from the test WARC.
+	 */
+	public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, IOException {
+		String testFileName = "html-lang-attribute.warc";
+		ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+		ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+		ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+		checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
+		checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "zh-CN"));
+		checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "cs-cz"));
+		checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
+		checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/xml:lang", "content", "es-MX"));
+	}
+
 	public void testHtmlParserEntityDecoding() {
 		String[][] entities = { //
 				/* ampersand */
diff --git a/src/test/resources/org/archive/resource/html/html-lang-attribute.warc b/src/test/resources/org/archive/resource/html/html-lang-attribute.warc
new file mode 100644
index 00000000..b74e5c18
--- /dev/null
+++ b/src/test/resources/org/archive/resource/html/html-lang-attribute.warc
@@ -0,0 +1,106 @@
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 169
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/1
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 185
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/2
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 158
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/3
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 319
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/4
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 189
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/5
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+