Improve parsing of content charset from HTTP entity

HHN · Oct 24, 2024 · eea6ac1 · eea6ac1
1 parent f94962d
commit eea6ac1
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 29 deletions.
diff --git a/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java b/crawler4j-core/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java
@@ -21,10 +21,11 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
+import java.util.Optional;
 
+import org.apache.hc.core5.http.ContentType;
 import org.apache.hc.core5.http.Header;
 import org.apache.hc.core5.http.HttpEntity;
 import org.apache.hc.core5.util.ByteArrayBuffer;
@@ -167,17 +168,9 @@ public void load(HttpEntity entity, int maxBytes) throws IOException {
 
         contentEncoding = entity.getContentEncoding();
 
-        Charset charset;
-        try {
-            //FIX how to obtain it with http client 5
-            // charset = ContentType.getOrDefault(entity).getCharset();
-            charset = StandardCharsets.UTF_8;
-        } catch (Exception e) {
-            logger.warn("parse charset failed: {}", e.getMessage());
-            charset = StandardCharsets.UTF_8;
-        }
-
-        contentCharset = charset.displayName(Locale.ROOT);
+        contentCharset = Optional.ofNullable(ContentType.parseLenient(contentType).getCharset())
+                .orElse(StandardCharsets.UTF_8)
+                .displayName(Locale.ROOT);
         contentData = toByteArray(entity, maxBytes);
     }
 

diff --git a/crawler4j-core/src/test/java/edu/uci/ics/crawler4j/tests/crawler/PageTest.java b/crawler4j-core/src/test/java/edu/uci/ics/crawler4j/tests/crawler/PageTest.java
@@ -19,40 +19,45 @@
  */
 package edu.uci.ics.crawler4j.tests.crawler;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 import java.io.IOException;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.hc.core5.http.ContentType;
 import org.apache.hc.core5.http.HttpEntity;
 import org.apache.hc.core5.http.io.entity.BasicHttpEntity;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import edu.uci.ics.crawler4j.crawler.Page;
 import edu.uci.ics.crawler4j.test.Crawler4jTestUtils;
 import edu.uci.ics.crawler4j.url.WebURL;
 
-public class PageTest {
-
-	@Disabled("Not possible to create UNPARSABLE ContentType + charset is ALWAYS set to StandardCharsets.UTF_8; (needs fix)")
+class PageTest {
+
+	@Test
+	void charsetFromContentType() throws IOException {
+		String charset = "ISO-8859-1";
+		testCharset(ContentType.create("text/html", charset), charset);
+	}
+
 	@Test
-	void defaultCharsetFallback()
-			throws IOException
-	{
+	void defaultCharsetFallback() throws IOException {
+		ContentType contentType = ContentType.parse("text/html");
+		// "charset should fallback to UTF-8"
+		testCharset(contentType, "UTF-8");
+	}
+
+	private static void testCharset(ContentType contentType, String expectedCharset) throws IOException {
 		String content = "The content";
-		// "http entity with unsupported charset"
-		HttpEntity entity = new BasicHttpEntity(//
-				IOUtils.toInputStream(content, "UTF-8")//
-				, content.length()//
-				, ContentType.create("text/html", "UNPARSABLE")//
+		HttpEntity entity = new BasicHttpEntity(
+				IOUtils.toInputStream(content, "UTF-8"), content.length(), contentType
 		);
-
-		// "trying to load the entity"
+
 		WebURL u = Crawler4jTestUtils.newWebURLFactory().newWebUrl();
 		Page page = new Page(u);
 		page.load(entity, 1024);
-
-		// "charset should fallback to UTF-8"
-		"UTF-8".equals(page.getContentCharset());
+
+		assertEquals(expectedCharset, page.getContentCharset());
 	}
 }