Skip to content

Commit

Permalink
Improve parsing of content charset from HTTP entity
Browse files (browse the repository at this point in the history)
  • Loading branch information
valfirst authored and rzo1 committed Oct 24, 2024
1 parent f94962d commit eea6ac1
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 29 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,10 +21,11 @@

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Optional;

import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.util.ByteArrayBuffer;
Expand Down Expand Up @@ -167,17 +168,9 @@ public void load(HttpEntity entity, int maxBytes) throws IOException {

contentEncoding = entity.getContentEncoding();

Charset charset;
try {
//FIX how to obtain it with http client 5
// charset = ContentType.getOrDefault(entity).getCharset();
charset = StandardCharsets.UTF_8;
} catch (Exception e) {
logger.warn("parse charset failed: {}", e.getMessage());
charset = StandardCharsets.UTF_8;
}

contentCharset = charset.displayName(Locale.ROOT);
contentCharset = Optional.ofNullable(ContentType.parseLenient(contentType).getCharset())
.orElse(StandardCharsets.UTF_8)
.displayName(Locale.ROOT);
contentData = toByteArray(entity, maxBytes);
}

Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,40 +19,45 @@
*/
package edu.uci.ics.crawler4j.tests.crawler;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.io.IOException;

import org.apache.commons.io.IOUtils;
import org.apache.hc.core5.http.ContentType;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.io.entity.BasicHttpEntity;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.test.Crawler4jTestUtils;
import edu.uci.ics.crawler4j.url.WebURL;

public class PageTest {

@Disabled("Not possible to create UNPARSABLE ContentType + charset is ALWAYS set to StandardCharsets.UTF_8; (needs fix)")
class PageTest {

@Test
void charsetFromContentType() throws IOException {
String charset = "ISO-8859-1";
testCharset(ContentType.create("text/html", charset), charset);
}

@Test
void defaultCharsetFallback()
throws IOException
{
void defaultCharsetFallback() throws IOException {
ContentType contentType = ContentType.parse("text/html");
// "charset should fallback to UTF-8"
testCharset(contentType, "UTF-8");
}

private static void testCharset(ContentType contentType, String expectedCharset) throws IOException {
String content = "The content";
// "http entity with unsupported charset"
HttpEntity entity = new BasicHttpEntity(//
IOUtils.toInputStream(content, "UTF-8")//
, content.length()//
, ContentType.create("text/html", "UNPARSABLE")//
HttpEntity entity = new BasicHttpEntity(
IOUtils.toInputStream(content, "UTF-8"), content.length(), contentType
);

// "trying to load the entity"

WebURL u = Crawler4jTestUtils.newWebURLFactory().newWebUrl();
Page page = new Page(u);
page.load(entity, 1024);

// "charset should fallback to UTF-8"
"UTF-8".equals(page.getContentCharset());

assertEquals(expectedCharset, page.getContentCharset());
}
}

0 comments on commit eea6ac1

Please sign in to comment.