Skip to content

Commit

Permalink
HTTPCLIENT-2159: Fix handling of charset in ContentType for specific …
Browse files Browse the repository at this point in the history
…media types

 * Updated ContentType to ensure that no charset is included for media types like application/octet-stream, multipart/form-data, and image/*, which do not require a charset as per the RFC.
 * Refactored the toString() method to properly handle the omission of charset for these media types.
 * Adjusted the creation methods to better handle implicit charsets and added validation for reserved characters in MIME types.
  • Loading branch information
arturobernalg committed Sep 24, 2024
1 parent 077142a commit 4b6f836
Show file tree
Hide file tree
Showing 3 changed files with 270 additions and 41 deletions.
192 changes: 152 additions & 40 deletions httpcore5/src/main/java/org/apache/hc/core5/http/ContentType.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

import org.apache.hc.core5.annotation.Contract;
import org.apache.hc.core5.annotation.ThreadingBehavior;
import org.apache.hc.core5.http.message.BasicHeaderValueFormatter;
import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.message.ParserCursor;
Expand All @@ -67,105 +68,125 @@ public final class ContentType implements Serializable {
*/
private static final String CHARSET = "charset";

/**
* Flag indicating whether the charset is implicit.
* <p>
* When {@code implicitCharset} is {@code true}, the charset will not be explicitly
* included in the string representation of this {@link ContentType} (i.e., in the {@code toString} method),
* unless it is required for the given MIME type.
* If {@code implicitCharset} is {@code false}, the charset will always be included in the string representation,
* unless the MIME type explicitly disallows charset parameters (e.g., certain binary or multipart types).
* </p>
* <p>
* This flag is essential for proper handling of content types where the charset is either implied by the specification
* (e.g., JSON is always UTF-8) or where including a charset is not meaningful (e.g., binary types like
* {@code application/octet-stream}).
* </p>
*
* @since 5.5
*/
private final boolean implicitCharset;


// constants
public static final ContentType APPLICATION_ATOM_XML = create(
"application/atom+xml", StandardCharsets.UTF_8);
"application/atom+xml", StandardCharsets.UTF_8, false);
public static final ContentType APPLICATION_FORM_URLENCODED = create(
"application/x-www-form-urlencoded", StandardCharsets.ISO_8859_1);
"application/x-www-form-urlencoded", StandardCharsets.ISO_8859_1, true);
public static final ContentType APPLICATION_JSON = create(
"application/json", StandardCharsets.UTF_8);
"application/json", StandardCharsets.UTF_8, true);

/**
* Public constant media type for {@code application/x-ndjson}.
* @since 5.1
*/
public static final ContentType APPLICATION_NDJSON = create(
"application/x-ndjson", StandardCharsets.UTF_8);
"application/x-ndjson", StandardCharsets.UTF_8, true);

public static final ContentType APPLICATION_OCTET_STREAM = create(
"application/octet-stream", (Charset) null);
"application/octet-stream", (Charset) null, true);
/**
* Public constant media type for {@code application/pdf}.
* @since 5.1
*/
public static final ContentType APPLICATION_PDF = create(
"application/pdf", StandardCharsets.UTF_8);
"application/pdf", (Charset) null, true);

public static final ContentType APPLICATION_SOAP_XML = create(
"application/soap+xml", StandardCharsets.UTF_8);
"application/soap+xml", StandardCharsets.UTF_8, false);
public static final ContentType APPLICATION_SVG_XML = create(
"application/svg+xml", StandardCharsets.UTF_8);
"application/svg+xml", StandardCharsets.UTF_8, false);
public static final ContentType APPLICATION_XHTML_XML = create(
"application/xhtml+xml", StandardCharsets.UTF_8);
"application/xhtml+xml", StandardCharsets.UTF_8, false);
public static final ContentType APPLICATION_XML = create(
"application/xml", StandardCharsets.UTF_8);
"application/xml", StandardCharsets.UTF_8, false);
/**
* Public constant media type for {@code application/problem+json}.
* @see <a href="https://tools.ietf.org/html/rfc7807#section-6.1">Problem Details for HTTP APIs, 6.1. application/problem+json</a>
* @since 5.1
*/
public static final ContentType APPLICATION_PROBLEM_JSON = create(
"application/problem+json", StandardCharsets.UTF_8);
"application/problem+json", StandardCharsets.UTF_8, true);
/**
* Public constant media type for {@code application/problem+xml}.
* @see <a href="https://tools.ietf.org/html/rfc7807#section-6.2">Problem Details for HTTP APIs, 6.2. application/problem+xml</a>
* @since 5.1
*/
public static final ContentType APPLICATION_PROBLEM_XML = create(
"application/problem+xml", StandardCharsets.UTF_8);
"application/problem+xml", StandardCharsets.UTF_8, false);

/**
* Public constant media type for {@code application/rss+xml}.
* @since 5.1
*/
public static final ContentType APPLICATION_RSS_XML = create(
"application/rss+xml", StandardCharsets.UTF_8);
"application/rss+xml", StandardCharsets.UTF_8, false);

public static final ContentType IMAGE_BMP = create(
"image/bmp");
"image/bmp", (Charset) null, true);
public static final ContentType IMAGE_GIF = create(
"image/gif");
"image/gif", (Charset) null, true);
public static final ContentType IMAGE_JPEG = create(
"image/jpeg");
"image/jpeg", (Charset) null, true);
public static final ContentType IMAGE_PNG = create(
"image/png");
"image/png", (Charset) null, true);
public static final ContentType IMAGE_SVG = create(
"image/svg+xml");
"image/svg+xml", (Charset) null, false);
public static final ContentType IMAGE_TIFF = create(
"image/tiff");
"image/tiff", (Charset) null, true);
public static final ContentType IMAGE_WEBP = create(
"image/webp");
"image/webp", (Charset) null, true);
public static final ContentType MULTIPART_FORM_DATA = create(
"multipart/form-data", StandardCharsets.ISO_8859_1);
"multipart/form-data", StandardCharsets.ISO_8859_1, true);

/**
* Public constant media type for {@code multipart/mixed}.
* @since 5.1
*/
public static final ContentType MULTIPART_MIXED = create(
"multipart/mixed", StandardCharsets.ISO_8859_1);
"multipart/mixed", StandardCharsets.ISO_8859_1, true);

/**
* Public constant media type for {@code multipart/related}.
* @since 5.1
*/
public static final ContentType MULTIPART_RELATED = create(
"multipart/related", StandardCharsets.ISO_8859_1);
"multipart/related", StandardCharsets.ISO_8859_1, true);

public static final ContentType TEXT_HTML = create(
"text/html", StandardCharsets.UTF_8);
"text/html", StandardCharsets.UTF_8, true);

/**
* Public constant media type for {@code text/markdown}.
* @since 5.1
*/
public static final ContentType TEXT_MARKDOWN = create(
"text/markdown", StandardCharsets.UTF_8);
"text/markdown", StandardCharsets.UTF_8, false);

public static final ContentType TEXT_PLAIN = create(
"text/plain", StandardCharsets.UTF_8);
"text/plain", StandardCharsets.UTF_8, true);
public static final ContentType TEXT_XML = create(
"text/xml", StandardCharsets.UTF_8);
"text/xml", StandardCharsets.UTF_8, false);
/**
* Public constant media type for {@code text/event-stream}.
* @see <a href="https://www.w3.org/TR/eventsource/">Server-Sent Events W3C recommendation</a>
Expand All @@ -175,7 +196,7 @@ public final class ContentType implements Serializable {
"text/event-stream", StandardCharsets.UTF_8);

public static final ContentType WILDCARD = create(
"*/*", (Charset) null);
"*/*", (Charset) null, true);

/**
* An empty immutable {@code NameValuePair} array.
Expand Down Expand Up @@ -225,18 +246,42 @@ public final class ContentType implements Serializable {
ContentType(
final String mimeType,
final Charset charset) {
this.mimeType = mimeType;
this.charset = charset;
this.params = null;
this (mimeType,charset,null, false);
}

ContentType(
final String mimeType,
final Charset charset,
final NameValuePair[] params) {

this (mimeType,charset,params, false);
}

/**
* Constructs a new instance of {@link ContentType} with the given MIME type, charset, parameters,
* and an implicit charset flag.
* <p>
* If {@code implicitCharset} is set to {@code true}, the charset will not be explicitly
* included in the string representation of this content type (i.e., the {@code toString} method)
* unless it is required for the given MIME type.
* If {@code implicitCharset} is {@code false}, the charset will always be included in the
* string representation unless the MIME type is one of those that should not include a charset.
* </p>
*
* @param mimeType the MIME type. It must not be {@code null} or empty and must not contain
* reserved characters such as {@code <">, <;>, <,>}.
* @param charset the character set for this content type. This can be {@code null}.
* @param params optional parameters for this content type. If {@code null}, no additional
* parameters will be included.
* @param implicitCharset whether the charset is implicit. If {@code true}, the charset is not
* included in the {@code toString} output unless required.
* @since 5.5
*/
ContentType(final String mimeType, final Charset charset, final NameValuePair[] params, final boolean implicitCharset) {
this.mimeType = mimeType;
this.charset = charset;
this.params = params;
this.implicitCharset = implicitCharset;
this.params = params != null ? params.clone() : null;
}

public String getMimeType() {
Expand Down Expand Up @@ -288,8 +333,19 @@ public String toString() {
buf.append(this.mimeType);
if (this.params != null) {
buf.append("; ");
MessageSupport.formatParameters(buf, this.params);
} else if (this.charset != null) {
boolean first = true;
for (final NameValuePair param : params) {
if (!first) {
buf.append("; ");
}
if (param.getName().equalsIgnoreCase("charset") && implicitCharset) {
continue;
}
BasicHeaderValueFormatter.INSTANCE.formatNameValuePair(buf, param, false);
first = false;
}
} else if (this.charset != null && !implicitCharset) {
// Append charset only if it's not one of the types that shouldn't have charset
buf.append("; charset=");
buf.append(this.charset.name());
}
Expand All @@ -306,6 +362,58 @@ private static boolean valid(final String s) {
return true;
}


/**
* Creates a new instance of {@link ContentType} with the given MIME type, charset,
* and an implicit charset flag.
* <p>
* This method allows specifying whether the charset should be implicit or explicit.
* If {@code implicitCharset} is set to {@code true}, the charset will not be explicitly
* included in the string representation of this content type (i.e., the {@code toString} method),
* unless it is required for the given MIME type. If {@code implicitCharset} is {@code false},
* the charset will always be included unless the MIME type does not allow a charset.
* </p>
*
* @param mimeType the MIME type. It must not be {@code null} or empty and must not contain
* reserved characters such as {@code <">, <;>, <,>}.
* @param charset the character set for this content type. This can be {@code null}.
* @param implicitCharset whether the charset is implicit. If {@code true}, the charset is
* not included in the {@code toString} output unless required.
* @return a new instance of {@link ContentType}.
* @throws IllegalArgumentException if the MIME type is invalid or contains reserved characters.
* @since 5.5
*/
public static ContentType create(final String mimeType, final Charset charset, final boolean implicitCharset) {
final String normalizedMimeType = TextUtils.toLowerCase(Args.notBlank(mimeType, "MIME type"));
Args.check(valid(normalizedMimeType), "MIME type may not contain reserved characters");
return new ContentType(normalizedMimeType, charset, null, implicitCharset);
}

/**
* Creates a new instance of {@link ContentType} with the given MIME type, parameters,
* and an implicit charset flag.
* <p>
* This method allows specifying additional parameters for the content type and whether
* the charset should be implicit or explicit. If {@code implicitCharset} is {@code true},
* the charset will not be included in the string representation unless required.
* </p>
*
* @param mimeType the MIME type. It must not be {@code null} or empty and must not contain
* reserved characters such as {@code <">, <;>, <,>}.
* @param implicitCharset whether the charset is implicit. If {@code true}, the charset is
* not included in the {@code toString} output unless required.
* @param params optional parameters for the content type. Can be {@code null}.
* @return a new instance of {@link ContentType}.
* @throws IllegalArgumentException if the MIME type is invalid or contains reserved characters.
* @throws UnsupportedCharsetException if the charset provided in the parameters is not supported.
* @since 5.5
*/
public static ContentType create(final String mimeType, final boolean implicitCharset, final NameValuePair... params) throws UnsupportedCharsetException {
final String type = TextUtils.toLowerCase(Args.notBlank(mimeType, "MIME type"));
Args.check(valid(type), "MIME type may not contain reserved characters");
return create(mimeType, params != null ? params.clone() : null, implicitCharset);
}

/**
* Creates a new instance of {@link ContentType}.
*
Expand All @@ -315,9 +423,7 @@ private static boolean valid(final String s) {
* @return content type
*/
public static ContentType create(final String mimeType, final Charset charset) {
final String normalizedMimeType = TextUtils.toLowerCase(Args.notBlank(mimeType, "MIME type"));
Args.check(valid(normalizedMimeType), "MIME type may not contain reserved characters");
return new ContentType(normalizedMimeType, charset);
return create(mimeType, charset, false);
}

/**
Expand All @@ -326,7 +432,9 @@ public static ContentType create(final String mimeType, final Charset charset) {
* @param mimeType MIME type. It may not be {@code null} or empty. It may not contain
* characters {@code <">, <;>, <,>} reserved by the HTTP specification.
* @return content type
* @deprecated Use {@link #create(String, Charset, boolean)} with an explicit charset or null, and specify if the charset is implicit.
*/
@Deprecated
public static ContentType create(final String mimeType) {
return create(mimeType, (Charset) null);
}
Expand Down Expand Up @@ -356,6 +464,10 @@ private static ContentType create(final HeaderElement helem, final boolean stric
}

private static ContentType create(final String mimeType, final NameValuePair[] params, final boolean strict) {
return create(mimeType, params != null ? params.clone() : null, strict, false);
}

private static ContentType create(final String mimeType, final NameValuePair[] params, final boolean strict, final boolean implicitCharset) {
Charset charset = null;
if (params != null) {
for (final NameValuePair param : params) {
Expand All @@ -374,7 +486,7 @@ private static ContentType create(final String mimeType, final NameValuePair[] p
}
}
}
return new ContentType(mimeType, charset, params != null && params.length > 0 ? params : null);
return new ContentType(mimeType, charset, params != null && params.length > 0 ? params : null, implicitCharset);
}

/**
Expand Down Expand Up @@ -517,11 +629,11 @@ public ContentType withParameters(
for (final Map.Entry<String, String> entry: paramMap.entrySet()) {
newParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
}
return create(this.getMimeType(), newParams.toArray(EMPTY_NAME_VALUE_PAIR_ARRAY), true);
return create(this.getMimeType(), newParams.toArray(EMPTY_NAME_VALUE_PAIR_ARRAY), true, this.implicitCharset);
}

public boolean isSameMimeType(final ContentType contentType) {
return contentType != null && mimeType.equalsIgnoreCase(contentType.getMimeType());
}

}
}
Loading

0 comments on commit 4b6f836

Please sign in to comment.