diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 4b5febe7cf..bdef854fd9 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2885,6 +2885,16 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
+
+ store.protocol.versions
+ false
+
+ Store protocol versions in response metadata: HTTP and SSL/TLS
+ versions, SSL/TLS cipher suites and related information depending
+ on the protocol implementation. Supported by: protocol-okhttp.
+
+
+
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 3fb083ace1..2d86ddfe64 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -45,6 +45,7 @@
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.net.protocols.ProtocolLogUtil;
+import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -136,6 +137,7 @@ public class FetcherThread extends Thread {
private boolean storingContent;
private boolean storingWarc;
private boolean storing404s;
+ private boolean storingProtocolVersions;
private boolean signatureWithoutParsing;
@@ -195,6 +197,8 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
this.storingContent = storingContent;
this.storing404s = conf.getBoolean("fetcher.store.404s", false);
this.storingWarc = Fetcher.isStoringWarc(conf);
+ this.storingProtocolVersions = conf.getBoolean("store.protocol.versions",
+ false);
this.pages = pages;
this.bytes = bytes;
@@ -428,6 +432,10 @@ public void run() {
}
context.getCounter("FetcherStatus", status.getName()).increment(1);
+ if (storingProtocolVersions && content != null) {
+ countProtocolVersions(content.getMetadata());
+ }
+
switch (status.getCode()) {
case ProtocolStatus.WOULDBLOCK:
@@ -684,6 +692,24 @@ private void logError(Text url, String message) {
errors.incrementAndGet();
}
+ private void countProtocolVersions(Metadata contentMetadata) {
+ if (contentMetadata == null) {
+ return;
+ }
+ String versionStr = contentMetadata.get(Response.PROTOCOL_VERSIONS);
+ if (versionStr != null) {
+ String[] versions = versionStr.split(",");
+ if (versions.length >= 1) {
+ context.getCounter("HttpProtocolVersion", versions[0]).increment(1);
+ } else {
+ context.getCounter("HttpProtocolVersion", "unknown").increment(1);
+ }
+ for (int i = 1; i < versions.length; i++) {
+ context.getCounter("TlsProtocolVersion", versions[i]).increment(1);
+ }
+ }
+ }
+
private ParseStatus output(Text key, CrawlDatum datum, Content content,
ProtocolStatus pstatus, int status) throws InterruptedException{
return output(key, datum, content, pstatus, status, 0);
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index 514ce85613..3fbe932667 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -41,6 +41,18 @@ public interface Response extends HttpHeaders {
*/
public static final String IP_ADDRESS = "_ip_";
+ /**
+ * Key to hold the HTTP and SSL/TLS protocol versions if
+ * store.protocol.versions
is true.
+ */
+ public static final String PROTOCOL_VERSIONS = "_protocol_versions_";
+
+ /**
+ * Key to hold the SSL/TLS cipher suites if
+ * store.protocol.versions
is true.
+ */
+ public static final String CIPHER_SUITES = "_cipher_suites_";
+
/**
* Key to hold the time when the page has been fetched
*/
diff --git a/src/java/org/commoncrawl/util/WarcCdxWriter.java b/src/java/org/commoncrawl/util/WarcCdxWriter.java
index 5032d4a3ae..305aa99826 100644
--- a/src/java/org/commoncrawl/util/WarcCdxWriter.java
+++ b/src/java/org/commoncrawl/util/WarcCdxWriter.java
@@ -98,12 +98,13 @@ public WarcCdxWriter(OutputStream warcOut, OutputStream cdxOut,
public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String warcProfile, final Date refersToDate,
- final String payloadDigest, final String blockDigest, byte[] block,
+ final String payloadDigest, final String blockDigest,
+ String[] protocolVersions, String[] cipherSuites, byte[] block,
Content content) throws IOException {
long offset = countingOut.getByteCount();
URI recordId = super.writeWarcRevisitRecord(targetUri, ip, httpStatusCode,
date, warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest,
- blockDigest, block, content);
+ blockDigest, protocolVersions, cipherSuites, block, content);
long length = (countingOut.getByteCount() - offset);
writeCdxLine(targetUri, date, offset, length, payloadDigest, content, true,
null, null);
@@ -114,12 +115,12 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
public URI writeWarcResponseRecord(final URI targetUri, final String ip,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String payloadDigest, final String blockDigest,
- final String truncated, final byte[] block, Content content)
- throws IOException {
+ final String truncated, String[] protocolVersions, String[] cipherSuites,
+ final byte[] block, Content content) throws IOException {
long offset = countingOut.getByteCount();
URI recordId = super.writeWarcResponseRecord(targetUri, ip, httpStatusCode,
date, warcinfoId, relatedId, payloadDigest, blockDigest, truncated,
- block, content);
+ protocolVersions, cipherSuites, block, content);
long length = (countingOut.getByteCount() - offset);
String redirectLocation = null;
if (isRedirect(httpStatusCode)) {
diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java
index 4585dddbc0..56820fe6b2 100644
--- a/src/java/org/commoncrawl/util/WarcRecordWriter.java
+++ b/src/java/org/commoncrawl/util/WarcRecordWriter.java
@@ -76,6 +76,15 @@ class WarcRecordWriter extends RecordWriter {
protected static final String X_HIDE_HEADER = "X-Crawler-";
public static final String WARC_WRITER_COUNTER_GROUP = "WARC-Writer";
+ protected static final Pattern STATUS_LINE_PATTERN = Pattern
+ .compile("^HTTP/1\\.[01] [0-9]{3}(?: .*)?$");
+ protected static final Pattern WS_PATTERN = Pattern.compile("\\s+");
+ protected static final Pattern HTTP_VERSION_PATTERN = Pattern
+ .compile("^HTTP/1\\.[01]$");
+ protected static final Pattern HTTP_STATUS_CODE_PATTERN = Pattern
+ .compile("^[0-9]{3}$");
+ protected static final String HTTP_VERSION_FALLBACK = "HTTP/1.1";
+
private TaskAttemptContext context;
private DataOutputStream warcOut;
private WarcWriter warcWriter;
@@ -291,20 +300,54 @@ public static String formatHttpHeaders(String statusLine, List headers)
}
/**
+ * Fix the HTTP version in the request line - replace HTTP/2
+ * by HTTP/1.1
({@link #HTTP_VERSION_FALLBACK}).
+ *
+ * See also {@link #fixHttpHeaders(String, int)}
+ *
+ * @param headers
+ * HTTP 1.1 or 1.0 request header string, CR-LF-separated lines,
+ * first line is the request line
+ * @return safe HTTP request header
+ */
+ public static String fixHttpRequestHeaders(String headers) {
+ String http2version = " HTTP/2\r\n";
+ int pos = headers.indexOf(http2version);
+ if (pos >= 0) {
+ StringBuilder replacement = new StringBuilder();
+ String statusLinePrefix = headers.substring(0, pos);
+ if (statusLinePrefix.indexOf(CRLF) > 0) {
+ // match in a subsequent header line (should happen rarely, if at all)
+ return headers;
+ }
+ replacement.append(statusLinePrefix);
+ replacement.append(' ');
+ replacement.append(HTTP_VERSION_FALLBACK);
+ replacement.append(CRLF);
+ replacement.append(headers.substring(pos + http2version.length()));
+ return replacement.toString();
+ }
+ return headers;
+ }
+
+ /**
* Modify verbatim HTTP response headers: fix, remove or replace headers
* Content-Length
, Content-Encoding
and
* Transfer-Encoding
which may confuse WARC readers. Ensure that
* returned header end with a single empty line (\r\n\r\n
).
*
+ * If the HTTP version in the status line is HTTP/2
, replace it
+ * by HTTP/1.1
({@link #HTTP_VERSION_FALLBACK}).
+ *
* @param headers
* HTTP 1.1 or 1.0 response header string, CR-LF-separated lines,
- * first line is status line
+ * first line is the status line
* @return safe HTTP response header
*/
public static String fixHttpHeaders(String headers, int contentLength) {
int start = 0, lineEnd = 0, last = 0, trailingCrLf= 0;
boolean hasContentLength = false;
- StringBuilder replace = new StringBuilder();
+ StringBuilder replacement = new StringBuilder();
while (start < headers.length()) {
lineEnd = headers.indexOf(CRLF, start);
trailingCrLf = 1;
@@ -323,7 +366,32 @@ public static String fixHttpHeaders(String headers, int contentLength) {
boolean valid = true;
if (start == 0) {
// status line (without colon)
- // TODO: http/2
+ final String statusLine = headers.substring(0, lineEnd);
+ if (!STATUS_LINE_PATTERN.matcher(statusLine).matches()) {
+ final String[] parts = WS_PATTERN
+ .split(headers.substring(0, lineEnd), 3);
+ if (parts.length < 2
+ || !HTTP_STATUS_CODE_PATTERN.matcher(parts[1]).matches()) {
+ // nothing we can do here, leave status line as is
+ LOG.warn(
+ "WARC parsers may fail on non-standard HTTP 1.0 / 1.1 response status line: {}",
+ statusLine);
+ } else {
+ if (HTTP_VERSION_PATTERN.matcher(parts[0]).matches()) {
+ replacement.append(parts[0]);
+ } else {
+ replacement.append(HTTP_VERSION_FALLBACK);
+ }
+ replacement.append(' ');
+ replacement.append(parts[1]); // status code
+ replacement.append(' ');
+ if (parts.length == 3) {
+ replacement.append(parts[2]); // message
+ }
+ replacement.append(CRLF);
+ last = lineEnd + 2 * trailingCrLf;
+ }
+ }
} else if ((lineEnd + 4) == headers.length()
&& headers.endsWith(CRLF + CRLF)) {
// ok, trailing empty line
@@ -339,7 +407,7 @@ public static String fixHttpHeaders(String headers, int contentLength) {
}
if (!valid) {
if (last < start) {
- replace.append(headers.substring(last, start));
+ replacement.append(headers.substring(last, start));
}
last = lineEnd + 2 * trailingCrLf;
}
@@ -367,18 +435,18 @@ public static String fixHttpHeaders(String headers, int contentLength) {
}
if (needsFix) {
if (last < start) {
- replace.append(headers.substring(last, start));
+ replacement.append(headers.substring(last, start));
}
last = lineEnd + 2 * trailingCrLf;
- replace.append(X_HIDE_HEADER)
+ replacement.append(X_HIDE_HEADER)
.append(headers.substring(start, lineEnd + 2 * trailingCrLf));
if (trailingCrLf == 0) {
- replace.append(CRLF);
+ replacement.append(CRLF);
trailingCrLf = 1;
}
if (name.equalsIgnoreCase("content-length")) {
// add effective uncompressed and unchunked length of content
- replace.append("Content-Length").append(COLONSP)
+ replacement.append("Content-Length").append(COLONSP)
.append(contentLength).append(CRLF);
}
}
@@ -388,17 +456,17 @@ public static String fixHttpHeaders(String headers, int contentLength) {
if (last > 0 || trailingCrLf != 2 || !hasContentLength) {
if (last < headers.length()) {
// append trailing headers
- replace.append(headers.substring(last));
+ replacement.append(headers.substring(last));
}
if (!hasContentLength) {
- replace.append("Content-Length").append(COLONSP).append(contentLength)
+ replacement.append("Content-Length").append(COLONSP).append(contentLength)
.append(CRLF);
}
while (trailingCrLf < 2) {
- replace.append(CRLF);
+ replacement.append(CRLF);
trailingCrLf++;
}
- return replace.toString();
+ return replacement.toString();
}
return headers;
}
@@ -558,6 +626,8 @@ public synchronized void write(Text key, WarcCapture value)
int httpStatusCode = 200;
String fetchDuration = null;
String truncatedReason = null;
+ String[] protocolVersions = null;
+ String[] cipherSuites = null;
if (value.datum != null) {
date = new Date(value.datum.getFetchTime());
@@ -666,6 +736,12 @@ public synchronized void write(Text key, WarcCapture value)
case Response.TRUNCATED_CONTENT_REASON:
truncatedReason = val;
break;
+ case Response.PROTOCOL_VERSIONS:
+ protocolVersions = val.split(",");
+ break;
+ case Response.CIPHER_SUITES:
+ cipherSuites = val.split(",");
+ break;
case Nutch.SEGMENT_NAME_KEY:
case Nutch.FETCH_STATUS_KEY:
case Nutch.SCORE_KEY:
@@ -739,7 +815,9 @@ public synchronized void write(Text key, WarcCapture value)
URI requestId = null;
if (verbatimRequestHeaders != null) {
requestId = writer.writeWarcRequestRecord(targetUri, ip, date, infoId,
- verbatimRequestHeaders.getBytes(StandardCharsets.UTF_8));
+ protocolVersions, cipherSuites,
+ fixHttpRequestHeaders(verbatimRequestHeaders)
+ .getBytes(StandardCharsets.UTF_8));
}
if (generateCdx) {
@@ -804,7 +882,8 @@ public synchronized void write(Text key, WarcCapture value)
String payloadDigest = null;
writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, infoId,
requestId, WarcWriter.PROFILE_REVISIT_NOT_MODIFIED, lastModifiedDate,
- payloadDigest, blockDigest, responseHeaderBytes, value.content);
+ payloadDigest, blockDigest, protocolVersions, cipherSuites,
+ responseHeaderBytes, value.content);
} else {
StringBuilder responsesb = new StringBuilder(4096);
responsesb.append(responseHeaders);
@@ -822,7 +901,7 @@ public synchronized void write(Text key, WarcCapture value)
String blockDigest = getSha1DigestWithAlg(responseBytes);
URI responseId = writer.writeWarcResponseRecord(targetUri, ip,
httpStatusCode, date, infoId, requestId, payloadDigest, blockDigest,
- truncatedReason, responseBytes, value.content);
+ truncatedReason, protocolVersions, cipherSuites, responseBytes, value.content);
// Write metadata record
StringBuilder metadatasb = new StringBuilder(4096);
diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java
index 0c866d76fe..f59a585710 100644
--- a/src/java/org/commoncrawl/util/WarcWriter.java
+++ b/src/java/org/commoncrawl/util/WarcWriter.java
@@ -20,22 +20,33 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
-import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.TimeZone;
import java.util.UUID;
+import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.LinkedListMultimap;
+import com.google.common.collect.Multimap;
public class WarcWriter {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
protected OutputStream out = null;
protected OutputStream origOut = null;
@@ -68,6 +79,10 @@ public class WarcWriter {
private static final String WARC_IDENTIFIED_PAYLOAD_TYPE = "WARC-Identified-Payload-Type";
private static final String WARC_PROFILE = "WARC-Profile";
private static final String WARC_FILENAME = "WARC-Filename";
+ /** WARC-Protocol, see https://github.com/iipc/warc-specifications/issues/42 */
+ private static final String WARC_PROTOCOL = "WARC-Protocol";
+ /** WARC-Cipher-Suite, see https://github.com/iipc/warc-specifications/issues/94 */
+ private static final String WARC_CIPHER_SUITE = "WARC-Cipher-Suite";
public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = "http://netpreserve.org/warc/1.1/revisit/identical-payload-digest";
public static final String PROFILE_REVISIT_NOT_MODIFIED = "http://netpreserve.org/warc/1.1/revisit/server-not-modified";
@@ -98,6 +113,36 @@ public WarcWriter(final OutputStream out) {
isoDate.setTimeZone(TimeZone.getTimeZone("GMT"));
}
+ /**
+ * Class to hold HTTP and SSL/TLS protocol versions to fill the
+ * WARC-Protocol
field. Protocol names require normalization, see
+ * https://github.com/iipc/warc-specifications/issues/42
+ */
+ public static class WarcProtocol {
+ public static Set protocols = Set.of("dns", "ftp", "gemini",
+ "gopher", "http/0.9", "http/1.0", "http/1.1", "h2", "h2c", "spdy/1",
+ "spdy/2", "spdy/3", "ssl/2", "ssl/3", "tls/1.0", "tls/1.1", "tls/1.2",
+ "tls/1.3");
+ public static Pattern vPattern = Pattern.compile("^(?:ssl|tls)v[0-9]",
+ Pattern.CASE_INSENSITIVE);
+ private String name;
+
+ public WarcProtocol(final String protocol) {
+ name = protocol.toLowerCase(Locale.ROOT);
+ if (vPattern.matcher(name).find()) {
+ name = name.substring(0, 3) + '/' + name.substring(4);
+ }
+ if (!protocols.contains(name)) {
+ LOG.warn("Unknown protocol name or version: {}", name);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return name;
+ }
+ }
+
/**
*
* @return record id for the warcinfo record
@@ -107,11 +152,11 @@ public URI writeWarcinfoRecord(String filename, String hostname,
String publisher, String operator, String software, String isPartOf,
String description, Date date)
throws IOException {
- Map extra = new LinkedHashMap();
+ Multimap extra = LinkedListMultimap.create();
extra.put(WARC_FILENAME, filename);
StringBuilder sb = new StringBuilder();
- Map settings = new LinkedHashMap();
+ Multimap settings = LinkedListMultimap.create();
if (isPartOf != null) {
settings.put("isPartOf", isPartOf);
@@ -157,12 +202,22 @@ public URI writeWarcinfoRecord(String filename, String hostname,
}
public URI writeWarcRequestRecord(final URI targetUri, final String ip,
- final Date date, final URI warcinfoId, final byte[] block)
- throws IOException {
- Map extra = new LinkedHashMap();
+ final Date date, final URI warcinfoId, String[] protocolVersions,
+ String[] cipherSuites, final byte[] block) throws IOException {
+ Multimap extra = LinkedListMultimap.create();
extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">");
extra.put(WARC_IP_ADDRESS, ip);
extra.put(WARC_TARGET_URI, targetUri.toASCIIString());
+ if (protocolVersions != null) {
+ for (String pVersion : protocolVersions) {
+ extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString());
+ }
+ }
+ if (cipherSuites != null) {
+ for (String cipher : cipherSuites) {
+ extra.put(WARC_CIPHER_SUITE, cipher);
+ }
+ }
URI recordId = getRecordId();
writeRecord(WARC_REQUEST, date, "application/http; msgtype=request",
@@ -173,15 +228,25 @@ public URI writeWarcRequestRecord(final URI targetUri, final String ip,
public URI writeWarcResponseRecord(final URI targetUri, final String ip,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String payloadDigest, final String blockDigest,
- final String truncated, final byte[] block, Content content)
- throws IOException {
- Map extra = new LinkedHashMap();
+ final String truncated, String[] protocolVersions, String[] cipherSuites,
+ final byte[] block, Content content) throws IOException {
+ Multimap extra = LinkedListMultimap.create();
extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">");
if (relatedId != null) {
extra.put(WARC_CONCURRENT_TO, "<" + relatedId.toString() + ">");
}
extra.put(WARC_IP_ADDRESS, ip);
extra.put(WARC_TARGET_URI, targetUri.toASCIIString());
+ if (protocolVersions != null) {
+ for (String pVersion : protocolVersions) {
+ extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString());
+ }
+ }
+ if (cipherSuites != null) {
+ for (String cipher : cipherSuites) {
+ extra.put(WARC_CIPHER_SUITE, cipher);
+ }
+ }
if (payloadDigest != null) {
extra.put(WARC_PAYLOAD_DIGEST, payloadDigest);
@@ -206,13 +271,24 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip,
public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
final int httpStatusCode, final Date date, final URI warcinfoId,
final URI relatedId, final String warcProfile, final Date refersToDate,
- final String payloadDigest, final String blockDigest, byte[] block,
+ final String payloadDigest, final String blockDigest,
+ String[] protocolVersions, String[] cipherSuites, byte[] block,
Content content) throws IOException {
- Map extra = new LinkedHashMap();
+ Multimap extra = LinkedListMultimap.create();
extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">");
extra.put(WARC_REFERS_TO, "<" + relatedId.toString() + ">");
extra.put(WARC_IP_ADDRESS, ip);
extra.put(WARC_TARGET_URI, targetUri.toASCIIString());
+ if (protocolVersions != null) {
+ for (String pVersion : protocolVersions) {
+ extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString());
+ }
+ }
+ if (cipherSuites != null) {
+ for (String cipher : cipherSuites) {
+ extra.put(WARC_CIPHER_SUITE, cipher);
+ }
+ }
// WARC-Refers-To-Target-URI only useful for revisit by digest
extra.put(WARC_REFERS_TO_TARGET_URI, targetUri.toASCIIString());
if (refersToDate != null) {
@@ -235,7 +311,7 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip,
public URI writeWarcMetadataRecord(final URI targetUri, final Date date,
final URI warcinfoId, final URI relatedId, final String blockDigest,
final byte[] block) throws IOException {
- Map extra = new LinkedHashMap();
+ Multimap extra = LinkedListMultimap.create();
extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">");
extra.put(WARC_CONCURRENT_TO, "<" + relatedId.toString() + ">");
extra.put(WARC_TARGET_URI, targetUri.toASCIIString());
@@ -253,7 +329,7 @@ public URI writeWarcMetadataRecord(final URI targetUri, final Date date,
public URI writeWarcConversionRecord(final URI targetUri, final Date date,
final URI warcinfoId, final URI relatedId, final String blockDigest,
final String contentType, final byte[] block) throws IOException {
- Map extra = new LinkedHashMap();
+ Multimap extra = LinkedListMultimap.create();
extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">");
extra.put(WARC_REFERS_TO, "<" + relatedId.toString() + ">");
extra.put(WARC_TARGET_URI, targetUri.toASCIIString());
@@ -268,13 +344,14 @@ public URI writeWarcConversionRecord(final URI targetUri, final Date date,
}
protected void writeRecord(final String type, final Date date,
- final String contentType, final URI recordId, Map extra,
- final InputStream content, final long contentLength) throws IOException {
+ final String contentType, final URI recordId,
+ Multimap extra, final InputStream content,
+ final long contentLength) throws IOException {
StringBuilder sb = new StringBuilder(4096);
sb.append(WARC_VERSION).append(CRLF);
- Map header = new LinkedHashMap();
+ Multimap header = LinkedListMultimap.create();
header.put(WARC_TYPE, type);
header.put(WARC_DATE, isoDate.format(date));
header.put(WARC_RECORD_ID, "<" + recordId.toString() + ">");
@@ -298,13 +375,13 @@ protected void writeRecord(final String type, final Date date,
}
protected void writeRecord(final String type, final Date date,
- final String contentType, final URI recordId, Map extra,
- final byte[] block) throws IOException {
+ final String contentType, final URI recordId,
+ Multimap extra, final byte[] block) throws IOException {
StringBuilder sb = new StringBuilder(4096);
sb.append(WARC_VERSION).append(CRLF);
- Map header = new LinkedHashMap();
+ Multimap header = LinkedListMultimap.create();
header.put(WARC_TYPE, type);
header.put(WARC_DATE, isoDate.format(date));
header.put(WARC_RECORD_ID, "<" + recordId.toString() + ">");
@@ -359,16 +436,25 @@ protected long copyStream(InputStream input, OutputStream output,
return count;
}
- protected void writeWarcKeyValue(StringBuilder sb,
+ protected static void writeWarcKeyValue(StringBuilder sb,
Map headers) {
if (headers != null) {
- for (Map.Entry entry : headers.entrySet()) {
- sb.append(entry.getKey()).append(COLONSP).append(entry.getValue())
- .append(CRLF);
- }
+ headers.forEach((k, v) -> writeWarcKeyValue(sb, k, v));
+ }
+ }
+
+ protected static void writeWarcKeyValue(StringBuilder sb,
+ Multimap headers) {
+ if (headers != null) {
+ headers.forEach((k, v) -> writeWarcKeyValue(sb, k, v));
}
}
+ protected static void writeWarcKeyValue(StringBuilder sb, String key,
+ String value) {
+ sb.append(key).append(COLONSP).append(value).append(CRLF);
+ }
+
private String getUUID() {
return UUID.randomUUID().toString();
}
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 150f1ad821..7e337f844e 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -167,6 +167,12 @@ public abstract class HttpBase implements Protocol {
*/
protected boolean storeHttpHeaders = false;
+ /**
+ * Record the HTTP and SSL/TLS protocol versions and the SSL/TLS cipher
+ * suites, see property store.protocol.versions
.
+ */
+ protected boolean storeProtocolVersions = false;
+
/** Skip page if Crawl-Delay longer than this value. */
protected long maxCrawlDelay = -1L;
@@ -235,6 +241,7 @@ public void setConf(Configuration conf) {
this.storeIPAddress = conf.getBoolean("store.ip.address", false);
this.storeHttpRequest = conf.getBoolean("store.http.request", false);
this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
+ this.storeProtocolVersions = conf.getBoolean("store.protocol.versions", false);
this.enableIfModifiedsinceHeader = conf
.getBoolean("http.enable.if.modified.since.header", true);
this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header",
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 876c4ef249..954c3f6df1 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -25,6 +25,7 @@
import java.net.SocketAddress;
import java.net.URI;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.security.cert.CertificateException;
import java.util.ArrayList;
import java.util.Base64;
@@ -53,6 +54,7 @@
import okhttp3.Authenticator;
import okhttp3.Connection;
import okhttp3.ConnectionPool;
+import okhttp3.Handshake;
import okhttp3.Headers;
import okhttp3.Interceptor;
import okhttp3.OkHttpClient;
@@ -217,7 +219,8 @@ public Request authenticate(okhttp3.Route route,
builder.addNetworkInterceptor(new HTTPFilterIPAddressInterceptor(ipFilterRules));
}
- if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest) {
+ if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest
+ || this.storeProtocolVersions) {
builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
}
@@ -373,17 +376,31 @@ public okhttp3.Response intercept(Interceptor.Chain chain)
}
if (requestverbatim != null) {
- byte[] encodedBytesRequest = Base64.getEncoder()
- .encode(requestverbatim.toString().getBytes());
+ byte[] encodedBytesRequest = Base64.getEncoder().encode(
+ requestverbatim.toString().getBytes(StandardCharsets.ISO_8859_1));
builder = builder.header(Response.REQUEST,
- new String(encodedBytesRequest));
+ new String(encodedBytesRequest, StandardCharsets.ISO_8859_1));
}
if (responseverbatim != null) {
- byte[] encodedBytesResponse = Base64.getEncoder()
- .encode(responseverbatim.toString().getBytes());
+ byte[] encodedBytesResponse = Base64.getEncoder().encode(
+ responseverbatim.toString().getBytes(StandardCharsets.ISO_8859_1));
builder = builder.header(Response.RESPONSE_HEADERS,
- new String(encodedBytesResponse));
+ new String(encodedBytesResponse, StandardCharsets.ISO_8859_1));
+ }
+
+ // store the HTTP and SSL/TLS protocol versions and SSL/TLS cipher suites
+ if (storeProtocolVersions) {
+ final StringBuilder protocols = new StringBuilder(
+ response.protocol().toString());
+ final Handshake handshake = connection.handshake();
+ if (handshake != null) {
+ protocols.append(',').append(handshake.tlsVersion().javaName());
+ builder = builder.header(Response.CIPHER_SUITES,
+ handshake.cipherSuite().toString());
+ }
+ builder = builder.header(Response.PROTOCOL_VERSIONS,
+ protocols.toString());
}
// returns a modified version of the response