From 518b2c11cac28c12bc3bb6a7edc0b580a2416eab Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 9 Jul 2024 17:26:14 +0200 Subject: [PATCH 1/4] NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions Adds property store.protocol.versions - if true, protocol-okhttp records protocol versions and related in response metadata. --- conf/nutch-default.xml | 10 +++++++ .../apache/nutch/net/protocols/Response.java | 12 ++++++++ .../nutch/protocol/http/api/HttpBase.java | 7 +++++ .../apache/nutch/protocol/okhttp/OkHttp.java | 28 +++++++++++++++---- 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 4b5febe7cf..bdef854fd9 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2885,6 +2885,16 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this + + store.protocol.versions + false + + Store protocol versions in response metadata: HTTP and SSL/TLS + versions, SSL/TLS cipher suites and related information depending + on the protocol implementation. Supported by: protocol-okhttp. + + + diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 514ce85613..3fbe932667 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -41,6 +41,18 @@ public interface Response extends HttpHeaders { */ public static final String IP_ADDRESS = "_ip_"; + /** + * Key to hold the HTTP and SSL/TLS protocol versions if + * store.protocol.versions is true. + */ + public static final String PROTOCOL_VERSIONS = "_protocol_versions_"; + + /** + * Key to hold the SSL/TLS cipher suites if + * store.protocol.versions is true. 
+ */ + public static final String CIPHER_SUITES = "_cipher_suites_"; + /** * Key to hold the time when the page has been fetched */ diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 150f1ad821..7e337f844e 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -167,6 +167,12 @@ public abstract class HttpBase implements Protocol { */ protected boolean storeHttpHeaders = false; + /** + * Record the HTTP and SSL/TLS protocol versions and the SSL/TLS cipher + * suites, see property store.protocol.versions. + */ + protected boolean storeProtocolVersions = false; + /** Skip page if Crawl-Delay longer than this value. */ protected long maxCrawlDelay = -1L; @@ -235,6 +241,7 @@ public void setConf(Configuration conf) { this.storeIPAddress = conf.getBoolean("store.ip.address", false); this.storeHttpRequest = conf.getBoolean("store.http.request", false); this.storeHttpHeaders = conf.getBoolean("store.http.headers", false); + this.storeProtocolVersions = conf.getBoolean("store.protocol.versions", false); this.enableIfModifiedsinceHeader = conf .getBoolean("http.enable.if.modified.since.header", true); this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index 876c4ef249..2fe2329776 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -25,6 +25,7 @@ import java.net.SocketAddress; import java.net.URI; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.security.cert.CertificateException; 
import java.util.ArrayList; import java.util.Base64; @@ -53,6 +54,7 @@ import okhttp3.Authenticator; import okhttp3.Connection; import okhttp3.ConnectionPool; +import okhttp3.Handshake; import okhttp3.Headers; import okhttp3.Interceptor; import okhttp3.OkHttpClient; @@ -373,17 +375,31 @@ public okhttp3.Response intercept(Interceptor.Chain chain) } if (requestverbatim != null) { - byte[] encodedBytesRequest = Base64.getEncoder() - .encode(requestverbatim.toString().getBytes()); + byte[] encodedBytesRequest = Base64.getEncoder().encode( + requestverbatim.toString().getBytes(StandardCharsets.ISO_8859_1)); builder = builder.header(Response.REQUEST, - new String(encodedBytesRequest)); + new String(encodedBytesRequest, StandardCharsets.ISO_8859_1)); } if (responseverbatim != null) { - byte[] encodedBytesResponse = Base64.getEncoder() - .encode(responseverbatim.toString().getBytes()); + byte[] encodedBytesResponse = Base64.getEncoder().encode( + responseverbatim.toString().getBytes(StandardCharsets.ISO_8859_1)); builder = builder.header(Response.RESPONSE_HEADERS, - new String(encodedBytesResponse)); + new String(encodedBytesResponse, StandardCharsets.ISO_8859_1)); + } + + // store the HTTP and SSL/TLS protocol versions and SSL/TLS cipher suites + if (storeProtocolVersions) { + final StringBuilder protocols = new StringBuilder( + response.protocol().toString()); + final Handshake handshake = connection.handshake(); + if (handshake != null) { + protocols.append(',').append(handshake.tlsVersion().javaName()); + builder = builder.header(Response.CIPHER_SUITES, + handshake.cipherSuite().toString()); + } + builder = builder.header(Response.PROTOCOL_VERSIONS, + protocols.toString()); } // returns a modified version of the response From fc0d538f4b128be6a49d0665ed93f5e6a2ab5baa Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 17 Jul 2024 15:56:14 +0200 Subject: [PATCH 2/4] NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions Activate 
HTTPHeadersInterceptor also if only store.protocol.versions is true, but no other headers and connection information is intercepted. --- .../src/java/org/apache/nutch/protocol/okhttp/OkHttp.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index 2fe2329776..954c3f6df1 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -219,7 +219,8 @@ public Request authenticate(okhttp3.Route route, builder.addNetworkInterceptor(new HTTPFilterIPAddressInterceptor(ipFilterRules)); } - if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest) { + if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest + || this.storeProtocolVersions) { builder.addNetworkInterceptor(new HTTPHeadersInterceptor()); } From 39d731aa02449d00e10c53a93544622c3099c347 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 9 Jul 2024 18:26:25 +0200 Subject: [PATCH 3/4] Fetcher to count protocol versions In combination with NUTCH-3062, if store.protocol.versions is true: adds two counters "HttpProtocolVersion" and "TlsProtocolVersion" counting the values stored by protocol-okhttp in the protocol versions field of content metadata. Counted values are the protocol versions. 
--- .../apache/nutch/fetcher/FetcherThread.java | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 3fb083ace1..2d86ddfe64 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -45,6 +45,7 @@ import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.net.protocols.ProtocolLogUtil; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; @@ -136,6 +137,7 @@ public class FetcherThread extends Thread { private boolean storingContent; private boolean storingWarc; private boolean storing404s; + private boolean storingProtocolVersions; private boolean signatureWithoutParsing; @@ -195,6 +197,8 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ this.storingContent = storingContent; this.storing404s = conf.getBoolean("fetcher.store.404s", false); this.storingWarc = Fetcher.isStoringWarc(conf); + this.storingProtocolVersions = conf.getBoolean("store.protocol.versions", + false); this.pages = pages; this.bytes = bytes; @@ -428,6 +432,10 @@ public void run() { } context.getCounter("FetcherStatus", status.getName()).increment(1); + if (storingProtocolVersions && content != null) { + countProtocolVersions(content.getMetadata()); + } + switch (status.getCode()) { case ProtocolStatus.WOULDBLOCK: @@ -684,6 +692,24 @@ private void logError(Text url, String message) { errors.incrementAndGet(); } + private void countProtocolVersions(Metadata contentMetadata) { + if (contentMetadata == null) { + return; + } + String versionStr = contentMetadata.get(Response.PROTOCOL_VERSIONS); + if (versionStr != null) { + String[] versions = versionStr.split(","); + if (versions.length >= 1) { + 
context.getCounter("HttpProtocolVersion", versions[0]).increment(1); + } else { + context.getCounter("HttpProtocolVersion", "unknown").increment(1); + } + for (int i = 1; i < versions.length; i++) { + context.getCounter("TlsProtocolVersion", versions[i]).increment(1); + } + } + } + private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status) throws InterruptedException{ return output(key, datum, content, pstatus, status, 0); From 5f4369298cbedf85572514d5f97a346935e338f0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 9 Jul 2024 22:35:42 +0200 Subject: [PATCH 4/4] WARC writer support HTTP/2 - HTTP headers: replace HTTP/2 and alike by HTTP/1.1 to ensure backward-compatibility for WARC readers, see https://github.com/iipc/warc-specifications/issues/15 - store protocol versions and cipher suites in WARC headers WARC-Protocol and WARC-Cipher-Suite, see https://github.com/iipc/warc-specifications/issues/42 https://github.com/iipc/warc-specifications/issues/86 - allow multiple WARC headers of the same name (WARC-Protocol may occur twice to hold the HTTP and TLS version) --- .../org/commoncrawl/util/WarcCdxWriter.java | 11 +- .../commoncrawl/util/WarcRecordWriter.java | 109 ++++++++++++-- src/java/org/commoncrawl/util/WarcWriter.java | 134 ++++++++++++++---- 3 files changed, 210 insertions(+), 44 deletions(-) diff --git a/src/java/org/commoncrawl/util/WarcCdxWriter.java b/src/java/org/commoncrawl/util/WarcCdxWriter.java index 5032d4a3ae..305aa99826 100644 --- a/src/java/org/commoncrawl/util/WarcCdxWriter.java +++ b/src/java/org/commoncrawl/util/WarcCdxWriter.java @@ -98,12 +98,13 @@ public WarcCdxWriter(OutputStream warcOut, OutputStream cdxOut, public URI writeWarcRevisitRecord(final URI targetUri, final String ip, final int httpStatusCode, final Date date, final URI warcinfoId, final URI relatedId, final String warcProfile, final Date refersToDate, - final String payloadDigest, final String blockDigest, byte[] 
block, + final String payloadDigest, final String blockDigest, + String[] protocolVersions, String[] cipherSuites, byte[] block, Content content) throws IOException { long offset = countingOut.getByteCount(); URI recordId = super.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest, - blockDigest, block, content); + blockDigest, protocolVersions, cipherSuites, block, content); long length = (countingOut.getByteCount() - offset); writeCdxLine(targetUri, date, offset, length, payloadDigest, content, true, null, null); @@ -114,12 +115,12 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip, public URI writeWarcResponseRecord(final URI targetUri, final String ip, final int httpStatusCode, final Date date, final URI warcinfoId, final URI relatedId, final String payloadDigest, final String blockDigest, - final String truncated, final byte[] block, Content content) - throws IOException { + final String truncated, String[] protocolVersions, String[] cipherSuites, + final byte[] block, Content content) throws IOException { long offset = countingOut.getByteCount(); URI recordId = super.writeWarcResponseRecord(targetUri, ip, httpStatusCode, date, warcinfoId, relatedId, payloadDigest, blockDigest, truncated, - block, content); + protocolVersions, cipherSuites, block, content); long length = (countingOut.getByteCount() - offset); String redirectLocation = null; if (isRedirect(httpStatusCode)) { diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java index 4585dddbc0..56820fe6b2 100644 --- a/src/java/org/commoncrawl/util/WarcRecordWriter.java +++ b/src/java/org/commoncrawl/util/WarcRecordWriter.java @@ -76,6 +76,15 @@ class WarcRecordWriter extends RecordWriter { protected static final String X_HIDE_HEADER = "X-Crawler-"; public static final String WARC_WRITER_COUNTER_GROUP = "WARC-Writer"; + protected static final Pattern 
STATUS_LINE_PATTERN = Pattern + .compile("^HTTP/1\\.[01] [0-9]{3}(?: .*)?$"); + protected static final Pattern WS_PATTERN = Pattern.compile("\\s+"); + protected static final Pattern HTTP_VERSION_PATTERN = Pattern + .compile("^HTTP/1\\.[01]$"); + protected static final Pattern HTTP_STATUS_CODE_PATTERN = Pattern + .compile("^[0-9]{3}$"); + protected static final String HTTP_VERSION_FALLBACK = "HTTP/1.1"; + private TaskAttemptContext context; private DataOutputStream warcOut; private WarcWriter warcWriter; @@ -291,20 +300,54 @@ public static String formatHttpHeaders(String statusLine, List headers) } /** + * Fix the HTTP version in the status line - replace HTTP/2 + * by HTTP/1.1 ({@link #HTTP_VERSION_FALLBACK}). + * + * See also {@link #fixHttpHeaders(String, int)} + * + * @param headers + * HTTP 1.1 or 1.0 request header string, CR-LF-separated lines, + * first line is the status line + * @return safe HTTP request header + */ + public static String fixHttpRequestHeaders(String headers) { + String http2version = " HTTP/2\r\n"; + int pos = headers.indexOf(http2version); + if (pos >= 0) { + StringBuilder replacement = new StringBuilder(); + String statusLinePrefix = headers.substring(0, pos); + if (statusLinePrefix.indexOf(CRLF) > 0) { + // match in subsequent header lines (should not or rarely happen) + return headers; + } + replacement.append(statusLinePrefix); + replacement.append(' '); + replacement.append(HTTP_VERSION_FALLBACK); + replacement.append(CRLF); + replacement.append(headers.substring(pos + http2version.length())); + return replacement.toString(); + } + return headers; + } + + /** * Modify verbatim HTTP response headers: fix, remove or replace headers * Content-Length, Content-Encoding and * Transfer-Encoding which may confuse WARC readers. Ensure that * returned header end with a single empty line (\r\n\r\n). * + * If the HTTP version in the status line is HTTP/2, replace it + * by HTTP/1.1 ({@link #HTTP_VERSION_FALLBACK}). 
+ * * @param headers * HTTP 1.1 or 1.0 response header string, CR-LF-separated lines, - * first line is status line + * first line is the status line * @return safe HTTP response header */ public static String fixHttpHeaders(String headers, int contentLength) { int start = 0, lineEnd = 0, last = 0, trailingCrLf= 0; boolean hasContentLength = false; - StringBuilder replace = new StringBuilder(); + StringBuilder replacement = new StringBuilder(); while (start < headers.length()) { lineEnd = headers.indexOf(CRLF, start); trailingCrLf = 1; @@ -323,7 +366,32 @@ public static String fixHttpHeaders(String headers, int contentLength) { boolean valid = true; if (start == 0) { // status line (without colon) - // TODO: http/2 + final String statusLine = headers.substring(0, lineEnd); + if (!STATUS_LINE_PATTERN.matcher(statusLine).matches()) { + final String[] parts = WS_PATTERN + .split(headers.substring(0, lineEnd), 3); + if (parts.length < 2 + || !HTTP_STATUS_CODE_PATTERN.matcher(parts[1]).matches()) { + // nothing we can do here, leave status line as is + LOG.warn( + "WARC parsers may fail on non-standard HTTP 1.0 / 1.1 response status line: {}", + statusLine); + } else { + if (HTTP_VERSION_PATTERN.matcher(parts[0]).matches()) { + replacement.append(parts[0]); + } else { + replacement.append(HTTP_VERSION_FALLBACK); + } + replacement.append(' '); + replacement.append(parts[1]); // status code + replacement.append(' '); + if (parts.length == 3) { + replacement.append(parts[2]); // message + } + replacement.append(CRLF); + last = lineEnd + 2 * trailingCrLf; + } + } } else if ((lineEnd + 4) == headers.length() && headers.endsWith(CRLF + CRLF)) { // ok, trailing empty line @@ -339,7 +407,7 @@ public static String fixHttpHeaders(String headers, int contentLength) { } if (!valid) { if (last < start) { - replace.append(headers.substring(last, start)); + replacement.append(headers.substring(last, start)); } last = lineEnd + 2 * trailingCrLf; } @@ -367,18 +435,18 @@ public static 
String fixHttpHeaders(String headers, int contentLength) { } if (needsFix) { if (last < start) { - replace.append(headers.substring(last, start)); + replacement.append(headers.substring(last, start)); } last = lineEnd + 2 * trailingCrLf; - replace.append(X_HIDE_HEADER) + replacement.append(X_HIDE_HEADER) .append(headers.substring(start, lineEnd + 2 * trailingCrLf)); if (trailingCrLf == 0) { - replace.append(CRLF); + replacement.append(CRLF); trailingCrLf = 1; } if (name.equalsIgnoreCase("content-length")) { // add effective uncompressed and unchunked length of content - replace.append("Content-Length").append(COLONSP) + replacement.append("Content-Length").append(COLONSP) .append(contentLength).append(CRLF); } } @@ -388,17 +456,17 @@ public static String fixHttpHeaders(String headers, int contentLength) { if (last > 0 || trailingCrLf != 2 || !hasContentLength) { if (last < headers.length()) { // append trailing headers - replace.append(headers.substring(last)); + replacement.append(headers.substring(last)); } if (!hasContentLength) { - replace.append("Content-Length").append(COLONSP).append(contentLength) + replacement.append("Content-Length").append(COLONSP).append(contentLength) .append(CRLF); } while (trailingCrLf < 2) { - replace.append(CRLF); + replacement.append(CRLF); trailingCrLf++; } - return replace.toString(); + return replacement.toString(); } return headers; } @@ -558,6 +626,8 @@ public synchronized void write(Text key, WarcCapture value) int httpStatusCode = 200; String fetchDuration = null; String truncatedReason = null; + String[] protocolVersions = null; + String[] cipherSuites = null; if (value.datum != null) { date = new Date(value.datum.getFetchTime()); @@ -666,6 +736,12 @@ public synchronized void write(Text key, WarcCapture value) case Response.TRUNCATED_CONTENT_REASON: truncatedReason = val; break; + case Response.PROTOCOL_VERSIONS: + protocolVersions = val.split(","); + break; + case Response.CIPHER_SUITES: + cipherSuites = val.split(","); + 
break; case Nutch.SEGMENT_NAME_KEY: case Nutch.FETCH_STATUS_KEY: case Nutch.SCORE_KEY: @@ -739,7 +815,9 @@ public synchronized void write(Text key, WarcCapture value) URI requestId = null; if (verbatimRequestHeaders != null) { requestId = writer.writeWarcRequestRecord(targetUri, ip, date, infoId, - verbatimRequestHeaders.getBytes(StandardCharsets.UTF_8)); + protocolVersions, cipherSuites, + fixHttpRequestHeaders(verbatimRequestHeaders) + .getBytes(StandardCharsets.UTF_8)); } if (generateCdx) { @@ -804,7 +882,8 @@ public synchronized void write(Text key, WarcCapture value) String payloadDigest = null; writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, infoId, requestId, WarcWriter.PROFILE_REVISIT_NOT_MODIFIED, lastModifiedDate, - payloadDigest, blockDigest, responseHeaderBytes, value.content); + payloadDigest, blockDigest, protocolVersions, cipherSuites, + responseHeaderBytes, value.content); } else { StringBuilder responsesb = new StringBuilder(4096); responsesb.append(responseHeaders); @@ -822,7 +901,7 @@ public synchronized void write(Text key, WarcCapture value) String blockDigest = getSha1DigestWithAlg(responseBytes); URI responseId = writer.writeWarcResponseRecord(targetUri, ip, httpStatusCode, date, infoId, requestId, payloadDigest, blockDigest, - truncatedReason, responseBytes, value.content); + truncatedReason, protocolVersions, cipherSuites, responseBytes, value.content); // Write metadata record StringBuilder metadatasb = new StringBuilder(4096); diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java index 0c866d76fe..f59a585710 100644 --- a/src/java/org/commoncrawl/util/WarcWriter.java +++ b/src/java/org/commoncrawl/util/WarcWriter.java @@ -20,22 +20,33 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.lang.invoke.MethodHandles; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import 
java.text.SimpleDateFormat; import java.util.Date; -import java.util.LinkedHashMap; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.TimeZone; import java.util.UUID; +import java.util.regex.Pattern; import java.util.zip.GZIPOutputStream; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.protocol.Content; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.LinkedListMultimap; +import com.google.common.collect.Multimap; public class WarcWriter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + protected OutputStream out = null; protected OutputStream origOut = null; @@ -68,6 +79,10 @@ public class WarcWriter { private static final String WARC_IDENTIFIED_PAYLOAD_TYPE = "WARC-Identified-Payload-Type"; private static final String WARC_PROFILE = "WARC-Profile"; private static final String WARC_FILENAME = "WARC-Filename"; + /** WARC-Protocol, see https://github.com/iipc/warc-specifications/issues/42 */ + private static final String WARC_PROTOCOL = "WARC-Protocol"; + /** WARC-Cipher-Suite, see https://github.com/iipc/warc-specifications/issues/94 */ + private static final String WARC_CIPHER_SUITE = "WARC-Cipher-Suite"; public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = "http://netpreserve.org/warc/1.1/revisit/identical-payload-digest"; public static final String PROFILE_REVISIT_NOT_MODIFIED = "http://netpreserve.org/warc/1.1/revisit/server-not-modified"; @@ -98,6 +113,36 @@ public WarcWriter(final OutputStream out) { isoDate.setTimeZone(TimeZone.getTimeZone("GMT")); } + /** + * Class to hold HTTP and SSL/TLS protocol versions to fill the + * WARC-Protocol field. 
Protocol names require normalization, see + * https://github.com/iipc/warc-specifications/issues/42 + */ + public static class WarcProtocol { + public static Set protocols = Set.of("dns", "ftp", "gemini", + "gopher", "http/0.9", "http/1.0", "http/1.1", "h2", "h2c", "spdy/1", + "spdy/2", "spdy/3", "ssl/2", "ssl/3", "tls/1.0", "tls/1.1", "tls/1.2", + "tls/1.3"); + public static Pattern vPattern = Pattern.compile("^(?:ssl|tls)v[0-9]", + Pattern.CASE_INSENSITIVE); + private String name; + + public WarcProtocol(final String protocol) { + name = protocol.toLowerCase(Locale.ROOT); + if (vPattern.matcher(name).find()) { + name = name.substring(0, 3) + '/' + name.substring(4); + } + if (!protocols.contains(name)) { + LOG.warn("Unknown protocol name or version: {}", name); + } + } + + @Override + public String toString() { + return name; + } + } + /** * * @return record id for the warcinfo record @@ -107,11 +152,11 @@ public URI writeWarcinfoRecord(String filename, String hostname, String publisher, String operator, String software, String isPartOf, String description, Date date) throws IOException { - Map extra = new LinkedHashMap(); + Multimap extra = LinkedListMultimap.create(); extra.put(WARC_FILENAME, filename); StringBuilder sb = new StringBuilder(); - Map settings = new LinkedHashMap(); + Multimap settings = LinkedListMultimap.create(); if (isPartOf != null) { settings.put("isPartOf", isPartOf); @@ -157,12 +202,22 @@ public URI writeWarcinfoRecord(String filename, String hostname, } public URI writeWarcRequestRecord(final URI targetUri, final String ip, - final Date date, final URI warcinfoId, final byte[] block) - throws IOException { - Map extra = new LinkedHashMap(); + final Date date, final URI warcinfoId, String[] protocolVersions, + String[] cipherSuites, final byte[] block) throws IOException { + Multimap extra = LinkedListMultimap.create(); extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); extra.put(WARC_IP_ADDRESS, ip); 
extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + if (protocolVersions != null) { + for (String pVersion : protocolVersions) { + extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString()); + } + } + if (cipherSuites != null) { + for (String cipher : cipherSuites) { + extra.put(WARC_CIPHER_SUITE, cipher); + } + } URI recordId = getRecordId(); writeRecord(WARC_REQUEST, date, "application/http; msgtype=request", @@ -173,15 +228,25 @@ public URI writeWarcRequestRecord(final URI targetUri, final String ip, public URI writeWarcResponseRecord(final URI targetUri, final String ip, final int httpStatusCode, final Date date, final URI warcinfoId, final URI relatedId, final String payloadDigest, final String blockDigest, - final String truncated, final byte[] block, Content content) - throws IOException { - Map extra = new LinkedHashMap(); + final String truncated, String[] protocolVersions, String[] cipherSuites, + final byte[] block, Content content) throws IOException { + Multimap extra = LinkedListMultimap.create(); extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); if (relatedId != null) { extra.put(WARC_CONCURRENT_TO, "<" + relatedId.toString() + ">"); } extra.put(WARC_IP_ADDRESS, ip); extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + if (protocolVersions != null) { + for (String pVersion : protocolVersions) { + extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString()); + } + } + if (cipherSuites != null) { + for (String cipher : cipherSuites) { + extra.put(WARC_CIPHER_SUITE, cipher); + } + } if (payloadDigest != null) { extra.put(WARC_PAYLOAD_DIGEST, payloadDigest); @@ -206,13 +271,24 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip, public URI writeWarcRevisitRecord(final URI targetUri, final String ip, final int httpStatusCode, final Date date, final URI warcinfoId, final URI relatedId, final String warcProfile, final Date refersToDate, - final String payloadDigest, final String blockDigest, byte[] 
block, + final String payloadDigest, final String blockDigest, + String[] protocolVersions, String[] cipherSuites, byte[] block, Content content) throws IOException { - Map extra = new LinkedHashMap(); + Multimap extra = LinkedListMultimap.create(); extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); extra.put(WARC_REFERS_TO, "<" + relatedId.toString() + ">"); extra.put(WARC_IP_ADDRESS, ip); extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + if (protocolVersions != null) { + for (String pVersion : protocolVersions) { + extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString()); + } + } + if (cipherSuites != null) { + for (String cipher : cipherSuites) { + extra.put(WARC_CIPHER_SUITE, cipher); + } + } // WARC-Refers-To-Target-URI only useful for revisit by digest extra.put(WARC_REFERS_TO_TARGET_URI, targetUri.toASCIIString()); if (refersToDate != null) { @@ -235,7 +311,7 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip, public URI writeWarcMetadataRecord(final URI targetUri, final Date date, final URI warcinfoId, final URI relatedId, final String blockDigest, final byte[] block) throws IOException { - Map extra = new LinkedHashMap(); + Multimap extra = LinkedListMultimap.create(); extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); extra.put(WARC_CONCURRENT_TO, "<" + relatedId.toString() + ">"); extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); @@ -253,7 +329,7 @@ public URI writeWarcMetadataRecord(final URI targetUri, final Date date, public URI writeWarcConversionRecord(final URI targetUri, final Date date, final URI warcinfoId, final URI relatedId, final String blockDigest, final String contentType, final byte[] block) throws IOException { - Map extra = new LinkedHashMap(); + Multimap extra = LinkedListMultimap.create(); extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); extra.put(WARC_REFERS_TO, "<" + relatedId.toString() + ">"); extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); 
@@ -268,13 +344,14 @@ public URI writeWarcConversionRecord(final URI targetUri, final Date date, } protected void writeRecord(final String type, final Date date, - final String contentType, final URI recordId, Map extra, - final InputStream content, final long contentLength) throws IOException { + final String contentType, final URI recordId, + Multimap extra, final InputStream content, + final long contentLength) throws IOException { StringBuilder sb = new StringBuilder(4096); sb.append(WARC_VERSION).append(CRLF); - Map header = new LinkedHashMap(); + Multimap header = LinkedListMultimap.create(); header.put(WARC_TYPE, type); header.put(WARC_DATE, isoDate.format(date)); header.put(WARC_RECORD_ID, "<" + recordId.toString() + ">"); @@ -298,13 +375,13 @@ protected void writeRecord(final String type, final Date date, } protected void writeRecord(final String type, final Date date, - final String contentType, final URI recordId, Map extra, - final byte[] block) throws IOException { + final String contentType, final URI recordId, + Multimap extra, final byte[] block) throws IOException { StringBuilder sb = new StringBuilder(4096); sb.append(WARC_VERSION).append(CRLF); - Map header = new LinkedHashMap(); + Multimap header = LinkedListMultimap.create(); header.put(WARC_TYPE, type); header.put(WARC_DATE, isoDate.format(date)); header.put(WARC_RECORD_ID, "<" + recordId.toString() + ">"); @@ -359,16 +436,25 @@ protected long copyStream(InputStream input, OutputStream output, return count; } - protected void writeWarcKeyValue(StringBuilder sb, + protected static void writeWarcKeyValue(StringBuilder sb, Map headers) { if (headers != null) { - for (Map.Entry entry : headers.entrySet()) { - sb.append(entry.getKey()).append(COLONSP).append(entry.getValue()) - .append(CRLF); - } + headers.forEach((k, v) -> writeWarcKeyValue(sb, k, v)); + } + } + + protected static void writeWarcKeyValue(StringBuilder sb, + Multimap headers) { + if (headers != null) { + headers.forEach((k, v) -> 
writeWarcKeyValue(sb, k, v)); } } + protected static void writeWarcKeyValue(StringBuilder sb, String key, + String value) { + sb.append(key).append(COLONSP).append(value).append(CRLF); + } + private String getUUID() { return UUID.randomUUID().toString(); }