diff --git a/scrapy_webarchive/extensions.py b/scrapy_webarchive/extensions.py
index fcb9014..0f0595c 100644
--- a/scrapy_webarchive/extensions.py
+++ b/scrapy_webarchive/extensions.py
@@ -108,6 +108,7 @@ def from_crawler(cls, crawler: Crawler) -> Self:
         exporter = cls(crawler.settings, crawler)
 
         crawler.signals.connect(exporter.response_received, signal=signals.response_received)
+        crawler.signals.connect(exporter.response_downloaded, signal=signals.response_downloaded)
         crawler.signals.connect(exporter.spider_closed, signal=signals.spider_closed)
         crawler.signals.connect(exporter.spider_opened, signal=signals.spider_opened)
         return exporter
@@ -143,6 +144,29 @@ def from_settings(cls, settings: Settings, crawler: Crawler):
     def spider_opened(self) -> None:
         self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])
 
+    def response_downloaded(self, response: Response, request: Request, spider: Spider) -> None:
+        """Archive 3xx responses as soon as they are downloaded.
+
+        Stamps ``request.meta["WARC-Date"]``, writes the response record followed
+        by its request record, and bumps the matching stats counters.
+        """
+        # NOTE(review): nesting reconstructed from a whitespace-collapsed diff; the
+        # whole body is assumed to sit under this status check - confirm.
+        if 400 > response.status >= 300:
+            request.meta["WARC-Date"] = get_formatted_dt_string(format=WARC_DT_FORMAT)
+
+            # Write response WARC record
+            record = self.writer.write_response(response, request)
+            self.stats.inc_value("webarchive/exporter/response_written", spider=spider)
+            self.stats.inc_value(
+                f"webarchive/exporter/writer_status_count/{record.http_headers.get_statuscode()}",
+                spider=spider,
+            )
+
+            # Write request WARC record
+            self.writer.write_request(request, concurrent_to=record)
+            self.stats.inc_value("webarchive/exporter/request_written", spider=spider)
+
     def response_received(self, response: Response, request: Request, spider: Spider) -> None:
         request.meta["WARC-Date"] = get_formatted_dt_string(format=WARC_DT_FORMAT)
 