Skip to content

Commit

Permalink
Write redirect request/response to WARC
Browse files Browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Nov 5, 2024
1 parent aa7c701 commit e781fd2
Showing 1 changed file with 17 additions and 0 deletions.
17 changes: 17 additions & 0 deletions scrapy_webarchive/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def from_crawler(cls, crawler: Crawler) -> Self:
exporter = cls(crawler.settings, crawler)

crawler.signals.connect(exporter.response_received, signal=signals.response_received)
crawler.signals.connect(exporter.response_downloaded, signal=signals.response_downloaded)
crawler.signals.connect(exporter.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(exporter.spider_opened, signal=signals.spider_opened)
return exporter
Expand Down Expand Up @@ -143,6 +144,22 @@ def from_settings(cls, settings: Settings, crawler: Crawler):
def spider_opened(self) -> None:
    """Write the warcinfo record when the spider opens.

    The ``ROBOTSTXT_OBEY`` setting is read from ``self.settings`` and handed
    to the writer as the ``robotstxt_obey`` keyword argument.
    """
    emit_warcinfo = self.writer.write_warcinfo
    obey_robots = self.settings["ROBOTSTXT_OBEY"]
    emit_warcinfo(robotstxt_obey=obey_robots)

def response_downloaded(self, response: Response, request: Request, spider: Spider) -> None:
    """Archive a redirect (3xx) response and its request as WARC records.

    Responses whose status lies outside 300-399 are ignored by this handler.
    For a redirect, the request is stamped with a ``WARC-Date`` meta value,
    the response record is written, and the request record is then written
    with that response record passed as ``concurrent_to``. A stats counter
    is incremented for each record written, plus one keyed by the status
    code reported by the written response record.
    """
    is_redirect = 300 <= response.status < 400
    if not is_redirect:
        return

    # Stamp the request before either record is written.
    request.meta["WARC-Date"] = get_formatted_dt_string(format=WARC_DT_FORMAT)

    # Response record goes first; its handle is needed for the request record.
    response_record = self.writer.write_response(response, request)
    self.stats.inc_value("webarchive/exporter/response_written", spider=spider)
    status_code = response_record.http_headers.get_statuscode()
    self.stats.inc_value(
        f"webarchive/exporter/writer_status_count/{status_code}",
        spider=spider,
    )

    # Request record, tied to the response record via ``concurrent_to``.
    self.writer.write_request(request, concurrent_to=response_record)
    self.stats.inc_value("webarchive/exporter/request_written", spider=spider)

def response_received(self, response: Response, request: Request, spider: Spider) -> None:
request.meta["WARC-Date"] = get_formatted_dt_string(format=WARC_DT_FORMAT)

Expand Down

0 comments on commit e781fd2

Please sign in to comment.