
Add datapackage.json generation in WACZ (#15)
Wesley van Lee authored and leewesleyv committed Oct 28, 2024
1 parent ba6dcaa commit 9ea5f71
Showing 9 changed files with 253 additions and 20 deletions.
13 changes: 13 additions & 0 deletions docs/settings.md
@@ -16,6 +16,19 @@ This is the output path of the WACZ file. Multiple variables can be added that a

Supported variables: `spider`, `year`, `month`, `day` and `timestamp`.

### `SW_WACZ_TITLE`

This setting defines the title of the WACZ used in the `datapackage.json`, which is generated during WACZ creation. It defaults to the spider name if not configured.

### `SW_WACZ_DESCRIPTION`

This setting defines the description of the WACZ used in the `datapackage.json`, which is generated during WACZ creation. If not configured, it defaults to:

> This is the web archive generated by a scrapy-webarchive extension for the
> <spider_name> spider. It is mainly for scraping purposes as it does not contain
> any js/css data. Though it can be replayed as bare HTML if the site does not depend on
> JavaScript.
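
For example, both settings can be overridden in a project's `settings.py`; the values below are illustrative:

```python
# settings.py -- hypothetical values for the WACZ metadata settings
SW_WACZ_TITLE = "Quotes archive"
SW_WACZ_DESCRIPTION = "Nightly crawl of quotes.toscrape.com, HTML only."
```
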
## Downloader middleware and spider middleware

### `SW_WACZ_SOURCE_URI`
1 change: 1 addition & 0 deletions scrapy_webarchive/__init__.py
@@ -0,0 +1 @@
__version__ = "0.0.1.dev2"
13 changes: 10 additions & 3 deletions scrapy_webarchive/extensions.py
@@ -14,7 +14,7 @@
from twisted.internet.defer import Deferred
from typing_extensions import Any, Dict, Protocol, Self, Type, Union, cast

from scrapy_webarchive.utils import get_scheme_from_uri, get_warc_date
from scrapy_webarchive.utils import WARC_DT_FORMAT, get_formatted_dt_string, get_scheme_from_uri
from scrapy_webarchive.wacz import WaczFileCreator
from scrapy_webarchive.warc import WarcFileWriter

@@ -112,7 +112,7 @@ def spider_opened(self) -> None:
self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])

def response_received(self, response: Response, request: Request, spider: Spider) -> None:
request.meta["WARC-Date"] = get_warc_date()
request.meta["WARC-Date"] = get_formatted_dt_string(format=WARC_DT_FORMAT)

# Write response WARC record
record = self.writer.write_response(response, request)
@@ -127,7 +127,14 @@ def response_received(self, response: Response, request: Request, spider: Spider
self.stats.inc_value("webarchive/exporter/request_written", spider=spider)

def spider_closed(self, spider: Spider) -> None:
WaczFileCreator(store=self.store, warc_fname=self.writer.warc_fname, collection_name=spider.name).create()
wacz_creator = WaczFileCreator(
store=self.store,
warc_fname=self.writer.warc_fname,
collection_name=spider.name,
title=self.settings["SW_WACZ_TITLE"],
description=self.settings["SW_WACZ_DESCRIPTION"],
)
wacz_creator.create()


def get_archive_uri_template_dt_variables() -> dict:
31 changes: 25 additions & 6 deletions scrapy_webarchive/utils.py
@@ -1,21 +1,22 @@
from __future__ import annotations

import hashlib
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import IO, Tuple
from urllib.parse import urlparse, urlunparse

from scrapy.settings import Settings

WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"
BUFF_SIZE = 1024 * 64

logger = logging.getLogger(__name__)

def get_current_timestamp() -> str:
return datetime.now(timezone.utc).strftime(TIMESTAMP_DT_FORMAT)


def get_warc_date() -> str:
return datetime.now(timezone.utc).strftime(WARC_DT_FORMAT)
def get_formatted_dt_string(format: str) -> str:
return datetime.now(timezone.utc).strftime(format)


def header_lines_to_dict(lines):
@@ -78,3 +79,21 @@ def add_ftp_credentials(wacz_uri: str, settings: Settings) -> str:
return urlunparse(updated_uri)

return wacz_uri


def hash_stream(hash_type: str, stream: IO) -> Tuple[int, str]:
"""Hashes the stream with given hash_type hasher."""

    # At the moment the `hash_type` (algorithm) we pass is always sha256, as it is hardcoded.
    # This check is in place in case other algorithms are made available in the future.
if hash_type not in hashlib.algorithms_guaranteed:
raise ValueError(f"Unsupported hash type: {hash_type}")

hasher = hashlib.new(hash_type)

size = 0
for chunk in iter(lambda: stream.read(BUFF_SIZE), b""):
size += len(chunk)
hasher.update(chunk)

return size, f"{hash_type}:{hasher.hexdigest()}"
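
A quick sketch of how the new helpers behave, assuming an in-memory stream; the digest is the well-known SHA-256 of `b"Hello world"`:

```python
import io

from scrapy_webarchive.utils import WARC_DT_FORMAT, get_formatted_dt_string, hash_stream

# Current UTC time in WARC-Date format, e.g. "2024-10-28T12:30:00Z".
print(get_formatted_dt_string(format=WARC_DT_FORMAT))

# Hash an in-memory stream; returns (size_in_bytes, "<hash_type>:<hexdigest>").
size, digest = hash_stream("sha256", io.BytesIO(b"Hello world"))
assert size == 11
assert digest == "sha256:64ec88ca00b268e5ba1a35678a1b5316d212f4f366b2477232534a8aeca37f3c"
```
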
94 changes: 91 additions & 3 deletions scrapy_webarchive/wacz.py
@@ -2,38 +2,52 @@

import gzip
import io
import json
import os
import zipfile
from collections import defaultdict
from functools import partial
from typing import Any

from scrapy import __version__ as scrapy_version
from scrapy.settings import Settings
from smart_open import open as smart_open
from typing_extensions import IO, TYPE_CHECKING, Dict, Generator, List, Union
from warc.warc import WARCRecord

from scrapy_webarchive import __version__ as scrapy_webarchive_version
from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index
from scrapy_webarchive.utils import (
TIMESTAMP_DT_FORMAT,
WARC_DT_FORMAT,
add_ftp_credentials,
get_current_timestamp,
get_formatted_dt_string,
get_gcs_client,
get_s3_client,
get_scheme_from_uri,
hash_stream,
)
from scrapy_webarchive.warc import WARCReader

if TYPE_CHECKING:
from scrapy_webarchive.extensions import FilesStoreProtocol


WACZ_VERSION = "1.1.1"

class WaczFileCreator:
"""Handles creating WACZ archives."""

def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, cdxj_fname: str = "index.cdxj") -> None:
hash_type = "sha256"
datapackage_fname = "datapackage.json"

def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, title: str, description: str, cdxj_fname: str = "index.cdxj") -> None:
self.store = store
self.warc_fname = warc_fname
self.cdxj_fname = cdxj_fname
self.collection_name = collection_name
self._title = title
self._description = description

def create(self) -> None:
"""Create the WACZ file from the WARC and CDXJ index and save it in the configured store."""
@@ -59,6 +73,7 @@ def create_wacz_zip(self) -> io.BytesIO:
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
self.write_to_zip(zip_file, self.cdxj_fname, "indexes/")
self.write_to_zip(zip_file, self.warc_fname, "archive/")
self.write_datapackage(zip_file)

return zip_buffer

@@ -77,7 +92,80 @@ def cleanup_files(self, *files: str) -> None:
def get_wacz_fname(self) -> str:
"""Generate WACZ filename based on the WARC filename."""

return f"{self.collection_name}-{get_current_timestamp()}.wacz"
return f"{self.collection_name}-{get_formatted_dt_string(format=TIMESTAMP_DT_FORMAT)}.wacz"

def write_datapackage(self, zip_file: zipfile.ZipFile) -> None:
"""Main function to create and write the datapackage.json."""

package_dict = self.create_package_dict()

with zip_file.open("archive/" + self.warc_fname) as warc_fh:
package_dict = self.update_package_metadata_from_warc(warc_fh, package_dict)

package_dict["resources"] = self.collect_resources(zip_file)

zip_file.writestr(self.datapackage_fname, json.dumps(package_dict, indent=2))

def create_package_dict(self) -> Dict[str, Any]:
"""Creates the initial package dictionary."""

dt_string = get_formatted_dt_string(format=WARC_DT_FORMAT)
return {
"profile": "data-package",
"title": self.title,
"description": self.description,
"created": dt_string,
"modified": dt_string,
"wacz_version": WACZ_VERSION,
"software": f"scrapy-webarchive/{scrapy_webarchive_version}, Scrapy/{scrapy_version}",
}

def update_package_metadata_from_warc(self, warc_fh: IO, package_dict: Dict[str, Any]) -> Dict[str, Any]:
"""Updates the package dictionary with metadata from the WARC records."""

warc_reader = WARCReader(gzip.open(warc_fh)) if self.warc_fname.endswith(".gz") else WARCReader(warc_fh)

while True:
warc_record = warc_reader.read_record()
if warc_record is None:
break

if warc_record.type == "request":
package_dict.update({
"mainPageUrl": warc_record.url,
"mainPageDate": warc_record.date,
})
break

return package_dict

def collect_resources(self, zip_file: zipfile.ZipFile) -> List[Dict[str, Any]]:
"""Collects resource information from the zip file."""

resources = []

for zip_entry in zip_file.infolist():
with zip_file.open(zip_entry, "r") as stream:
size, hash_ = hash_stream(self.hash_type, stream)

resources.append({
"name": os.path.basename(zip_entry.filename).lower(),
"path": zip_entry.filename,
"hash": hash_,
"bytes": size,
})

return resources

@property
def title(self):
return self._title or self.collection_name

@property
def description(self):
return self._description or "This is the web archive generated by a scrapy-webarchive extension for the " \
f"{self.collection_name} spider. It is mainly for scraping purposes as it does not contain " \
"any js/css data. Though it can be replayed as bare HTML if the site does not depend on JavaScript."


class WaczFile:
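For context, the `datapackage.json` that `write_datapackage` emits would look roughly like this; all values are illustrative, with `mainPageUrl`/`mainPageDate` taken from the first request record in the WARC and the `resources` entries computed by `collect_resources`:

```json
{
  "profile": "data-package",
  "title": "quotes",
  "description": "This is the web archive generated by a scrapy-webarchive extension ...",
  "created": "2024-10-28T12:30:00Z",
  "modified": "2024-10-28T12:30:00Z",
  "wacz_version": "1.1.1",
  "software": "scrapy-webarchive/0.0.1.dev2, Scrapy/2.11.2",
  "mainPageUrl": "https://quotes.toscrape.com/",
  "mainPageDate": "2024-10-28T12:30:01Z",
  "resources": [
    {"name": "index.cdxj", "path": "indexes/index.cdxj", "hash": "sha256:...", "bytes": 2048},
    {"name": "quotes-20241028123000-00000-crawler01.warc.gz", "path": "archive/quotes-20241028123000-00000-crawler01.warc.gz", "hash": "sha256:...", "bytes": 524288}
  ]
}
```
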
6 changes: 4 additions & 2 deletions scrapy_webarchive/warc.py
@@ -18,7 +18,7 @@

from scrapy_webarchive.cdxj import CdxjRecord
from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.utils import get_current_timestamp, header_lines_to_dict
from scrapy_webarchive.utils import TIMESTAMP_DT_FORMAT, get_formatted_dt_string, header_lines_to_dict


def generate_warc_fname(prefix: str) -> str:
@@ -28,10 +28,12 @@ def generate_warc_fname(prefix: str) -> str:
{prefix}-{timestamp}-{serial}-{crawlhost}.warc.gz
"""

timestamp = get_formatted_dt_string(format=TIMESTAMP_DT_FORMAT)
crawlhost = socket.gethostname().split(".")[0]
    # As of now we only generate one WARC file. The serial is included to adhere to the WARC specification.
serial = '00000'
return "-".join([prefix, get_current_timestamp(), serial, crawlhost]) + ".warc.gz"

return "-".join([prefix, timestamp, serial, crawlhost]) + ".warc.gz"


class WARCReader(BaseWARCReader):
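A hypothetical example, assuming prefix `"quotes"` on host `crawler01` at 2024-10-28 12:30:00 UTC:

```python
from scrapy_webarchive.warc import generate_warc_fname

# Would return: "quotes-20241028123000-00000-crawler01.warc.gz"
fname = generate_warc_fname(prefix="quotes")
```
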
17 changes: 17 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,17 @@
import pytest


@pytest.fixture
def warc_example():
return b"\
WARC/1.0\r\n\
Content-Length: 10\r\n\
WARC-Date: 2024-02-10T16:15:52Z\r\n\
Content-Type: application/http; msgtype=request\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:80fb9262-5402-11e1-8206-545200690126>\r\n\
WARC-Target-URI: http://example.com/\r\n\
\r\n\
Helloworld\
\r\n\r\n\
"
53 changes: 53 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,53 @@
import hashlib
import io

import pytest

from scrapy_webarchive.utils import BUFF_SIZE, hash_stream


def test_hash_stream_with_empty_stream():
# Test with an empty stream
data = b""
stream = io.BytesIO(data)
size, result = hash_stream("sha256", stream)

assert size == 0
assert result == f"sha256:{hashlib.sha256(data).hexdigest()}"

def test_hash_stream_with_md5_algorithm():
data = b"Hello world"
expected_hash = hashlib.md5(data).hexdigest()

stream = io.BytesIO(data)
size, result = hash_stream("md5", stream)

assert size == len(data)
assert result == f"md5:{expected_hash}"

def test_hash_stream_with_sha256_algorithm():
data = b"Hello world"
expected_hash = hashlib.sha256(data).hexdigest()

stream = io.BytesIO(data)
size, result = hash_stream("sha256", stream)

assert size == len(data)
assert result == f"sha256:{expected_hash}"

def test_hash_stream_with_unsupported_hash_type():
data = b"Hello world"
stream = io.BytesIO(data)

with pytest.raises(ValueError):
hash_stream("unsupported_hash", stream)

def test_hash_stream_with_large_stream():
data = b"a" * (2 * BUFF_SIZE) # Twice the buffer size
expected_hash = hashlib.sha256(data).hexdigest()

stream = io.BytesIO(data)
size, result = hash_stream("sha256", stream)

assert size == len(data)
assert result == f"sha256:{expected_hash}"
