Commit a22c34f
Use Scrapy settings to configure client for opening WACZ file when using cloud provider
Wesley van Lee committed Oct 16, 2024
1 parent 89fecc2 commit a22c34f
Showing 11 changed files with 140 additions and 49 deletions.
8 changes: 8 additions & 0 deletions docs/installation.md
@@ -5,3 +5,11 @@ To install `scrapy-webarchive`, run:
```bash
pip install scrapy-webarchive
```

If you want to use a cloud provider for storing or scraping archives, you can opt in to install the extra dependencies:

```bash
pip install scrapy-webarchive[aws]
pip install scrapy-webarchive[gcs]
pip install scrapy-webarchive[all]
```
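These extras are consumed by the client helpers this commit adds in `scrapy_webarchive/utils.py`, which read their configuration from the Scrapy settings. A minimal sketch of the settings involved, with placeholder values (the keys match those read by `get_s3_client` and `get_gcs_client`; the bucket and project names are illustrative):

```python
# settings.py -- placeholder values, for illustration only
AWS_ACCESS_KEY_ID = "AKIA..."
AWS_SECRET_ACCESS_KEY = "..."
GCS_PROJECT_ID = "my-project"  # hypothetical project id
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
```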
8 changes: 4 additions & 4 deletions docs/settings.md
@@ -17,13 +17,13 @@ Supported variables: `year`, `month`, `day` and `timestamp`.

## Downloader middleware and spider middleware

-### `SW_WACZ_SOURCE_URL`
+### `SW_WACZ_SOURCE_URI`

```python
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"

# Allows multiple sources, comma separated.
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
```

This setting defines the location of the WACZ file that should be used as a source for the crawl job.
@@ -42,4 +42,4 @@ Setting to ignore original `start_requests`, just yield all responses found.
SW_WACZ_TIMEOUT = 60
```

-Transport parameter for retrieving the `SW_WACZ_SOURCE_URL` from the defined location.
+Transport parameter for retrieving the `SW_WACZ_SOURCE_URI` from the defined location.
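As of this commit the timeout is forwarded to `smart_open` as a transport parameter; a condensed sketch of that call as it appears in `open_wacz_file`, using the S3 URI from the example above:

```python
from smart_open import open as smart_open

# SW_WACZ_TIMEOUT ends up in transport_params (sketch of open_wacz_file)
transport_params = {"timeout": 60.0}
wacz_file = smart_open("s3://scrapy-webarchive/archive.wacz", "rb", transport_params=transport_params)
```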
8 changes: 4 additions & 4 deletions docs/usage.md
@@ -36,10 +36,10 @@ DOWNLOADER_MIDDLEWARES = {
}
```

-Then define the location of the WACZ archive with `SW_WACZ_SOURCE_URL` setting:
+Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URI` setting:

```python
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
```

### Iterating a WACZ archive
@@ -54,9 +54,9 @@ SPIDER_MIDDLEWARES = {
}
```

-Then define the location of the WACZ archive with `SW_WACZ_SOURCE_URL` setting:
+Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URI` setting:

```python
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_CRAWL = True
```
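With both settings in place and the spider middleware enabled as shown above, an ordinary spider is fed responses read from the archive rather than from live requests. A minimal sketch, with a hypothetical spider name and selectors:

```python
import scrapy

class QuotesArchiveSpider(scrapy.Spider):
    # Hypothetical spider; with SW_WACZ_CRAWL = True the spider middleware
    # ignores start_requests and yields every response found in the WACZ.
    name = "quotes_archive"

    def parse(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}
```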
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -25,6 +25,11 @@ classifiers = [
"Programming Language :: Python",
]

[project.optional-dependencies]
aws = ["boto3"]
gcs = ["google-cloud-storage"]
all = ["boto3", "google-cloud-storage"]
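Note that these extras pair with the deferred imports added in `scrapy_webarchive/utils.py` below: `botocore` and `google.cloud.storage` are only imported inside `get_s3_client` and `get_gcs_client`, so a base install without extras keeps working as long as no cloud URI is used.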

[project.urls]
Repository = "https://github.com/q-m/scrapy-webarchive"

12 changes: 2 additions & 10 deletions scrapy_webarchive/extensions.py
@@ -1,6 +1,4 @@
from datetime import datetime
-from pathlib import Path
-from urllib.parse import urlparse

from scrapy import Spider, signals
from scrapy.crawler import Crawler
@@ -11,7 +9,7 @@
from scrapy.settings import Settings
from typing_extensions import Self

-from scrapy_webarchive.utils import get_warc_date
+from scrapy_webarchive.utils import get_scheme_from_uri, get_warc_date
from scrapy_webarchive.wacz import WaczFileCreator
from scrapy_webarchive.warc import WarcFileWriter

@@ -40,13 +38,7 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None:
    def _get_store(self):
        archive_uri_template = self.settings["SW_EXPORT_URI"]
        uri = archive_uri_template.format(**get_archive_uri_template_variables())
-
-        if Path(uri).is_absolute():  # to support win32 paths like: C:\\some\dir
-            scheme = "file"
-        else:
-            scheme = urlparse(uri).scheme
-
-        store_cls = self.STORE_SCHEMES[scheme]
+        store_cls = self.STORE_SCHEMES[get_scheme_from_uri(uri)]
        return store_cls(uri)

    @classmethod
37 changes: 16 additions & 21 deletions scrapy_webarchive/middleware.py
@@ -7,10 +7,9 @@
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.statscollectors import StatsCollector
-from smart_open import open
from typing_extensions import Iterable, Self

-from scrapy_webarchive.wacz import MultiWaczFile, WaczFile
+from scrapy_webarchive.wacz import MultiWaczFile, WaczFile, open_wacz_file
from scrapy_webarchive.warc import record_transformer


@@ -19,14 +18,14 @@ class BaseWaczMiddleware:

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        self.stats = stats
-        wacz_url = settings.get("SW_WACZ_SOURCE_URL", None)
+        wacz_uri = settings.get("SW_WACZ_SOURCE_URI", None)

-        if not wacz_url:
+        if not wacz_uri:
            raise NotConfigured

-        self.wacz_urls = re.split(r"\s*,\s*", wacz_url)
+        self.wacz_uris = re.split(r"\s*,\s*", wacz_uri)
        self.crawl = settings.get("SW_WACZ_CRAWL", False)
-        self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60)
+        self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60.0)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
@@ -36,30 +35,26 @@ def from_crawler(cls, crawler: Crawler) -> Self:
        return o

    def spider_opened(self, spider: Spider) -> None:
-        tp = {"timeout": self.timeout}
-        multiple_entries = len(self.wacz_urls) != 1
-
-        def open_wacz_file(wacz_url: str) -> Union[IO, None]:
-            spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}")
-
-            try:
-                return open(wacz_url, "rb", transport_params=tp)
-            except OSError:
-                spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_url}")
-                return None
+        multiple_entries = len(self.wacz_uris) != 1

        if not multiple_entries:
-            wacz_url = self.wacz_urls[0]
-            wacz_file = open_wacz_file(wacz_url)
+            wacz_uri = self.wacz_uris[0]
+            spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_uri}")
+            wacz_file = open_wacz_file(wacz_uri, self.timeout, spider.settings)
            if wacz_file:
                self.wacz = WaczFile(wacz_file)
+            else:
+                spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")
        else:
            wacz_files: List[IO] = []

-            for wacz_url in self.wacz_urls:
-                wacz_file = open_wacz_file(wacz_url)
+            for wacz_uri in self.wacz_uris:
+                spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_uri}")
+                wacz_file = open_wacz_file(wacz_uri, self.timeout, spider.settings)
                if wacz_file:
                    wacz_files.append(wacz_file)
+                else:
+                    spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")

            if wacz_files:
                self.wacz = MultiWaczFile(wacz_files)
56 changes: 56 additions & 0 deletions scrapy_webarchive/utils.py
@@ -1,4 +1,8 @@
from datetime import datetime, timezone
+from pathlib import Path
+from urllib.parse import urlparse, urlunparse
+
+from scrapy.settings import Settings

WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"
@@ -20,3 +24,55 @@ def header_lines_to_dict(lines):
        v = v.lstrip()
        headers[k] = v
    return headers


def get_scheme_from_uri(uri: str) -> str:
    if Path(uri).is_absolute():  # to support win32 paths like: C:\\some\dir
        return "file"
    else:
        return urlparse(uri).scheme
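A quick sketch of what this helper returns (the comment in the code explains the `Path.is_absolute()` guard for Windows-style paths):

```python
get_scheme_from_uri("s3://scrapy-webarchive/archive.wacz")  # -> "s3"
get_scheme_from_uri("gs://bucket/archive.wacz")             # -> "gs"
get_scheme_from_uri("/data/archive.wacz")                   # -> "file"
get_scheme_from_uri(r"C:\archives\archive.wacz")            # -> "file" on Windows
```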


def get_s3_client(settings: Settings):
    """Create an S3 client using the given settings."""

    import botocore.session
    session = botocore.session.get_session()
    return session.create_client(
        "s3",
        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
        aws_session_token=settings["AWS_SESSION_TOKEN"],
        endpoint_url=settings["AWS_ENDPOINT_URL"],
        region_name=settings["AWS_REGION_NAME"],
        use_ssl=settings["AWS_USE_SSL"],
        verify=settings["AWS_VERIFY"],
    )


def get_gcs_client(settings: Settings):
    """Create a Google Cloud Storage client using the given settings."""

    from google.cloud import storage
    return storage.Client(project=settings["GCS_PROJECT_ID"])


def add_ftp_credentials(wacz_uri: str, settings: Settings) -> str:
    """Add FTP username and password to the URI if not present."""

    parsed_uri = urlparse(wacz_uri)

    if parsed_uri.username is None:
        # Build netloc with credentials
        credentials = f'{settings["FTP_USER"]}:{settings["FTP_PASSWORD"]}'
        netloc = f'{credentials}@{parsed_uri.hostname}'

        # Add port if present
        if parsed_uri.port:
            netloc += f":{parsed_uri.port}"

        # Update and return the URI with credentials
        updated_uri = parsed_uri._replace(netloc=netloc)
        return urlunparse(updated_uri)

    return wacz_uri
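A sketch of the rewrite this helper performs, with hypothetical credentials taken from the `FTP_USER` and `FTP_PASSWORD` settings:

```python
from scrapy.settings import Settings

settings = Settings({"FTP_USER": "anonymous", "FTP_PASSWORD": "guest"})

add_ftp_credentials("ftp://host/archive.wacz", settings)
# -> "ftp://anonymous:guest@host/archive.wacz"

add_ftp_credentials("ftp://user:pw@host/archive.wacz", settings)
# -> returned unchanged, credentials already present
```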
44 changes: 36 additions & 8 deletions scrapy_webarchive/wacz.py
@@ -3,19 +3,22 @@
import os
import zipfile
from collections import defaultdict
+from functools import partial
from typing import IO, Dict, Generator, List, Union

-from warc import WARCReader as BaseWARCReader
+from scrapy.settings import Settings
+from smart_open import open as smart_open
from warc.warc import WARCRecord

from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index
-from scrapy_webarchive.utils import get_current_timestamp
-
-
-class WARCReader(BaseWARCReader):
-    """WARC reader with compatibility for WARC version 1.0 and 1.1"""
-
-    SUPPORTED_VERSIONS = ["1.0", "1.1"]
+from scrapy_webarchive.utils import (
+    add_ftp_credentials,
+    get_current_timestamp,
+    get_gcs_client,
+    get_s3_client,
+    get_scheme_from_uri,
+)
+from scrapy_webarchive.warc import WARCReader


class WaczFileCreator:
@@ -172,3 +175,28 @@ def iter_index(self) -> Generator[CdxjRecord, None, None]:
        for cdxj_record in wacz.iter_index():
            cdxj_record.wacz_file = wacz
            yield cdxj_record


def open_wacz_file(wacz_uri: str, timeout: float, settings: Settings) -> Union[IO, None]:
    """Open a WACZ file from a remote location, supporting S3, GCS, and FTP."""

    tp = {"timeout": timeout}
    scheme = get_scheme_from_uri(wacz_uri)

    # Map schemes to client creation functions
    scheme_client_map = {
        "s3": partial(get_s3_client, settings),
        "gs": partial(get_gcs_client, settings),
    }

    # Handle clients for specific schemes using the map
    if scheme in scheme_client_map:
        tp["client"] = scheme_client_map[scheme]()
    elif scheme == "ftp":
        wacz_uri = add_ftp_credentials(wacz_uri, settings)

    # Try opening the WACZ file
    try:
        return smart_open(wacz_uri, "rb", transport_params=tp)
    except OSError:
        return None
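This is the helper the middleware now calls from `spider_opened`; a standalone sketch with hypothetical settings values:

```python
from scrapy.settings import Settings

settings = Settings({"AWS_ACCESS_KEY_ID": "AKIA...", "AWS_SECRET_ACCESS_KEY": "..."})
wacz_file = open_wacz_file("s3://scrapy-webarchive/archive.wacz", 60.0, settings)
if wacz_file:
    wacz = WaczFile(wacz_file)  # as in BaseWaczMiddleware.spider_opened
```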
7 changes: 7 additions & 0 deletions scrapy_webarchive/warc.py
@@ -8,6 +8,7 @@
from scrapy.http.response import Response
from scrapy.responsetypes import ResponseTypes
from typing_extensions import List, Optional, Tuple
+from warc import WARCReader as BaseWARCReader
from warc.warc import WARCRecord
from warcio.recordloader import ArcWarcRecord
from warcio.statusandheaders import StatusAndHeaders
@@ -31,6 +32,12 @@ def generate_warc_fname(prefix: str) -> str:
    return "-".join([prefix, get_current_timestamp(), serial, crawlhost]) + ".warc.gz"


class WARCReader(BaseWARCReader):
    """WARC reader with compatibility for WARC version 1.0 and 1.1"""

    SUPPORTED_VERSIONS = ["1.0", "1.1"]


class WarcFileWriter:
    """Handles writing WARC files"""

2 changes: 1 addition & 1 deletion tests/test_downloadermiddlewares.py
@@ -22,7 +22,7 @@ def _get_wacz_source_url(self) -> str:

    def _get_settings(self, **new_settings):
        settings = {
-            "SW_WACZ_SOURCE_URL": self._get_wacz_source_url(),
+            "SW_WACZ_SOURCE_URI": self._get_wacz_source_url(),
            "SW_WACZ_CRAWL": False,
            "SW_WACZ_TIMEOUT": 60,
        }
2 changes: 1 addition & 1 deletion tests/test_middleware.py
@@ -16,7 +16,7 @@ def setup_method(self):

    def _get_settings(self, **new_settings):
        settings = {
-            "SW_WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
+            "SW_WACZ_SOURCE_URI": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
            "SW_WACZ_TIMEOUT": 60,
        }
        settings.update(new_settings)