Commit a22c34f
Use Scrapy settings to configure client for opening WACZ file when using cloud provider
Wesley van Lee committed Oct 16, 2024
1 parent 89fecc2 commit a22c34f
Showing 11 changed files with 140 additions and 49 deletions.
8 changes: 8 additions & 0 deletions docs/installation.md
@@ -5,3 +5,11 @@ To install `scrapy-webarchive`, run:
```bash
pip install scrapy-webarchive
```

If you want to use a cloud provider for storing or scraping archives, you can opt in to install the extra dependencies:

```bash
pip install scrapy-webarchive[aws]
pip install scrapy-webarchive[gcs]
pip install scrapy-webarchive[all]
```
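These extras are consumed by the client helpers this commit adds in `scrapy_webarchive/utils.py`, which read their configuration from the Scrapy settings. A minimal sketch of the settings involved, with placeholder values (the keys match those read by `get_s3_client` and `get_gcs_client`; the bucket and project names are illustrative):

```python
# settings.py -- placeholder values, for illustration only
AWS_ACCESS_KEY_ID = "AKIA..."
AWS_SECRET_ACCESS_KEY = "..."
GCS_PROJECT_ID = "my-project"  # hypothetical project id
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
```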
8 changes: 4 additions & 4 deletions docs/settings.md
@@ -17,13 +17,13 @@ Supported variables: `year`, `month`, `day` and `timestamp`.

## Downloader middleware and spider middleware

-### `SW_WACZ_SOURCE_URL`
+### `SW_WACZ_SOURCE_URI`

```python
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"

# Allows multiple sources, comma separated.
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
```

This setting defines the location of the WACZ file that should be used as a source for the crawl job.
@@ -42,4 +42,4 @@ Setting to ignore original `start_requests`, just yield all responses found.
SW_WACZ_TIMEOUT = 60
```

-Transport parameter for retrieving the `SW_WACZ_SOURCE_URL` from the defined location.
+Transport parameter for retrieving the `SW_WACZ_SOURCE_URI` from the defined location.
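As of this commit the timeout is forwarded to `smart_open` as a transport parameter; a condensed sketch of that call as it appears in `open_wacz_file`, using the S3 URI from the example above:

```python
from smart_open import open as smart_open

# SW_WACZ_TIMEOUT ends up in transport_params (sketch of open_wacz_file)
transport_params = {"timeout": 60.0}
wacz_file = smart_open("s3://scrapy-webarchive/archive.wacz", "rb", transport_params=transport_params)
```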
8 changes: 4 additions & 4 deletions docs/usage.md
@@ -36,10 +36,10 @@ DOWNLOADER_MIDDLEWARES = {
}
```

-Then define the location of the WACZ archive with `SW_WACZ_SOURCE_URL` setting:
+Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URI` setting:

```python
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
```

### Iterating a WACZ archive
@@ -54,9 +54,9 @@ SPIDER_MIDDLEWARES = {
}
```

-Then define the location of the WACZ archive with `SW_WACZ_SOURCE_URL` setting:
+Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URI` setting:

```python
-SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_CRAWL = True
```
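With both settings in place and the spider middleware enabled as shown above, an ordinary spider is fed responses read from the archive rather than from live requests. A minimal sketch, with a hypothetical spider name and selectors:

```python
import scrapy

class QuotesArchiveSpider(scrapy.Spider):
    # Hypothetical spider; with SW_WACZ_CRAWL = True the spider middleware
    # ignores start_requests and yields every response found in the WACZ.
    name = "quotes_archive"

    def parse(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}
```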
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -25,6 +25,11 @@ classifiers = [
"Programming Language :: Python",
]

[project.optional-dependencies]
aws = ["boto3"]
gcs = ["google-cloud-storage"]
all = ["boto3", "google-cloud-storage"]
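Note that these extras pair with the deferred imports added in `scrapy_webarchive/utils.py` below: `botocore` and `google.cloud.storage` are only imported inside `get_s3_client` and `get_gcs_client`, so a base install without extras keeps working as long as no cloud URI is used.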

[project.urls]
Repository = "https://github.com/q-m/scrapy-webarchive"

12 changes: 2 additions & 10 deletions scrapy_webarchive/extensions.py
@@ -1,6 +1,4 @@
from datetime import datetime
-from pathlib import Path
-from urllib.parse import urlparse

from scrapy import Spider, signals
from scrapy.crawler import Crawler
@@ -11,7 +9,7 @@
from scrapy.settings import Settings
from typing_extensions import Self

-from scrapy_webarchive.utils import get_warc_date
+from scrapy_webarchive.utils import get_scheme_from_uri, get_warc_date
from scrapy_webarchive.wacz import WaczFileCreator
from scrapy_webarchive.warc import WarcFileWriter

@@ -40,13 +38,7 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None:
    def _get_store(self):
        archive_uri_template = self.settings["SW_EXPORT_URI"]
        uri = archive_uri_template.format(**get_archive_uri_template_variables())
-
-        if Path(uri).is_absolute():  # to support win32 paths like: C:\\some\dir
-            scheme = "file"
-        else:
-            scheme = urlparse(uri).scheme
-
-        store_cls = self.STORE_SCHEMES[scheme]
+        store_cls = self.STORE_SCHEMES[get_scheme_from_uri(uri)]
        return store_cls(uri)

    @classmethod
37 changes: 16 additions & 21 deletions scrapy_webarchive/middleware.py
@@ -7,10 +7,9 @@
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.statscollectors import StatsCollector
-from smart_open import open
from typing_extensions import Iterable, Self

-from scrapy_webarchive.wacz import MultiWaczFile, WaczFile
+from scrapy_webarchive.wacz import MultiWaczFile, WaczFile, open_wacz_file
from scrapy_webarchive.warc import record_transformer


@@ -19,14 +18,14 @@ class BaseWaczMiddleware:

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        self.stats = stats
-        wacz_url = settings.get("SW_WACZ_SOURCE_URL", None)
+        wacz_uri = settings.get("SW_WACZ_SOURCE_URI", None)

-        if not wacz_url:
+        if not wacz_uri:
            raise NotConfigured

-        self.wacz_urls = re.split(r"\s*,\s*", wacz_url)
+        self.wacz_uris = re.split(r"\s*,\s*", wacz_uri)
        self.crawl = settings.get("SW_WACZ_CRAWL", False)
-        self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60)
+        self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60.0)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
@@ -36,30 +35,26 @@ def from_crawler(cls, crawler: Crawler) -> Self:
        return o

    def spider_opened(self, spider: Spider) -> None:
-        tp = {"timeout": self.timeout}
-        multiple_entries = len(self.wacz_urls) != 1
-
-        def open_wacz_file(wacz_url: str) -> Union[IO, None]:
-            spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}")
-
-            try:
-                return open(wacz_url, "rb", transport_params=tp)
-            except OSError:
-                spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_url}")
-                return None
+        multiple_entries = len(self.wacz_uris) != 1

        if not multiple_entries:
-            wacz_url = self.wacz_urls[0]
-            wacz_file = open_wacz_file(wacz_url)
+            wacz_uri = self.wacz_uris[0]
+            spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_uri}")
+            wacz_file = open_wacz_file(wacz_uri, self.timeout, spider.settings)
            if wacz_file:
                self.wacz = WaczFile(wacz_file)
+            else:
+                spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")
        else:
            wacz_files: List[IO] = []

-            for wacz_url in self.wacz_urls:
-                wacz_file = open_wacz_file(wacz_url)
+            for wacz_uri in self.wacz_uris:
+                spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_uri}")
+                wacz_file = open_wacz_file(wacz_uri, self.timeout, spider.settings)
                if wacz_file:
                    wacz_files.append(wacz_file)
+                else:
+                    spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")

            if wacz_files:
                self.wacz = MultiWaczFile(wacz_files)
56 changes: 56 additions & 0 deletions scrapy_webarchive/utils.py
@@ -1,4 +1,8 @@
from datetime import datetime, timezone
+from pathlib import Path
+from urllib.parse import urlparse, urlunparse
+
+from scrapy.settings import Settings

WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"
@@ -20,3 +24,55 @@ def header_lines_to_dict(lines):
        v = v.lstrip()
        headers[k] = v
    return headers


def get_scheme_from_uri(uri: str) -> str:
    if Path(uri).is_absolute():  # to support win32 paths like: C:\\some\dir
        return "file"
    else:
        return urlparse(uri).scheme
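A quick sketch of what this helper returns (the comment in the code explains the `Path.is_absolute()` guard for Windows-style paths):

```python
get_scheme_from_uri("s3://scrapy-webarchive/archive.wacz")  # -> "s3"
get_scheme_from_uri("gs://bucket/archive.wacz")             # -> "gs"
get_scheme_from_uri("/data/archive.wacz")                   # -> "file"
get_scheme_from_uri(r"C:\archives\archive.wacz")            # -> "file" on Windows
```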


def get_s3_client(settings: Settings):
    """Create an S3 client using the given settings."""

    import botocore.session
    session = botocore.session.get_session()
    return session.create_client(
        "s3",
        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
        aws_session_token=settings["AWS_SESSION_TOKEN"],
        endpoint_url=settings["AWS_ENDPOINT_URL"],
        region_name=settings["AWS_REGION_NAME"],
        use_ssl=settings["AWS_USE_SSL"],
        verify=settings["AWS_VERIFY"],
    )


def get_gcs_client(settings: Settings):
    """Create a Google Cloud Storage client using the given settings."""

    from google.cloud import storage
    return storage.Client(project=settings["GCS_PROJECT_ID"])


def add_ftp_credentials(wacz_uri: str, settings: Settings) -> str:
    """Add FTP username and password to the URI if not present."""

    parsed_uri = urlparse(wacz_uri)

    if parsed_uri.username is None:
        # Build netloc with credentials
        credentials = f'{settings["FTP_USER"]}:{settings["FTP_PASSWORD"]}'
        netloc = f'{credentials}@{parsed_uri.hostname}'

        # Add port if present
        if parsed_uri.port:
            netloc += f":{parsed_uri.port}"

        # Update and return the URI with credentials
        updated_uri = parsed_uri._replace(netloc=netloc)
        return urlunparse(updated_uri)

    return wacz_uri
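A sketch of the rewrite this helper performs, with hypothetical credentials taken from the `FTP_USER` and `FTP_PASSWORD` settings:

```python
from scrapy.settings import Settings

settings = Settings({"FTP_USER": "anonymous", "FTP_PASSWORD": "guest"})

add_ftp_credentials("ftp://host/archive.wacz", settings)
# -> "ftp://anonymous:guest@host/archive.wacz"

add_ftp_credentials("ftp://user:pw@host/archive.wacz", settings)
# -> returned unchanged, credentials already present
```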
44 changes: 36 additions & 8 deletions scrapy_webarchive/wacz.py
@@ -3,19 +3,22 @@
import os
import zipfile
from collections import defaultdict
+from functools import partial
from typing import IO, Dict, Generator, List, Union

-from warc import WARCReader as BaseWARCReader
+from scrapy.settings import Settings
+from smart_open import open as smart_open
from warc.warc import WARCRecord

from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index
-from scrapy_webarchive.utils import get_current_timestamp
-
-
-class WARCReader(BaseWARCReader):
-    """WARC reader with compatibility for WARC version 1.0 and 1.1"""
-
-    SUPPORTED_VERSIONS = ["1.0", "1.1"]
+from scrapy_webarchive.utils import (
+    add_ftp_credentials,
+    get_current_timestamp,
+    get_gcs_client,
+    get_s3_client,
+    get_scheme_from_uri,
+)
+from scrapy_webarchive.warc import WARCReader


class WaczFileCreator:
@@ -172,3 +175,28 @@ def iter_index(self) -> Generator[CdxjRecord, None, None]:
        for cdxj_record in wacz.iter_index():
            cdxj_record.wacz_file = wacz
            yield cdxj_record


def open_wacz_file(wacz_uri: str, timeout: float, settings: Settings) -> Union[IO, None]:
    """Open a WACZ file from a remote location, supporting S3, GCS, and FTP."""

    tp = {"timeout": timeout}
    scheme = get_scheme_from_uri(wacz_uri)

    # Map schemes to client creation functions
    scheme_client_map = {
        "s3": partial(get_s3_client, settings),
        "gs": partial(get_gcs_client, settings),
    }

    # Handle clients for specific schemes using the map
    if scheme in scheme_client_map:
        tp["client"] = scheme_client_map[scheme]()
    elif scheme == "ftp":
        wacz_uri = add_ftp_credentials(wacz_uri, settings)

    # Try opening the WACZ file
    try:
        return smart_open(wacz_uri, "rb", transport_params=tp)
    except OSError:
        return None
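This is the helper the middleware now calls from `spider_opened`; a standalone sketch with hypothetical settings values:

```python
from scrapy.settings import Settings

settings = Settings({"AWS_ACCESS_KEY_ID": "AKIA...", "AWS_SECRET_ACCESS_KEY": "..."})
wacz_file = open_wacz_file("s3://scrapy-webarchive/archive.wacz", 60.0, settings)
if wacz_file:
    wacz = WaczFile(wacz_file)  # as in BaseWaczMiddleware.spider_opened
```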
7 changes: 7 additions & 0 deletions scrapy_webarchive/warc.py
@@ -8,6 +8,7 @@
from scrapy.http.response import Response
from scrapy.responsetypes import ResponseTypes
from typing_extensions import List, Optional, Tuple
+from warc import WARCReader as BaseWARCReader
from warc.warc import WARCRecord
from warcio.recordloader import ArcWarcRecord
from warcio.statusandheaders import StatusAndHeaders
@@ -31,6 +32,12 @@ def generate_warc_fname(prefix: str) -> str:
    return "-".join([prefix, get_current_timestamp(), serial, crawlhost]) + ".warc.gz"


class WARCReader(BaseWARCReader):
    """WARC reader with compatibility for WARC version 1.0 and 1.1"""

    SUPPORTED_VERSIONS = ["1.0", "1.1"]


class WarcFileWriter:
    """Handles writing WARC files"""

2 changes: 1 addition & 1 deletion tests/test_downloadermiddlewares.py
@@ -22,7 +22,7 @@ def _get_wacz_source_url(self) -> str:

    def _get_settings(self, **new_settings):
        settings = {
-            "SW_WACZ_SOURCE_URL": self._get_wacz_source_url(),
+            "SW_WACZ_SOURCE_URI": self._get_wacz_source_url(),
            "SW_WACZ_CRAWL": False,
            "SW_WACZ_TIMEOUT": 60,
        }
2 changes: 1 addition & 1 deletion tests/test_middleware.py
@@ -16,7 +16,7 @@ def setup_method(self):

    def _get_settings(self, **new_settings):
        settings = {
-            "SW_WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
+            "SW_WACZ_SOURCE_URI": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
            "SW_WACZ_TIMEOUT": 60,
        }
        settings.update(new_settings)