Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: error handling in CLI and templates documentation #273

Merged
merged 9 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/quick-start/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ The JSON file will contain data similar to the following:

:::tip

If you want to change the storage directory, you can set the `CRAWLEE_LOCAL_STORAGE_DIR` environment variable to your preferred path.
If you want to change the storage directory, you can set the `CRAWLEE_STORAGE_DIR` environment variable to your preferred path.

:::

Expand Down
6 changes: 3 additions & 3 deletions src/crawlee/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,12 @@ class Configuration(BaseSettings):
),
] = None

local_storage_dir: Annotated[
storage_dir: Annotated[
str,
Field(
validation_alias=AliasChoices(
'apify_local_storage_dir',
'crawlee_local_storage_dir',
'apify_storage_dir',
vdusek marked this conversation as resolved.
Show resolved Hide resolved
'crawlee_storage_dir',
),
),
] = './storage'
Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/memory_storage_client/memory_storage_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def persist_storage(self) -> bool:
@property
def storage_dir(self) -> str:
"""Path to the storage directory."""
return self._configuration.local_storage_dir
return self._configuration.storage_dir

@property
def datasets_directory(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/storages/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class Dataset(BaseStorage):
removal of existing ones. This class is typically used for storing crawling results.

Data can be stored locally or in the cloud, with local storage paths formatted as:
`{CRAWLEE_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json`. Here, `{DATASET_ID}` is either "default" or
`{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json`. Here, `{DATASET_ID}` is either "default" or
a specific dataset ID, and `{INDEX}` represents the zero-based index of the item in the dataset.

To open a dataset, use the `open` class method with an `id`, `name`, or `config`. If unspecified, the default
Expand Down
4 changes: 2 additions & 2 deletions src/crawlee/storages/key_value_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ class KeyValueStore(BaseStorage):
Each record is identified by a unique key and associated with a MIME content type. This class is used within
crawler runs to store inputs and outputs, typically in JSON format, but supports other types as well.

The data can be stored on a local filesystem or in the cloud, determined by the `CRAWLEE_LOCAL_STORAGE_DIR`
The data can be stored on a local filesystem or in the cloud, determined by the `CRAWLEE_STORAGE_DIR`
environment variable.

By default, data is stored in `{CRAWLEE_LOCAL_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT}`, where
By default, data is stored in `{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}`, where
`{STORE_ID}` is either "default" or specified by `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`, `{KEY}` is the record key,
and `{EXT}` corresponds to the MIME content type of the record.

Expand Down
4 changes: 2 additions & 2 deletions src/crawlee/storages/request_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ class RequestQueue(BaseStorage, RequestProvider):
following links. Each URL is uniquely identified by a `unique_key` field, which can be overridden to add the same
URL multiple times under different keys.

Local storage path (if `CRAWLEE_LOCAL_STORAGE_DIR` is set):
`{CRAWLEE_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json`, where `{QUEUE_ID}` is the request
Local storage path (if `CRAWLEE_STORAGE_DIR` is set):
`{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json`, where `{QUEUE_ID}` is the request
queue's ID (default or specified) and `{REQUEST_ID}` is the request's ID.

Usage includes creating or opening existing queues by ID or name, with named queues retained indefinitely and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ Project skeleton generated by Crawlee (Beautifulsoup template).

## Usage

First, make sure you have the Poetry package management system installed.

Then, install the dependencies:
```sh
poetry install
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def _isolate_test_environment(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -
tmp_path: A unique temporary directory path provided by pytest for test isolation.
"""
# Set the environment variable for the local storage directory to the temporary path
monkeypatch.setenv('CRAWLEE_LOCAL_STORAGE_DIR', str(tmp_path))
monkeypatch.setenv('CRAWLEE_STORAGE_DIR', str(tmp_path))

# Reset the local and cloud clients in StorageClientManager
StorageClientManager._local_client = MemoryStorageClient()
Expand All @@ -43,15 +43,15 @@ def _isolate_test_environment(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -
monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {})

# Verify that the environment variable is set correctly
assert os.environ.get('CRAWLEE_LOCAL_STORAGE_DIR') == str(tmp_path)
assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path)


@pytest.fixture()
def memory_storage_client(tmp_path: Path) -> MemoryStorageClient:
cfg = Configuration(
write_metadata=True,
persist_storage=True,
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
vdusek marked this conversation as resolved.
Show resolved Hide resolved
)
return MemoryStorageClient(cfg)

Expand Down
36 changes: 18 additions & 18 deletions tests/unit/memory_storage_client/test_memory_storage_client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# TODO: type ignores and crawlee_local_storage_dir
# TODO: type ignores and crawlee_storage_dir
# https://github.com/apify/crawlee-py/issues/146

from __future__ import annotations
Expand All @@ -21,13 +21,13 @@ async def test_write_metadata(tmp_path: Path) -> None:
dataset_no_metadata_name = 'test-no-metadata'
ms = MemoryStorageClient(
Configuration(
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
write_metadata=True,
vdusek marked this conversation as resolved.
Show resolved Hide resolved
),
)
ms_no_metadata = MemoryStorageClient(
Configuration(
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
write_metadata=False,
)
)
Expand All @@ -45,13 +45,13 @@ async def test_write_metadata(tmp_path: Path) -> None:
async def test_persist_storage(tmp_path: Path) -> None:
ms = MemoryStorageClient(
Configuration(
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
persist_storage=True,
)
)
ms_no_persist = MemoryStorageClient(
Configuration(
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
persist_storage=False,
)
)
Expand All @@ -71,20 +71,20 @@ async def test_persist_storage(tmp_path: Path) -> None:

def test_persist_storage_set_to_false_via_string_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
monkeypatch.setenv('CRAWLEE_PERSIST_STORAGE', 'false')
ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore
ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore
assert ms.persist_storage is False


def test_persist_storage_set_to_false_via_numeric_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
monkeypatch.setenv('CRAWLEE_PERSIST_STORAGE', '0')
ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore
ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore
assert ms.persist_storage is False


def test_persist_storage_true_via_constructor_arg(tmp_path: Path) -> None:
ms = MemoryStorageClient(
Configuration(
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
persist_storage=True,
)
)
Expand All @@ -93,14 +93,14 @@ def test_persist_storage_true_via_constructor_arg(tmp_path: Path) -> None:

def test_default_write_metadata_behavior(tmp_path: Path) -> None:
# Default behavior
ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore
ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore
assert ms.write_metadata is True


def test_write_metadata_set_to_false_via_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
# Test if env var changes write_metadata to False
monkeypatch.setenv('CRAWLEE_WRITE_METADATA', 'false')
ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore
ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore
assert ms.write_metadata is False


Expand All @@ -109,7 +109,7 @@ def test_write_metadata_false_via_constructor_arg_overrides_env_var(tmp_path: Pa
ms = MemoryStorageClient(
Configuration(
write_metadata=False,
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
)
)
assert ms.write_metadata is False
Expand All @@ -119,7 +119,7 @@ async def test_purge_datasets(tmp_path: Path) -> None:
ms = MemoryStorageClient(
Configuration(
write_metadata=True,
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
)
)
# Create default and non-default datasets
Expand All @@ -142,7 +142,7 @@ async def test_purge_key_value_stores(tmp_path: Path) -> None:
ms = MemoryStorageClient(
Configuration(
write_metadata=True,
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
)
)

Expand Down Expand Up @@ -177,7 +177,7 @@ async def test_purge_request_queues(tmp_path: Path) -> None:
ms = MemoryStorageClient(
Configuration(
write_metadata=True,
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
)
)
# Create default and non-default request queues
Expand All @@ -199,7 +199,7 @@ async def test_not_implemented_method(tmp_path: Path) -> None:
ms = MemoryStorageClient(
Configuration(
write_metadata=True,
crawlee_local_storage_dir=str(tmp_path), # type: ignore
crawlee_storage_dir=str(tmp_path), # type: ignore
)
)
ddt = ms.dataset('test')
Expand All @@ -212,21 +212,21 @@ async def test_not_implemented_method(tmp_path: Path) -> None:

async def test_default_storage_path_used(monkeypatch: pytest.MonkeyPatch) -> None:
# We expect the default value to be used
monkeypatch.delenv('CRAWLEE_LOCAL_STORAGE_DIR', raising=False)
monkeypatch.delenv('CRAWLEE_STORAGE_DIR', raising=False)
ms = MemoryStorageClient()
assert ms.storage_dir == './storage'


async def test_storage_path_from_env_var_overrides_default(monkeypatch: pytest.MonkeyPatch) -> None:
# We expect the env var to override the default value
monkeypatch.setenv('CRAWLEE_LOCAL_STORAGE_DIR', './env_var_storage_dir')
monkeypatch.setenv('CRAWLEE_STORAGE_DIR', './env_var_storage_dir')
ms = MemoryStorageClient()
assert ms.storage_dir == './env_var_storage_dir'


async def test_parametrized_storage_path_overrides_env_var() -> None:
# We expect the parametrized value to be used
ms = MemoryStorageClient(
Configuration(crawlee_local_storage_dir='./parametrized_storage_dir'), # type: ignore
Configuration(crawlee_storage_dir='./parametrized_storage_dir'), # type: ignore
)
assert ms.storage_dir == './parametrized_storage_dir'
Loading