From 1c438f1a1cf3de0f18fcf87b6aa83e25405e8b85 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 11:44:36 +0200 Subject: [PATCH 1/9] fix: rename local_storage_dir to storage_dir --- docs/quick-start/index.md | 2 +- src/crawlee/configuration.py | 6 ++-- .../memory_storage_client.py | 2 +- src/crawlee/storages/dataset.py | 2 +- src/crawlee/storages/key_value_store.py | 4 +-- src/crawlee/storages/request_queue.py | 4 +-- .../{{cookiecutter.project_name}}/README.md | 2 ++ tests/unit/conftest.py | 6 ++-- .../test_memory_storage_client.py | 36 +++++++++---------- 9 files changed, 33 insertions(+), 31 deletions(-) diff --git a/docs/quick-start/index.md b/docs/quick-start/index.md index 3907ac1eb8..5531620375 100644 --- a/docs/quick-start/index.md +++ b/docs/quick-start/index.md @@ -182,7 +182,7 @@ The JSON file will contain data similar to the following: :::tip -If you want to change the storage directory, you can set the `CRAWLEE_LOCAL_STORAGE_DIR` environment variable to your preferred path. +If you want to change the storage directory, you can set the `CRAWLEE_STORAGE_DIR` environment variable to your preferred path. ::: diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index df4bb57648..2a715695af 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -166,12 +166,12 @@ class Configuration(BaseSettings): ), ] = None - local_storage_dir: Annotated[ + storage_dir: Annotated[ str, Field( validation_alias=AliasChoices( - 'apify_local_storage_dir', - 'crawlee_local_storage_dir', + 'apify_storage_dir', + 'crawlee_storage_dir', ), ), ] = './storage' diff --git a/src/crawlee/memory_storage_client/memory_storage_client.py b/src/crawlee/memory_storage_client/memory_storage_client.py index 218668ee14..b17c762785 100644 --- a/src/crawlee/memory_storage_client/memory_storage_client.py +++ b/src/crawlee/memory_storage_client/memory_storage_client.py @@ -77,7 +77,7 @@ def persist_storage(self) -> bool: @property def storage_dir(self) -> str: """Path to the storage directory.""" - return self._configuration.local_storage_dir + return self._configuration.storage_dir @property def datasets_directory(self) -> str: diff --git a/src/crawlee/storages/dataset.py b/src/crawlee/storages/dataset.py index 8af781af12..2616622056 100644 --- a/src/crawlee/storages/dataset.py +++ b/src/crawlee/storages/dataset.py @@ -78,7 +78,7 @@ class Dataset(BaseStorage): removal of existing ones. This class is typically used for storing crawling results. Data can be stored locally or in the cloud, with local storage paths formatted as: - `{CRAWLEE_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json`. Here, `{DATASET_ID}` is either "default" or + `{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json`. Here, `{DATASET_ID}` is either "default" or a specific dataset ID, and `{INDEX}` represents the zero-based index of the item in the dataset. To open a dataset, use the `open` class method with an `id`, `name`, or `config`. If unspecified, the default diff --git a/src/crawlee/storages/key_value_store.py b/src/crawlee/storages/key_value_store.py index b4436d61a3..b63706e319 100644 --- a/src/crawlee/storages/key_value_store.py +++ b/src/crawlee/storages/key_value_store.py @@ -20,10 +20,10 @@ class KeyValueStore(BaseStorage): Each record is identified by a unique key and associated with a MIME content type. This class is used within crawler runs to store inputs and outputs, typically in JSON format, but supports other types as well. 
-    The data can be stored on a local filesystem or in the cloud, determined by the `CRAWLEE_LOCAL_STORAGE_DIR`
+    The data can be stored on a local filesystem or in the cloud, determined by the `CRAWLEE_STORAGE_DIR`
     environment variable.
 
-    By default, data is stored in `{CRAWLEE_LOCAL_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT}`, where
+    By default, data is stored in `{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}`, where
     `{STORE_ID}` is either "default" or specified by `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`, `{KEY}` is the record key,
     and `{EXT}` is the MIME type.
 
diff --git a/src/crawlee/storages/request_queue.py b/src/crawlee/storages/request_queue.py
index e6bdff180a..d5e7411725 100644
--- a/src/crawlee/storages/request_queue.py
+++ b/src/crawlee/storages/request_queue.py
@@ -40,8 +40,8 @@ class RequestQueue(BaseStorage, RequestProvider):
     following links. Each URL is uniquely identified by a `unique_key` field, which can be overridden to add the same
     URL multiple times under different keys.
 
-    Local storage path (if `CRAWLEE_LOCAL_STORAGE_DIR` is set):
-    `{CRAWLEE_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json`, where `{QUEUE_ID}` is the request
+    Local storage path (if `CRAWLEE_STORAGE_DIR` is set):
+    `{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json`, where `{QUEUE_ID}` is the request
     queue's ID (default or specified) and `{REQUEST_ID}` is the request's ID.
 
     Usage includes creating or opening existing queues by ID or name, with named queues retained indefinitely and
diff --git a/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md b/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md
index 496c4f6a5d..34223a7293 100644
--- a/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md
+++ b/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md
@@ -4,6 +4,8 @@ Project skeleton generated by Crawlee (Beautifulsoup template).
 
 ## Usage
 
+First, make sure you have Poetry package management system installed.
+
 First, install the dependencies:
 ```sh
 poetry install
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index bb29e27fce..ad84a81054 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -25,7 +25,7 @@ def _isolate_test_environment(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -
         tmp_path: A unique temporary directory path provided by pytest for test isolation.
""" # Set the environment variable for the local storage directory to the temporary path - monkeypatch.setenv('CRAWLEE_LOCAL_STORAGE_DIR', str(tmp_path)) + monkeypatch.setenv('CRAWLEE_STORAGE_DIR', str(tmp_path)) # Reset the local and cloud clients in StorageClientManager StorageClientManager._local_client = MemoryStorageClient() @@ -43,7 +43,7 @@ def _isolate_test_environment(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) # Verify that the environment variable is set correctly - assert os.environ.get('CRAWLEE_LOCAL_STORAGE_DIR') == str(tmp_path) + assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) @pytest.fixture() @@ -51,7 +51,7 @@ def memory_storage_client(tmp_path: Path) -> MemoryStorageClient: cfg = Configuration( write_metadata=True, persist_storage=True, - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore ) return MemoryStorageClient(cfg) diff --git a/tests/unit/memory_storage_client/test_memory_storage_client.py b/tests/unit/memory_storage_client/test_memory_storage_client.py index a6f1758218..e99992219c 100644 --- a/tests/unit/memory_storage_client/test_memory_storage_client.py +++ b/tests/unit/memory_storage_client/test_memory_storage_client.py @@ -1,4 +1,4 @@ -# TODO: type ignores and crawlee_local_storage_dir +# TODO: type ignores and crawlee_storage_dir # https://github.com/apify/crawlee-py/issues/146 from __future__ import annotations @@ -21,13 +21,13 @@ async def test_write_metadata(tmp_path: Path) -> None: dataset_no_metadata_name = 'test-no-metadata' ms = MemoryStorageClient( Configuration( - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore write_metadata=True, ), ) ms_no_metadata = MemoryStorageClient( Configuration( - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore write_metadata=False, ) ) @@ -45,13 +45,13 @@ async def test_write_metadata(tmp_path: Path) -> None: async def test_persist_storage(tmp_path: Path) -> None: ms = MemoryStorageClient( Configuration( - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore persist_storage=True, ) ) ms_no_persist = MemoryStorageClient( Configuration( - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore persist_storage=False, ) ) @@ -71,20 +71,20 @@ async def test_persist_storage(tmp_path: Path) -> None: def test_persist_storage_set_to_false_via_string_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: monkeypatch.setenv('CRAWLEE_PERSIST_STORAGE', 'false') - ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore + ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore assert ms.persist_storage is False def test_persist_storage_set_to_false_via_numeric_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: monkeypatch.setenv('CRAWLEE_PERSIST_STORAGE', '0') - ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore + ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore assert ms.persist_storage is False def test_persist_storage_true_via_constructor_arg(tmp_path: Path) -> None: ms = MemoryStorageClient( Configuration( - crawlee_local_storage_dir=str(tmp_path), # type: ignore + 
crawlee_storage_dir=str(tmp_path), # type: ignore persist_storage=True, ) ) @@ -93,14 +93,14 @@ def test_persist_storage_true_via_constructor_arg(tmp_path: Path) -> None: def test_default_write_metadata_behavior(tmp_path: Path) -> None: # Default behavior - ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore + ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore assert ms.write_metadata is True def test_write_metadata_set_to_false_via_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: # Test if env var changes write_metadata to False monkeypatch.setenv('CRAWLEE_WRITE_METADATA', 'false') - ms = MemoryStorageClient(Configuration(crawlee_local_storage_dir=str(tmp_path))) # type: ignore + ms = MemoryStorageClient(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore assert ms.write_metadata is False @@ -109,7 +109,7 @@ def test_write_metadata_false_via_constructor_arg_overrides_env_var(tmp_path: Pa ms = MemoryStorageClient( Configuration( write_metadata=False, - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore ) ) assert ms.write_metadata is False @@ -119,7 +119,7 @@ async def test_purge_datasets(tmp_path: Path) -> None: ms = MemoryStorageClient( Configuration( write_metadata=True, - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore ) ) # Create default and non-default datasets @@ -142,7 +142,7 @@ async def test_purge_key_value_stores(tmp_path: Path) -> None: ms = MemoryStorageClient( Configuration( write_metadata=True, - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore ) ) @@ -177,7 +177,7 @@ async def test_purge_request_queues(tmp_path: Path) -> None: ms = MemoryStorageClient( Configuration( write_metadata=True, - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore ) ) # Create default and non-default request queues @@ -199,7 +199,7 @@ async def test_not_implemented_method(tmp_path: Path) -> None: ms = MemoryStorageClient( Configuration( write_metadata=True, - crawlee_local_storage_dir=str(tmp_path), # type: ignore + crawlee_storage_dir=str(tmp_path), # type: ignore ) ) ddt = ms.dataset('test') @@ -212,14 +212,14 @@ async def test_not_implemented_method(tmp_path: Path) -> None: async def test_default_storage_path_used(monkeypatch: pytest.MonkeyPatch) -> None: # We expect the default value to be used - monkeypatch.delenv('CRAWLEE_LOCAL_STORAGE_DIR', raising=False) + monkeypatch.delenv('CRAWLEE_STORAGE_DIR', raising=False) ms = MemoryStorageClient() assert ms.storage_dir == './storage' async def test_storage_path_from_env_var_overrides_default(monkeypatch: pytest.MonkeyPatch) -> None: # We expect the env var to override the default value - monkeypatch.setenv('CRAWLEE_LOCAL_STORAGE_DIR', './env_var_storage_dir') + monkeypatch.setenv('CRAWLEE_STORAGE_DIR', './env_var_storage_dir') ms = MemoryStorageClient() assert ms.storage_dir == './env_var_storage_dir' @@ -227,6 +227,6 @@ async def test_storage_path_from_env_var_overrides_default(monkeypatch: pytest.M async def test_parametrized_storage_path_overrides_env_var() -> None: # We expect the parametrized value to be used ms = MemoryStorageClient( - Configuration(crawlee_local_storage_dir='./parametrized_storage_dir'), # type: ignore + Configuration(crawlee_storage_dir='./parametrized_storage_dir'), # type: ignore ) 
assert ms.storage_dir == './parametrized_storage_dir' From 0da5c02e20681347e863450e2ea0fa7aaccaac7e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 12:29:42 +0200 Subject: [PATCH 2/9] doc: mention CLI and templates in readme and doc --- CHANGELOG.md | 2 +- CONTRIBUTING.md | 22 +++--- README.md | 68 +++++++++++-------- docs/introduction/01-setting-up.md | 30 +++++--- docs/quick-start/index.md | 10 +-- .../{{cookiecutter.project_name}}/README.md | 12 +++- .../{{cookiecutter.project_name}}/README.md | 12 +++- 7 files changed, 97 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5deb26a245..da2061efce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [0.0.8](../../releases/tag/v0.0.8) - Unreleased +## [0.1.0](../../releases/tag/v0.1.0) - Unreleased ### Adds diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 51aa29328a..ef5353ccde 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ We use [Poetry](https://python-poetry.org/) for project management. Install it a To install this package and its development dependencies, run: -```bash +```sh make install-dev ``` @@ -20,7 +20,7 @@ make install-dev To execute all code checking tools together, run: -```bash +```sh make check-code ``` @@ -30,7 +30,7 @@ We utilize [ruff](https://docs.astral.sh/ruff/) for linting, which analyzes code To run linting: -```bash +```sh make lint ``` @@ -40,7 +40,7 @@ Our automated code formatting also leverages [ruff](https://docs.astral.sh/ruff/ To run formatting: -```bash +```sh make format ``` @@ -50,7 +50,7 @@ Type checking is handled by [mypy](https://mypy.readthedocs.io/), verifying code To run type checking: -```bash +```sh make type-check ``` @@ -62,13 +62,13 @@ We use [pytest](https://docs.pytest.org/) as a testing framework with many plugi To run unit tests: -```bash +```sh make unit-tests ``` To run unit tests with HTML coverage report: -```bash +```sh make unit-tests-cov ``` @@ -90,25 +90,25 @@ To run the documentation locally, you need to have Node.js version 20 or higher Navigate to the `website/` directory: -```bash +```sh cd website/ ``` Enable Corepack, which installs Yarn automatically: -```bash +```sh corepack enable ``` Install the necessary dependencies: -```bash +```sh yarn ``` Start the project in development mode with Hot Module Replacement (HMR): -```bash +```sh yarn start ``` diff --git a/README.md b/README.md index 0ae8d52b9d..34ba23c035 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.** -Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data, and store it to disk or cloud while staying configurable to suit your project's needs. +Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it. 
> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈 @@ -23,7 +23,7 @@ We recommend visiting the [Introduction tutorial](https://crawlee.dev/python/doc Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) PyPI package. -```bash +```sh pip install crawlee ``` @@ -31,56 +31,45 @@ Additional, optional dependencies unlocking more features are shipped as package If you plan to use `BeautifulSoupCrawler`, install `crawlee` with `beautifulsoup` extra: -```bash +```sh pip install 'crawlee[beautifulsoup]' ``` If you plan to use `PlaywrightCrawler`, install `crawlee` with the `playwright` extra: -```bash +```sh pip install 'crawlee[playwright]' ``` Then, install the Playwright dependencies: -```bash +```sh playwright install ``` You can install multiple extras at once by using a comma as a separator: -```bash +```sh pip install 'crawlee[beautifulsoup,playwright]' ``` -## Features - -Why Crawlee is the preferred choice for web scraping and crawling? +### With Crawlee CLI -### Why use Crawlee instead of just a random HTTP library with an HTML parser? - -- Unified interface for **HTTP & headless browser** crawling. -- Automatic **parallel crawling** based on available system resources. -- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking). -- Automatic **retries** on errors or when you’re getting blocked. -- Integrated **proxy rotation** and session management. -- Configurable **request routing** - direct URLs to the appropriate handlers. -- Persistent **queue for URLs** to crawl. -- Pluggable **storage** of both tabular data and files. -- Robust **error handling**. +The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. First, ensure you have [Pipx](https://pipx.pypa.io/) installed: -### Why to use Crawlee rather than Scrapy? +```sh +pipx --help +``` -- Crawlee has out-of-the-box support for **headless browser** crawling (Playwright). -- Crawlee has a **minimalistic & elegant interface** - Set up your scraper with fewer than 10 lines of code. -- Complete **type hint** coverage. -- Based on standard **Asyncio**. +Then, run the CLI and choose from the available templates: -## Introduction +```sh +pipx run crawlee create my-crawler +``` -Crawlee covers your crawling and scraping end-to-end and helps you build reliable scrapers. Fast. +## Examples -Your crawlers will appear human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it. +Here are some practical examples to help you get started with different types of crawlers in Crawlee. Each example demonstrates how to set up and run a crawler for specific use cases, whether you need to handle simple HTML pages or interact with JavaScript-heavy sites. ### BeautifulSoupCrawler @@ -167,6 +156,29 @@ if __name__ == '__main__': Explore our [Examples](https://crawlee.dev/python/docs/examples) page in the Crawlee documentation for a wide range of additional use cases and demonstrations. +## Features + +Why Crawlee is the preferred choice for web scraping and crawling? 
+
+### Why use Crawlee instead of just a random HTTP library with an HTML parser?
+
+- Unified interface for **HTTP & headless browser** crawling.
+- Automatic **parallel crawling** based on available system resources.
+- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking).
+- Automatic **retries** on errors or when you’re getting blocked.
+- Integrated **proxy rotation** and session management.
+- Configurable **request routing** - direct URLs to the appropriate handlers.
+- Persistent **queue for URLs** to crawl.
+- Pluggable **storage** of both tabular data and files.
+- Robust **error handling**.
+
+### Why use Crawlee rather than Scrapy?
+
+- Crawlee has out-of-the-box support for **headless browser** crawling (Playwright).
+- Crawlee has a **minimalistic & elegant interface** - Set up your scraper with fewer than 10 lines of code.
+- Complete **type hint** coverage.
+- Based on standard **Asyncio**.
+
 ## Running on the Apify platform
 
 Crawlee is open-source and runs anywhere, but since it's developed by [Apify](https://apify.com), it's easy to set up on the Apify platform and run in the cloud. Visit the [Apify SDK website](https://docs.apify.com/sdk/python/) to learn more about deploying Crawlee to the Apify platform.
 
diff --git a/docs/introduction/01-setting-up.md b/docs/introduction/01-setting-up.md
index 628f0ba8b7..4ce2072f27 100644
--- a/docs/introduction/01-setting-up.md
+++ b/docs/introduction/01-setting-up.md
@@ -10,11 +10,11 @@ To run Crawlee on your computer, ensure you meet the following requirements:
 
 You can verify these by running the following commands:
 
-```bash
+```sh
 python --version
 ```
 
-```bash
+```sh
 pip --version
 ```
 
@@ -22,7 +22,7 @@ pip --version
 
 Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) PyPI package.
 
-```bash
+```sh
 pip install crawlee
 ```
 
@@ -30,35 +30,47 @@ Additional, optional dependencies unlocking more features are shipped as package
 
 If you plan to use `BeautifulSoupCrawler`, install `crawlee` with `beautifulsoup` extra:
 
-```bash
+```sh
 pip install 'crawlee[beautifulsoup]'
 ```
 
 If you plan to use `PlaywrightCrawler`, install `crawlee` with the `playwright` extra:
 
-```bash
+```sh
 pip install 'crawlee[playwright]'
 ```
 
 Then, install the Playwright dependencies:
 
-```bash
+```sh
 playwright install
 ```
 
 You can install multiple extras at once by using a comma as a separator:
 
-```bash
+```sh
 pip install 'crawlee[beautifulsoup,playwright]'
 ```
 
 Verify that Crawlee is successfully installed:
 
-```bash
+```sh
 python -c 'import crawlee; print(crawlee.__version__)'
 ```
 
-
+## With Crawlee CLI
+
+The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. First, ensure you have [Pipx](https://pipx.pypa.io/) installed:
+
+```sh
+pipx --help
+```
+
+Then, run the CLI and choose from the available templates:
+
+```sh
+pipx run crawlee create my-crawler
+```
 
 ## Next steps
diff --git a/docs/quick-start/index.md b/docs/quick-start/index.md
index 5531620375..95c35b94bf 100644
--- a/docs/quick-start/index.md
+++ b/docs/quick-start/index.md
@@ -30,7 +30,7 @@ Crawlee requires Python 3.9 or later.
 
 Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) PyPI package.
 
-```bash +```sh pip install crawlee ``` @@ -38,25 +38,25 @@ Additional, optional dependencies unlocking more features are shipped as package If you plan to use `BeautifulSoupCrawler`, install `crawlee` with `beautifulsoup` extra: -```bash +```sh pip install 'crawlee[beautifulsoup]' ``` If you plan to use `PlaywrightCrawler`, install `crawlee` with the `playwright` extra: -```bash +```sh pip install 'crawlee[playwright]' ``` Then, install the Playwright dependencies: -```bash +```sh playwright install ``` You can install multiple extras at once by using a comma as a separator: -```bash +```sh pip install 'crawlee[beautifulsoup,playwright]' ``` diff --git a/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md b/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md index 34223a7293..46556c65b3 100644 --- a/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md +++ b/templates/beautifulsoup/{{cookiecutter.project_name}}/README.md @@ -4,14 +4,20 @@ Project skeleton generated by Crawlee (Beautifulsoup template). ## Usage -First, make sure you have Poetry package management system installed. +To get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. You can install it with the following command: + +```sh +pip install poetry +``` + +Next, install the project dependencies: -First, install the dependencies: ```sh poetry install ``` -Then you can launch the crawler: +Finally, launch the crawler with: + ```sh poetry run python -m {{cookiecutter.project_name}} ``` diff --git a/templates/playwright/{{cookiecutter.project_name}}/README.md b/templates/playwright/{{cookiecutter.project_name}}/README.md index 434696b84a..72ff6525f3 100644 --- a/templates/playwright/{{cookiecutter.project_name}}/README.md +++ b/templates/playwright/{{cookiecutter.project_name}}/README.md @@ -4,12 +4,20 @@ Project skeleton generated by Crawlee (Playwright template). ## Usage -First, install the dependencies: +To get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. You can install it with the following command: + +```sh +pip install poetry +``` + +Next, install the project dependencies: + ```sh poetry install ``` -Then you can launch the crawler: +Finally, launch the crawler with: + ```sh poetry run python -m {{cookiecutter.project_name}} ``` From 5bfe836216694a8eee5ff631707b2fe4aa28341c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 12:32:22 +0200 Subject: [PATCH 3/9] update versions to 0.1.0 --- pyproject.toml | 2 +- src/crawlee/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6c35d9dbc6..960d31e85e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "crawlee" -version = "0.0.8" +version = "0.1.0" description = "Crawlee for Python" authors = ["Apify Technologies s.r.o. 
"] license = "Apache-2.0" diff --git a/src/crawlee/__init__.py b/src/crawlee/__init__.py index f512926fd4..785cebb3e0 100644 --- a/src/crawlee/__init__.py +++ b/src/crawlee/__init__.py @@ -1,3 +1,3 @@ from ._utils.globs import Glob -__version__ = '0.0.8' +__version__ = '0.1.0' From 060cb8786e87971b8eef99a2d1d7e31353e598ff Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 13:31:53 +0200 Subject: [PATCH 4/9] change pr template --- .github/pull_request_template.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 30d27df2a9..9fac14a7ad 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,11 +4,11 @@ - TODO -### Related issues +### Issues -- TODO +- Closes: #TODO ### Testing From 8a91174b0f8779e58e2ec231aaf69850ab750f9f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 13:32:09 +0200 Subject: [PATCH 5/9] templates work --- src/crawlee/cli.py | 159 +++++++++--------- .../{{cookiecutter.project_name}}/routes.py | 1 + .../{{cookiecutter.project_name}}/routes.py | 1 + 3 files changed, 85 insertions(+), 76 deletions(-) diff --git a/src/crawlee/cli.py b/src/crawlee/cli.py index 28a6fed409..09aa7afe4e 100644 --- a/src/crawlee/cli.py +++ b/src/crawlee/cli.py @@ -1,8 +1,8 @@ -# ruff: noqa: FA100 ASYNC210 ASYNC100 -import asyncio -from functools import wraps +# ruff: noqa: TRY301, FBT002, UP007 +from __future__ import annotations + from pathlib import Path -from typing import Annotated, Any, Callable, Coroutine, List, Union +from typing import Annotated, Union import httpx import inquirer # type: ignore @@ -12,23 +12,12 @@ TEMPLATE_LIST_URL = 'https://api.github.com/repos/apify/crawlee-python/contents/templates' - -def run_async(func: Callable[..., Coroutine]) -> Callable: - """Decorates a coroutine function so that it is ran with `asyncio.run`.""" - - @wraps(func) - def wrapper(*args: Any, **kwargs: Any) -> None: - asyncio.run(func(*args, **kwargs)) - - return wrapper - - cli = typer.Typer(no_args_is_help=True) @cli.callback(invoke_without_command=True) def callback( - version: Annotated[ # noqa: FBT002 + version: Annotated[ bool, typer.Option( '-V', @@ -38,7 +27,7 @@ def callback( ), ] = False, ) -> None: - """Implements the 'no command' behavior.""" + """Crawlee is a web scraping and browser automation library.""" if version: from crawlee import __version__ @@ -46,77 +35,95 @@ def callback( @cli.command() -@run_async -async def create( +def create( project_name: Annotated[ - Union[str, None], + Union[str | None], typer.Argument( help='The name of the project and the directory that will be created to contain it. ' 'If none is given, you will be prompted.' ), ] = None, template: Annotated[ - Union[str, None], + Union[str | None], typer.Option(help='The template to be used to create the project. If none is given, you will be prompted.'), ] = None, ) -> None: """Bootstrap a new Crawlee project.""" - if template is None: - templates_response = httpx.get(TEMPLATE_LIST_URL, timeout=httpx.Timeout(10)) - template_choices: List[str] = [item['name'] for item in templates_response.json() if item['type'] == 'dir'] - else: - template_choices = [] - - while project_name is None: - answers = ( - inquirer.prompt( - [ - inquirer.Text( - 'project_name', - message='Name of the new project folder', - validate=lambda _, it: len(it) > 0, - ignore=project_name is not None, - ), - ] + try: + # Update template choices if template is not provided. 
if template is None:
+            templates_response = httpx.get(TEMPLATE_LIST_URL, timeout=httpx.Timeout(10))
+            template_choices: list[str] = [item['name'] for item in templates_response.json() if item['type'] == 'dir']
+        else:
+            template_choices = []
+
+        # Get project name.
+        if project_name is None:
+            answers = (
+                inquirer.prompt(
+                    [
+                        inquirer.Text(
+                            name='project_name',
+                            message='Name of the new project folder',
+                            validate=lambda _, it: len(it) > 0,
+                            ignore=project_name is not None,
+                        ),
+                    ]
                 )
                 or {}
             )
 
-        project_path = Path.cwd() / answers['project_name']
+            project_name = answers.get('project_name')
+
+            if project_name is None:
+                typer.echo('Project name is required.', err=True)
+                raise typer.Exit
+
+        project_path = Path.cwd() / project_name
 
         if project_path.exists():
-            typer.echo(f'Folder {project_path} exists', err=True)
-        else:
-            project_name = answers['project_name']
-
-    answers = (
-        inquirer.prompt(
-            [
-                inquirer.List(
-                    'template',
-                    message='Please select the template for your new Crawlee project',
-                    choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
-                    ignore=template is not None,
-                ),
-            ]
-        )
-        or {}
-    )
-
-    template = template or answers['template']
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn('[progress.description]{task.description}'),
-        transient=True,
-    ) as progress:
-        progress.add_task(description='Bootstrapping...', total=None)
-        cookiecutter(
-            'gh:apify/crawlee-python',
-            directory=f'templates/{template}',
-            no_input=True,
-            extra_context={'project_name': project_name},
-        )
-
-    typer.echo(f'Your project was created in {Path.cwd() / project_name}')
-    typer.echo(f'To run your project, run `cd {project_name}`, `poetry install` and `python -m {project_name}`')
+            typer.echo(f'Folder {project_path} exists, please choose another name.', err=True)
+            raise typer.Exit
+
+        # Get template choice.
+
+        if template is None:
+            answers = (
+                inquirer.prompt(
+                    [
+                        inquirer.List(
+                            name='template',
+                            message='Please select the template for your new Crawlee project',
+                            choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                            ignore=template is not None,
+                        ),
+                    ]
+                )
+                or {}
+            )
+
+            template = answers.get('template')
+
+        # Start the bootstrap process.
+ with Progress( + SpinnerColumn(), + TextColumn('[progress.description]{task.description}'), + transient=True, + ) as progress: + progress.add_task(description='Bootstrapping...', total=None) + cookiecutter( + template='gh:apify/crawlee-python', + directory=f'templates/{template}', + no_input=True, + extra_context={'project_name': project_name}, + ) + + typer.echo(f'Your project was created in {project_path}.') + typer.echo(f'See the created `{project_name}/README.md` file for more information.') + + except KeyboardInterrupt: + typer.echo('Operation cancelled by user.') + + +if __name__ == '__main__': + cli() diff --git a/templates/beautifulsoup/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py b/templates/beautifulsoup/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py index 1f68544f8b..a096131770 100644 --- a/templates/beautifulsoup/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py +++ b/templates/beautifulsoup/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py @@ -6,6 +6,7 @@ @router.default_handler async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + """Default request handler.""" title = context.soup.find('title') await context.push_data( { diff --git a/templates/playwright/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py b/templates/playwright/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py index 985a3753c6..37273ef30d 100644 --- a/templates/playwright/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py +++ b/templates/playwright/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/routes.py @@ -6,6 +6,7 @@ @router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: + """Default request handler.""" title = await context.page.query_selector('title') await context.push_data( { From ecafa4e4b67b2c7bb170629a244f713ec39d8799 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 14:00:42 +0200 Subject: [PATCH 6/9] update templates & CLI --- src/crawlee/cli.py | 72 ++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/src/crawlee/cli.py b/src/crawlee/cli.py index 09aa7afe4e..36a193d214 100644 --- a/src/crawlee/cli.py +++ b/src/crawlee/cli.py @@ -50,14 +50,7 @@ def create( ) -> None: """Bootstrap a new Crawlee project.""" try: - # Update template choices if template is not provided. - if template is None: - templates_response = httpx.get(TEMPLATE_LIST_URL, timeout=httpx.Timeout(10)) - template_choices: list[str] = [item['name'] for item in templates_response.json() if item['type'] == 'dir'] - else: - template_choices = [] - - # Get project name. + # Prompt for project name if not provided. if project_name is None: answers = ( inquirer.prompt( @@ -65,8 +58,7 @@ def create( inquirer.Text( name='project_name', message='Name of the new project folder', - validate=lambda _, it: len(it) > 0, - ignore=project_name is not None, + validate=lambda _, value: bool(value.strip()), ), ] ) @@ -75,18 +67,25 @@ def create( project_name = answers.get('project_name') - if project_name is None: + if not project_name: typer.echo('Project name is required.', err=True) - raise typer.Exit + raise typer.Exit(1) project_path = Path.cwd() / project_name if project_path.exists(): - typer.echo(f'Folder {project_path} exists, please choose another name.', err=True) - raise typer.Exit + typer.echo(f'Folder {project_path} already exists. 
Please choose another name.', err=True) + raise typer.Exit(1) - # Get teamplate choice. + template_choices: list[str] = [] + # Fetch available templates if a template is not provided. + if template is None: + response = httpx.get(TEMPLATE_LIST_URL, timeout=httpx.Timeout(10)) + response.raise_for_status() + template_choices = [item['name'] for item in response.json() if item['type'] == 'dir'] + + # Prompt for template choice if not provided. if template is None: answers = ( inquirer.prompt( @@ -101,29 +100,32 @@ def create( ) or {} ) - template = answers.get('template') - # Start the bootstrap process. - with Progress( - SpinnerColumn(), - TextColumn('[progress.description]{task.description}'), - transient=True, - ) as progress: - progress.add_task(description='Bootstrapping...', total=None) - cookiecutter( - template='gh:apify/crawlee-python', - directory=f'templates/{template}', - no_input=True, - extra_context={'project_name': project_name}, - ) + if project_name and template: + # Start the bootstrap process. + with Progress( + SpinnerColumn(), + TextColumn('[progress.description]{task.description}'), + transient=True, + ) as progress: + progress.add_task(description='Bootstrapping...', total=None) + cookiecutter( + template='gh:apify/crawlee-python', + directory=f'templates/{template}', + no_input=True, + extra_context={'project_name': project_name}, + ) - typer.echo(f'Your project was created in {project_path}.') - typer.echo(f'See the created `{project_name}/README.md` file for more information.') + typer.echo(f'Your project "{project_name}" was created.') + typer.echo( + f'To run it, navigate to the directory: "cd {project_name}", ' + 'install dependencies with "poetry install", ' + f'and run it using "poetry run python -m {project_name}".' + ) + typer.echo(f'See the "{project_name}/README.md" for more information.') + except httpx.HTTPStatusError as exc: + typer.echo(f'Failed to fetch templates: {exc}.', err=True) except KeyboardInterrupt: typer.echo('Operation cancelled by user.') - - -if __name__ == '__main__': - cli() From fd3bc9877c4f22695909b8e8aa578e9e2163f802 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 14:07:36 +0200 Subject: [PATCH 7/9] changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index da2061efce..821e579751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ - new project bootstrapping via `pipx run crawlee create` +### Fixes + +- improve error handling in project bootstrapping + ## [0.0.7](../../releases/tag/v0.0.7) - 2024-06-27 ### Fixes From c5e22a32ea9282eef65a31c53f979a128499e037 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 14:08:54 +0200 Subject: [PATCH 8/9] apify local storage dir --- src/crawlee/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index 2a715695af..c81b744bcf 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -170,7 +170,7 @@ class Configuration(BaseSettings): str, Field( validation_alias=AliasChoices( - 'apify_storage_dir', + 'apify_local_storage_dir', 'crawlee_storage_dir', ), ), From c3a25aab41b5e47ae5ec31fcd280a2dbd6d2d795 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jul 2024 14:47:04 +0200 Subject: [PATCH 9/9] add comment --- tests/unit/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ad84a81054..df7cde1ebe 100644 --- 
a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,3 +1,6 @@ +# TODO: type ignores and crawlee_storage_dir +# https://github.com/apify/crawlee-py/issues/146 + from __future__ import annotations import os
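Taken together, the series renames the setting from `local_storage_dir` to `storage_dir`: the directory is now resolved from the `CRAWLEE_STORAGE_DIR` environment variable (with `apify_local_storage_dir` restored as a compatibility alias in PATCH 8/9), from the `crawlee_storage_dir` constructor alias, or from the `./storage` default. The sketch below mirrors the precedence asserted by the unit tests in PATCH 1/9; the import paths are inferred from the module layout in the diffs, and `./my_storage` is an arbitrary example value.

```python
from crawlee.configuration import Configuration
from crawlee.memory_storage_client import MemoryStorageClient

# Assuming CRAWLEE_STORAGE_DIR is not set in the environment,
# the './storage' default from Configuration applies.
client = MemoryStorageClient()
assert client.storage_dir == './storage'

# An explicit constructor value takes precedence over both the
# environment variable and the default, as the tests above assert.
config = Configuration(
    crawlee_storage_dir='./my_storage',  # type: ignore  # set via its validation alias
)
assert MemoryStorageClient(config).storage_dir == './my_storage'
```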