diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..d8aa6b9 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,6 @@ +comment: + layout: "header, diff, tree" + +coverage: + status: + project: false diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..545885b --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,23 @@ +name: Publish +on: + push: + tags: + - 'v[0-9]+.[0-9]+.[0-9]+' +jobs: + publish: + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/${{ github.event.repository.name }} + permissions: + id-token: write + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.13 + - run: | + python -m pip install --upgrade build + python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..b3a17c0 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,49 @@ +name: test +on: + push: + branches: [ main ] + pull_request: +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - python-version: '3.13' + toxenv: pre-commit + no-coverage: true + - python-version: '3.13' + toxenv: docs + no-coverage: true + - python-version: '3.13' + toxenv: mypy + no-coverage: true + - python-version: '3.13' + toxenv: twine + no-coverage: true + - python-version: '3.9' + toxenv: min + - python-version: '3.9' + - python-version: '3.10' + - python-version: '3.11' + - python-version: '3.12' + - python-version: '3.13' + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + - name: tox + run: | + tox -e ${{ matrix.toxenv || 'py' }} + - name: coverage + if: ${{ success() && !matrix.no-coverage }} + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b98ad68 --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +.coverage +.mypy_cache/ +.tox/ +dist/ +htmlcov/ +coverage.xml +docs/_build +*.egg-info/ +__pycache__/ +coverage-html/ +build/ +.idea/ +venv/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..4134e6d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,13 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.10 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.19.1 + hooks: + - id: blacken-docs + additional_dependencies: + - black==25.1.0 diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..45e6290 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,12 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py +build: + os: ubuntu-24.04 + tools: + python: "3.13" # Keep in sync with .github/workflows/test.yml +python: + install: + - requirements: docs/requirements.txt + - path: . diff --git a/CHANGES.rst b/CHANGES.rst new file mode 100644 index 0000000..4725838 --- /dev/null +++ b/CHANGES.rst @@ -0,0 +1,8 @@ +======= +Changes +======= + +0.0.1 (unreleased) +================== + +Initial version. 
diff --git a/LICENSE b/LICENSE index ede93ba..6817eb3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,28 +1,27 @@ -BSD 3-Clause License +Copyright (c) Zyte Group Ltd +All rights reserved. -Copyright (c) 2025, Scrapy Plugins +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. + 3. Neither the name of Zyte nor the names of its contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..5d92fd2 --- /dev/null +++ b/README.rst @@ -0,0 +1,33 @@ +================= +scrapy-crawl-maps +================= + +.. image:: https://img.shields.io/pypi/v/scrapy-crawl-maps.svg + :target: https://pypi.python.org/pypi/scrapy-crawl-maps + :alt: PyPI Version + +.. 
image:: https://img.shields.io/pypi/pyversions/scrapy-crawl-maps.svg + :target: https://pypi.python.org/pypi/scrapy-crawl-maps + :alt: Supported Python Versions + +.. image:: https://github.com/scrapy-plugins/scrapy-crawl-maps/actions/workflows/test.yml/badge.svg + :target: https://github.com/scrapy-plugins/scrapy-crawl-maps/actions/workflows/test.yml + :alt: Automated tests + +.. image:: https://codecov.io/github/scrapy-plugins/scrapy-crawl-maps/coverage.svg?branch=main + :target: https://codecov.io/gh/scrapy-plugins/scrapy-crawl-maps + :alt: Coverage report + +.. description-start + +**scrapy-crawl-maps** is a Scrapy_ plugin that allows defining the logic of a +spider using a `directed graph`_ defined in JSON_ format. + +.. _directed graph: https://en.wikipedia.org/wiki/Directed_graph +.. _JSON: https://www.json.org/json-en.html +.. _Scrapy: https://scrapy.org/ + +.. description-end + +* Documentation: https://scrapy-crawl-maps.readthedocs.io/en/latest/ +* License: BSD 3-clause diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py new file mode 100644 index 0000000..e0ea404 --- /dev/null +++ b/docs/_ext/__init__.py @@ -0,0 +1,25 @@ +def setup(app): + # https://stackoverflow.com/a/13663325 + # + # Copied from Scrapy: + # https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6 + app.add_crossref_type( + directivename="setting", + rolename="setting", + indextemplate="pair: %s; setting", + ) + app.add_crossref_type( + directivename="signal", + rolename="signal", + indextemplate="pair: %s; signal", + ) + app.add_crossref_type( + directivename="command", + rolename="command", + indextemplate="pair: %s; command", + ) + app.add_crossref_type( + directivename="reqmeta", + rolename="reqmeta", + indextemplate="pair: %s; reqmeta", + ) diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..34325c6 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,106 @@ +.. _reference: + +========== +Python API +========== + +Nodes +===== + +.. _builtin-node-types: + +Built-in node types +------------------- + +.. autoclass:: scrapy_crawl_maps.FetchNode() + :show-inheritance: + :members: type, spec + +.. autoclass:: scrapy_crawl_maps.ItemFollowNode() + :show-inheritance: + :members: type, spec + +.. autopydantic_model:: scrapy_crawl_maps.ItemFollowNodeParams + :inherited-members: BaseModel + +.. autoclass:: scrapy_crawl_maps.ItemsNode() + :show-inheritance: + :members: type, spec + +.. autopydantic_model:: scrapy_crawl_maps.ItemsNodeParams + :inherited-members: BaseModel + +.. autoclass:: scrapy_crawl_maps.SelectorParserNode() + :show-inheritance: + :members: type, spec + +.. 
autopydantic_model:: scrapy_crawl_maps.SelectorParserNodeParams + :inherited-members: BaseModel + +.. autoclass:: scrapy_crawl_maps.UrlsNode() + :show-inheritance: + :members: type, spec + +.. autopydantic_model:: scrapy_crawl_maps.UrlsNodeParams + :inherited-members: BaseModel + +.. autoclass:: scrapy_crawl_maps.UrlsFileNode() + :show-inheritance: + :members: type, spec + +.. autopydantic_model:: scrapy_crawl_maps.UrlsFileNodeParams + :inherited-members: BaseModel + + +Node base classes +----------------- + +.. autoclass:: scrapy_crawl_maps.ProcessorNode + :members: type, spec, process, process_request + +.. autoclass:: scrapy_crawl_maps.SpiderNode + :members: type, spec, process_input, process_output + +.. autoclass:: scrapy_crawl_maps.NodeArgs() + :members: args + +.. _port-types: + +Port types +========== + +… + + +.. _spiders: + +Spiders +======= + +Crawl map spider +---------------- + +.. autoclass:: scrapy_crawl_maps.CrawlMapSpider() + :show-inheritance: + +.. autopydantic_model:: scrapy_crawl_maps.CrawlMapSpiderParams + :inherited-members: BaseModel + +.. autoclass:: scrapy_crawl_maps.CrawlMapSpiderCrawlMap + :show-inheritance: + + +Base spider +----------- + +.. autoclass:: scrapy_crawl_maps.CrawlMapBaseSpider() + + +Crawl map +========= + +.. autoclass:: scrapy_crawl_maps.CrawlMap() + :members: + +.. autoclass:: scrapy_crawl_maps.ResponseData + :members: diff --git a/docs/changes.rst b/docs/changes.rst new file mode 100644 index 0000000..d9e113e --- /dev/null +++ b/docs/changes.rst @@ -0,0 +1 @@ +.. include:: ../CHANGES.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..bbe4eb2 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,61 @@ +import sys +from pathlib import Path + +project = "scrapy-crawl-maps" +copyright = "2025, Zyte Group Ltd" +author = "Zyte Group Ltd" +release = "0.0.0" + +sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext +extensions = [ + "_ext", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "sphinx_design", + "sphinxcontrib.autodoc_pydantic", +] + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +html_theme = "sphinx_rtd_theme" + +intersphinx_mapping = { + "python": ( + "https://docs.python.org/3", + None, + ), + "scrapy": ( + "https://docs.scrapy.org/en/latest", + None, + ), + "scrapy-poet": ( + "https://scrapy-poet.readthedocs.io/en/stable", + None, + ), + "scrapy-spider-metadata": ( + "https://scrapy-spider-metadata.readthedocs.io/en/latest", + None, + ), + "web-poet": ( + "https://web-poet.readthedocs.io/en/stable", + None, + ), + "zyte": ( + "https://docs.zyte.com", + None, + ), +} + +autodoc_default_options = { + "member-order": "bysource", +} + +autodoc_pydantic_model_member_order = "bysource" +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_model_show_field_summary = False +autodoc_pydantic_model_show_json = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_field_list_validators = False +autodoc_pydantic_field_show_constraints = False diff --git a/docs/customize.rst b/docs/customize.rst new file mode 100644 index 0000000..ef75828 --- /dev/null +++ b/docs/customize.rst @@ -0,0 +1,126 @@ +.. _customize: + +============= +Customization +============= + +.. currentmodule:: scrapy_crawl_maps + +scrapy-crawl-maps allows defining :ref:`custom node types ` and +:ref:`custom spiders `. + +.. 
_custom-node:
+
+Creating a custom node type
+===========================
+
+To create a custom node type:
+
+#. :ref:`Subclass an appropriate node base class `.
+
+#. Set in :attr:`~ProcessorNode.type` the ID of your node type in :ref:`crawl
+   maps `.
+
+#. Set in :attr:`~ProcessorNode.spec` your :ref:`input and output ports
+   ` and :ref:`parameters `.
+
+#. Implement the abstract methods of your base class.
+
+For example, this node type has no inputs and outputs a request to
+https://toscrape.com:
+
+.. code-block:: python
+
+    from scrapy import Request
+    from scrapy_crawl_maps import ProcessorNode
+
+
+    class MyNode(ProcessorNode):
+        type = "my-node"
+        spec = {
+            "outputs": {
+                "main": {
+                    "type": "request",
+                },
+            },
+        }
+
+        async def process(self, *, inputs, outputs, response_data):
+            await outputs["main"].put(Request("https://toscrape.com"))
+
+To use your node, define a :class:`CrawlMap` subclass that includes it in
+:attr:`~CrawlMap.node_types`, and use that subclass as the type of the
+crawl map parameter of your spider. For example:
+
+.. code-block:: python
+
+    from pydantic import BaseModel, Field
+    from scrapy_crawl_maps import (
+        CrawlMapSpider,
+        CrawlMapSpiderCrawlMap,
+    )
+    from scrapy_spider_metadata import Args
+    from myproject.nodes import MyNode
+
+
+    class MySpiderCrawlMap(CrawlMapSpiderCrawlMap):
+        node_types = CrawlMapSpiderCrawlMap.node_types | {MyNode}
+
+
+    class MySpiderParams(BaseModel):
+        map: MySpiderCrawlMap = Field()
+
+
+    class MySpider(CrawlMapSpider, Args[MySpiderParams]):
+        pass
+
+
+.. _node-base-classes:
+
+Processor nodes and spider nodes
+--------------------------------
+
+To write a custom node type, first you need to determine if you need a
+processor node or a spider node:
+
+- A **processor node** has a single :meth:`~ProcessorNode.process` method
+  that can access inputs, write to outputs, and access the response data when
+  called in the context of a spider callback (i.e. not as part of
+  :meth:`Spider.start() `).
+
+- A **spider node** interfaces with the Scrapy spider, so it must implement
+  two separate methods: :meth:`~SpiderNode.process_input` and
+  :meth:`~SpiderNode.process_output`.
+
+Generally, you should use a processor node if you can, or a spider node if you
+must.
+
+…
+
+
+.. _inputs:
+
+Iterating inputs
+----------------
+
+…
+
+
+.. _outputs:
+
+Enqueuing outputs
+-----------------
+
+…
+
+
+.. _node-params:
+
+Defining node parameters
+------------------------
+
+…
+
+
+.. _custom-spider:
+
+Creating a custom spider
+========================
+
+…
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..0ec2b8e
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,29 @@
+===============================
+scrapy-crawl-maps documentation
+===============================
+
+.. include:: ../README.rst
+    :start-after: description-start
+    :end-before: description-end
+
+.. toctree::
+    :caption: Get started
+    :hidden:
+
+    setup
+    tutorial
+
+.. toctree::
+    :caption: Usage
+    :hidden:
+
+    map
+    customize
+
+.. toctree::
+    :caption: Reference
+    :hidden:
+
+    api
+    specs
+    changes
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..954237b
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+    set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+    echo.
+    echo.The 'sphinx-build' command was not found.
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/map.rst b/docs/map.rst new file mode 100644 index 0000000..651704b --- /dev/null +++ b/docs/map.rst @@ -0,0 +1,125 @@ +.. _map: + +========== +Crawl maps +========== + +.. currentmodule:: scrapy_crawl_maps + +Crawl-map based spiders like :class:`CrawlMapSpider` expose a :ref:`spider +parameter ` that accepts a crawl map. + +A **crawl map** is a JSON_ object that defines the crawling logic of the spider +as a `directed graph`_ with :ref:`nodes ` and :ref:`edges `. + +.. _directed graph: https://en.wikipedia.org/wiki/Directed_graph +.. _JSON: https://www.json.org/json-en.html + +You can write crawl maps by hand, but it is usually easier to use a +:ref:`builder `. + +Continue reading to learn how to build crawl maps. For reference documentation, +see :ref:`crawl-map-spec`. + +.. _node-args: +.. _node-type: +.. _node-types: +.. _nodes: + +Nodes +===== + +The :ref:`"nodes" ` key in a crawl map defines the nodes that make +up the spider logic. + +It is a JSON object where each key is an arbitrary node ID chosen by you, and +each value is a JSON object that defines the :ref:`"type" ` and +possibly :ref:`"args" ` of that node. + +For example, to define the start URLs of your spider, you can define a node of +type :class:`UrlsNode` with the start URLs defined in its ``urls`` parameters: + +.. code-block:: json + + { + "nodes": { + "start_urls": { + "type": "urls", + "args": { + "urls": [ + "https://toscrape.com" + ] + } + } + } + } + +When reading the reference documentation of :ref:`built-in node types +` like :class:`UrlsNode`: + +- You can find their ``type`` ID in their :attr:`~UrlsNode.type` attribute. + For :class:`UrlsNode`, it is ``"urls"``. + +- If a node type supports parameters, its list of base classes will include + :class:`NodeArgs` with some other class between brackets, e.g. + :class:`NodeArgs` [:class:`UrlsNodeParams`]. The class between brackets is + a `pydantic model`_ that defines the node type parameters. + + .. _pydantic model: https://docs.pydantic.dev/latest/concepts/models/ + + +.. _edges: +.. _port: + +Edges +===== + +The ``edges`` key in a crawl map defines connections between nodes. + +Node types may define **input and output ports**. + +Ports have a name and a type. The port name is a string, conventionally +``"main"`` when there is only 1 port. The port type is a string, conventionally +one of: ``"request"``, ``"response"``, ``"item"``, ``"string"``. + +In a crawl map, you can connect an input port to an output port provided they +both have the same type. + +For example, to download your start URLs, you can define a node of type +:class:`FetchNode` and connect the output of your start URLs node to the input +of your fetch node: + +.. 
code-block:: python
+
+    {
+        "nodes": {
+            "start_urls": {"type": "urls", "args": {"urls": ["https://toscrape.com"]}},
+            "fetch": {"type": "fetch"},
+        },
+        "edges": [{"from": ["start_urls", "main"], "to": ["fetch", "main"]}],
+    }
+
+As you can see, the ``"from"`` and ``"to"`` values are JSON arrays of 2 strings:
+a node ID and a port name.
+
+If the source node has a single output port or the target node has a single
+input port, as with both :class:`UrlsNode` and :class:`FetchNode`, you can
+specify just the node ID as a string:
+
+.. code-block:: python
+
+    {"edges": [{"from": "start_urls", "to": "fetch"}]}
+
+
+.. _builders:
+
+Builders
+========
+
+**Crawl map builders** are tools, usually visual, that help you create and edit
+crawl maps.
+
+However, at the moment there is no known crawl map builder. If you find or
+develop one, please `open an issue`_ to get it listed here.
+
+.. _open an issue: https://github.com/scrapy-plugins/scrapy-crawl-maps/issues
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..c1cc245
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,4 @@
+autodoc_pydantic==2.2.0
+Sphinx==8.2.3
+sphinx_design==0.6.1
+sphinx-rtd-theme==3.0.2
diff --git a/docs/setup.rst b/docs/setup.rst
new file mode 100644
index 0000000..b172ade
--- /dev/null
+++ b/docs/setup.rst
@@ -0,0 +1,26 @@
+.. _setup:
+
+=====
+Setup
+=====
+
+..
+    TODO: The following is currently fictional, assuming a future where this
+    code is split off into a separate library.
+
+Install from PyPI_:
+
+.. _PyPI: https://pypi.org/project/scrapy-crawl-maps/
+
+.. code-block:: shell
+
+    pip install scrapy-crawl-maps
+
+And update :setting:`ADDONS`:
+
+.. code-block:: python
+    :caption: settings.py
+
+    ADDONS = {
+        "scrapy_crawl_maps.Addon": 800,
+    }
diff --git a/docs/specs.rst b/docs/specs.rst
new file mode 100644
index 0000000..ecb462e
--- /dev/null
+++ b/docs/specs.rst
@@ -0,0 +1,432 @@
+========
+JSON API
+========
+
+.. _crawl-map-spec:
+
+Crawl map specification
+=======================
+
+A crawl map is a JSON object that supports the following keys:
+
+- .. _nodes-spec:
+
+  ``"nodes"`` (``object``, required), where:
+
+  - .. _node-id:
+
+    Keys (``string``) are node IDs.
+
+    They can be arbitrary strings, provided they are unique within a crawl
+    map, i.e. no 2 nodes can have the same ID.
+
+  - Values (``object``) support the following keys:
+
+    - .. _node-type-key:
+
+      ``"type"`` (``string``, required) indicates the node type, e.g.
+      ``"urls"`` or ``"fetch"``.
+
+    - .. _node-args-key:
+
+      ``"args"`` (``object``, optional) indicates node arguments, where
+      keys are parameter names (``string``) and values are parameter
+      values (any valid JSON structure).
+
+- ``"edges"`` (``array``, optional), with items (``object``) that support the
+  following keys:
+
+  - .. _edge-from:
+
+    ``"from"`` (``string`` or ``array`` of ``string``, required) is one of
+    the following:
+
+    - A node ID (see ``"nodes"``), e.g. ``"fetch-1"``.
+
+      The node type of that node must define 1 (and only 1) :ref:`output
+      port ` matching the type of the :ref:`input port
+      ` defined in :ref:`"to" `.
+
+    - An array of node ID and port ID, e.g. ``["parser-1",
+      "subcategories"]``.
+
+      The node type of that node must define an :ref:`output port
+      ` with the specified ID (e.g.
+      ``"subcategories"``) matching the type of the :ref:`input port
+      ` defined in :ref:`"to" `.
+
+  - ..
_edge-to: + + ``"to"`` (``string`` or ``array`` of ``string``, required) indicates + the target node and port, with the same syntax as :ref:`"from" + `. + +Any node output of ``item`` type not linked to anything is implicitly linked to +the default output. + +Crawl map example +----------------- + +Spider that extracts the book title out of book detail pages from +https://books.toscrape.com: + +.. code-block:: json + + { + "nodes": { + "input": { + "type": "urls", + "args": { + "urls": [ + "https://books.toscrape.com/catalogue/soumission_998/index.html" + ] + } + }, + "fetch": { + "type": "fetch" + }, + "item-parser": { + "type": "selector-parser", + "args": { + "map": { + "title": { + "type": "css", + "value": "h1::text" + } + } + } + } + }, + "edges": [ + {"from": "input", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"} + ] + } + + +.. _crawl-map-schema: +.. _crawl-map-schema-spec: + +Crawl map schema specification +============================== + +Crawl-map spiders expose the supported crawl map schema (i.e. the crawl map +node types that can be used and their supported inputs, outputs and parameters) +through spider metadata, specifically through additional keys in the metadata +JSON object of their crawl map parameter. + +Those additional keys are as follows: + +- .. _node-group-spec: + + ``"node_groups"`` (``object``, optional), where: + + - Keys are node type group IDs (see ``"node_types"."group"`` below). + + Note that node type group IDs may match node type IDs, e.g. the + ``"downloader"`` node type group may contain the ``"downloader"``, + ``"http_downloader"`` and ``"browser_downloader"`` node types. + + - Values (``object``) support the following keys: + + - ``"title"`` (``string``, optional) is the group title. + + If the UI offers a palette of node types containing all node types, it + may optionally allow grouping node types by their group, and use the + title as the group name to be shown in the UI. + + Node types in no group or in an untitled group could be all grouped in + some special “Misc” group. + + - ``"order"`` (``integer``, optional) enables sorting titled groups + in the UI. Lower values come first. + + For example, the ``request-input`` group could be ``1``, the + ``downloader`` group could be ``2``, and the ``parser`` group could + be ``3``. + + Groups without a value are sorted second to last. Untitled groups + are sorted last, even if they have a value. + + 2 or more groups can have the same value, in which case their + relative sorting is unspecified, i.e. up to the UI to decide, + although alphabetically may be a good choice. The UI is also free + to group same-value groups further, but these “groups of groups” + will have no title or any other metadata. + +- ``"node_types"`` (``object``, required), where: + + - Keys are node types (e.g. ``urls`` or ``fetch``). + + - .. _node-spec: + + Values (``object``) are node spec, which support the following keys: + + - ``"group"`` (``string``, optional) is the ID of a node type group. + + It can be an arbitrary string. Metadata about a given group may be + defined in the ``"node_groups"`` object (see above). + + - ``"title"`` (``string``, optional) is the node type title. If + missing, the node type ID should be used as title. + + - ``"description"`` (``string``, optional) is the node type + description, in plain text. + + .. + TODO: Clarify if we support e.g. new lines, and if we also + support some subset of reStructuredText, indicate so as well. + + - .. 
_node-type-inputs: + + ``"inputs"`` (``object``, optional) defines supported inputs with + the same syntax as ``"outputs"`` below. + + - .. _node-type-outputs: + + ``"outputs"`` (``object``, optional) defines supported outputs, + where: + + - Keys are output port IDs (see ``"edges"."from"``), which can be + arbitrary strings. + + - Values (``object``) support the following keys: + + - ``"type"`` (``string``, required). See ``"input"``. + + - ``"title"`` (``string``, optional). The name of the + output to show in the UI. If not defined, the UI should + not label the output. + + - ``"param_schema"`` (``object``, optional) is a parameter + specification defined the same way as scrapy-spider-metadata + parameters. + +Crawl map schema example +------------------------ + +.. code-block:: json + + { + "node_groups": { + "inputs": { + "title": "Inputs", + "order": 0, + }, + "steps": { + "title": "Steps", + "order": 1, + }, + "links": { + "title": "Links", + "order": 2, + }, + "data": { + "title": "Data to Return", + "order": 3, + } + }, + "node_types": { + "urls": { + "group": "inputs", + "title": "Start URLs", + "outputs": { + "main": { + "type": "request" + } + }, + "param_schema": { + "properties": { + "urls": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Initial URLs for the crawl, separated by new lines. Enter the full URL including http(s), you can copy and paste it from your browser. Example: https://toscrape.com/", + "title": "URLs", + "widget": "textarea" + } + }, + "title": "UrlsNodeParams", + "type": "object" + } + }, + "fetch": { + "group": "steps", + "title": "Fetch Pages", + "inputs": { + "main": { + "type": "request" + } + }, + "outputs": { + "main": { + "type": "response" + } + } + }, + "search": { + "group": "steps", + "title": "Search Form Input", + "inputs": { + "request": { + "type": "request" + }, + "query": { + "type": "string" + } + }, + "outputs": { + "main": { + "type": "request" + } + } + }, + "product-links": { + "group": "links", + "title": "Product Links", + "inputs": { + "main": { + "type": "response" + } + }, + "outputs": { + "main": { + "type": "request" + } + } + }, + "subcategory-links": { + "group": "links", + "title": "Subcategory Links", + "inputs": { + "main": { + "type": "response" + } + }, + "outputs": { + "main": { + "type": "request" + } + } + }, + "next-page-links": { + "group": "links", + "title": "Next Page Links", + "inputs": { + "main": { + "type": "response" + } + }, + "outputs": { + "main": { + "type": "request" + } + } + }, + "product": { + "group": "data", + "title": "Product", + "inputs": { + "main": { + "type": "response" + } + }, + "outputs": { + "main": { + "type": "item" + } + } + }, + "product-list": { + "group": "data", + "title": "Product List", + "inputs": { + "main": { + "type": "response" + } + }, + "outputs": { + "main": { + "type": "item" + } + } + }, + "product-navigation": { + "group": "data", + "title": "Product Navigation", + "inputs": { + "main": { + "type": "response" + } + }, + "outputs": { + "main": { + "type": "item" + } + } + } + } + } + + +Configuration page specification +================================ + +Spiders that define a :ref:`crawl map schema ` may also +define additional configuration pages in which to split spider parameters +other than the ``map`` parameter in :ref:`builders `. + +Configuration pages are defined with the ``config_pages`` metadata key of a +spider. 
It is a JSON array where each item is a JSON object supporting the +following keys: + +- ``"id"`` (``string``, required) is an arbitrary string representing the ID + of the configuration page. + + Certain tools may support special behaviors for pages with a given ID. For + example, Scrapy Cloud supports a ``job`` config page where it includes + settings that users might want to customize for specific job runs of a + given (virtual) spider, such as the number of Scrapy Cloud units to use. + +- ``"title"`` (``string``, optional) is the display name of the config page. + If not defined, the ``"id"`` is used instead for display purposes. + +- ``"order"`` (``integer``, optional) enables sorting pages in the UI. Lower + values come first. + + Pages without a value are sorted second to last. Untitled pages + are sorted last, even if they have a value. + + 2 or more pages can have the same value, in which case their relative + sorting is unspecified, i.e. up to the UI to decide, although + alphabetically may be a good choice. + +- ``"params"`` (``array`` of ``string``, optional) contains the names of + spider parameters that should be featured on the page. + + If a parameter in the value does not match any declared parameter of the + spider, the UI should ignore it, and possibly warn the user about it as a + programming error (e.g. a typo could be preventing the intended parameter + to show up in the right configuration page). + +By default, spiders with a crawl map schema have a default configuration page +that is the first page and contains any spider parameter, besides ``map``, that +is not listed in the ``"params"`` array of any other configuration page. The ID +of this configuration page is ``"main"``, and UI tools may give it a default +title, e.g. ``"Configuration"``. You may define a configuration page with ID +``"main"`` to override the position of that configuration page or override its +title. + +UI tools may hide configuration pages without parameters. In cases where 2 or +more pages with the same ID are defined, only the first definition should be +taken into account, and an error may be reported to users about this +programming error. diff --git a/docs/tutorial.rst b/docs/tutorial.rst new file mode 100644 index 0000000..3f5e04f --- /dev/null +++ b/docs/tutorial.rst @@ -0,0 +1,186 @@ +.. _tutorial: + +======== +Tutorial +======== + +.. currentmodule:: scrapy_crawl_maps + +Once you have :ref:`installed and configured scrapy-crawl-maps `, follow +this tutorial to learn how to use it. + +Hello world! +============ + +scrapy-crawl-maps provides a spider, :class:`CrawlMapSpider`, that can follow +a :ref:`crawl map `. + +Write a ``hello.json`` file with the following crawl map, which is the shortest +possible crawl map you can define, where you use a single :class:`ItemsNode` to +hardcode an output item: + +.. code-block:: json + :caption: hello.json + + { + "nodes": { + "items": { + "type": "items", + "args": {"items": [{"hello": "world"}]} + } + } + } + +It yields the following :ref:`item `: + +.. code-block:: json + + {"hello": "world"} + +Run the following command: + +.. code-block:: bash + + scrapy crawl map -a map=hello.json -o items.jsonl + +The command will create an ``items.jsonl`` file with that item. + +Congratulations! You have just run your first crawl map! + + +Parsing +======= + +While simple, the previous crawl map is not very useful. You will now create a +more useful crawl map. 
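+
+First, though, a note on the ``map`` argument used above. Judging by the
+``CrawlMap`` validator in ``_maps.py``, which accepts a Python dict, an inline
+JSON string, or a file path, the same hello-world crawl could likely also be
+run without creating a file at all, for example:
+
+.. code-block:: bash
+
+    # Hypothetical alternative, assuming the -a map value is passed through to
+    # the CrawlMap validator unchanged: provide the crawl map inline as JSON.
+    scrapy crawl map -a map='{"nodes": {"items": {"type": "items", "args": {"items": [{"hello": "world"}]}}}}' -o items.jsonl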
+ +Write a ``parse.json`` file with the following crawl map, which uses +:class:`UrlsNode` to define input URLs, :class:`FetchNode` to fetch them, and +:class:`SelectorParserNode` to output items based on the response data using a +CSS selector: + +.. code-block:: json + :caption: parse.json + + { + "nodes": { + "input": { + "type": "urls", + "args": {"urls": ["https://toscrape.com"]} + }, + "fetch": {"type": "fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + } + } + }, + "edges": [ + {"from": "input", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"} + ] + } + +Notice how, in addition to the nodes, you now define ``"edges"`` that connect +those nodes. + +To run this crawl map, run the following command: + +.. code-block:: bash + + scrapy crawl map -a map=parse.json -o items.jsonl + +``items.jsonl`` will now contain a new item: + +.. code-block:: json + + {"title": "Web Scraping Sandbox"} + + +Crawling +======== + +The previous crawl map handles parsing. Now you will create a crawl map that +also handles crawling, i.e. following parsed URLs. + +Write a ``crawl.json`` file with the following crawl map, which crawls books +from all pages of a category of http://books.toscrape.com/: + +.. code-block:: json + :caption: crawl.json + + { + "nodes": { + "input": { + "type": "urls", + "args": {"urls": ["http://books.toscrape.com/catalogue/category/books/mystery_3/index.html"]}, + }, + "fetch-navigation": {"type": "fetch"}, + "navigation-parser": { + "type": "selector-parser", + "args": { + "map": { + "book_urls": {"type": "css", "value": "section h3 a::attr(href)", "getter": "getall"}, + "next_url": {"type": "css", "value": ".next a::attr(href)"}, + } + }, + }, + "follow-next": { + "type": "item-follow", + "args": { + "url_jmes": "next_url", + }, + }, + "follow-book": { + "type": "item-follow", + "args": { + "url_jmes": "book_urls", + }, + }, + "fetch-book": {"type": "fetch"}, + "book-parser": { + "type": "selector-parser", + "args": { + "map": { + "name": {"type": "css", "value": "h1::text"}, + } + }, + }, + }, + "edges": [ + {"from": "input", "to": "fetch-navigation"}, + {"from": "fetch-navigation", "to": "navigation-parser"}, + {"from": "navigation-parser", "to": "follow-next"}, + {"from": "navigation-parser", "to": "follow-book"}, + {"from": "follow-next", "to": "fetch-navigation"}, + {"from": "follow-book", "to": "fetch-book"}, + {"from": "fetch-book", "to": "book-parser"}, + ], + } + +To run this crawl map, run the following command: + +.. code-block:: bash + + scrapy crawl map -a map=crawl.json -o items.jsonl + +``items.jsonl`` will now contain new items with book titles, e.g.: + +.. code-block:: json + + {"title": "A Light in the Attic"} + + +Next steps +========== + +You are now familiar with the basics of crawl maps. To practice and learn more, +you can: + +- Take an existing spider, and try to convert it into a crawl map and run it + with :class:`CrawlMapSpider`. + +- Learn more about :ref:`writing crawl maps `. + +- Learn to :ref:`write your own node types `. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c08ac6c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,183 @@ +[build-system] +requires = ["hatchling>=1.27.0"] +build-backend = "hatchling.build" + +[project] +name = "scrapy-crawl-maps" +version = "0.0.0" +description = "Scrapy plugin that allows defining the logic of a spider using a directed graph defined in JSON format." 
+dependencies = [ + "itemadapter>=0.8.0", + "jmespath>=0.9.5", + "pydantic>=2.4.0", + "scrapy>=2.13.0", + "scrapy-poet>=0.26.0", + "scrapy-spider-metadata>=0.1.1", +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Framework :: Scrapy", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", +] +license = "BSD-3-Clause" +license-files = ["LICENSE"] +readme = "README.rst" +requires-python = ">=3.9" +authors = [{ name = "Scrapy developers", email = "info@scrapy.org" }] +maintainers = [{ name = "Scrapy developers", email = "info@scrapy.org" }] + +[project.urls] +Documentation = "https://scrapy-crawl-maps.readthedocs.io" +Source = "https://github.com/scrapy-plugins/scrapy-crawl-maps" +Tracker = "https://github.com/scrapy-plugins/scrapy-crawl-maps/issues" +Changelog = "https://github.com/scrapy-plugins/scrapy-crawl-maps/commits/main" +releasenotes = "https://scrapy-crawl-maps.readthedocs.io/en/latest/changes.html" + +[tool.hatch.build.targets.sdist] +include = [ + "/docs", + "/scrapy_crawl_maps", + "/tests", + "/CHANGES.rst", + "/codecov.yml", + "/tox.ini", +] + +[tool.bumpversion] +current_version = "0.0.0" +commit = true +tag = true +tag_name = "{new_version}" + +[[tool.bumpversion.files]] +filename = "docs/conf.py" + +[tool.coverage.run] +branch = true +include = ["scrapy_crawl_maps/*"] +omit = ["tests/*"] +disable_warnings = ["include-ignored"] + +[tool.coverage.paths] +source = [ + "scrapy_crawl_maps", + ".tox/**/site-packages/scrapy_crawl_maps" +] + +[tool.coverage.report] +# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185 +exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"] + +[tool.mypy] +check_untyped_defs = true + +[tool.pytest.ini_options] +xfail_strict = true +addopts = [ + "--reactor=asyncio", +] + +[tool.ruff.lint] +extend-select = [ + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # pydocstyle + "D", + # flake8-future-annotations + "FA", + # flynt + "FLY", + # refurb + "FURB", + # isort + "I", + # flake8-implicit-str-concat + "ISC", + # flake8-logging + "LOG", + # Perflint + "PERF", + # pygrep-hooks + "PGH", + # flake8-pie + "PIE", + # pylint + "PL", + # flake8-pytest-style + "PT", + # flake8-use-pathlib + "PTH", + # flake8-pyi + "PYI", + # flake8-quotes + "Q", + # flake8-return + "RET", + # flake8-raise + "RSE", + # Ruff-specific rules + "RUF", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-type-checking + "TC", + # pyupgrade + "UP", + # pycodestyle warnings + "W", + # flake8-2020 + "YTT", +] +ignore = [ + # Ones we want to ignore + + # Missing docstring in public module + "D100", + # Missing docstring in public package + "D104", + # Missing docstring in magic method + "D105", + # Missing docstring in __init__ + "D107", + # One-line docstring should fit on one line with quotes + "D200", + # No blank lines allowed after function docstring + "D202", + # 1 blank line required between summary line and description + "D205", + # Multi-line docstring closing quotes should be on a separate line + "D209", + # First line should 
end with a period + "D400", + # Use of `assert` detected; needed for mypy + "S101", +] + +[tool.ruff.lint.per-file-ignores] +# Skip docstring and security checks in tests +"tests/**" = ["D", "S"] + +[tool.ruff.lint.flake8-pytest-style] +parametrize-values-type = "tuple" + +[tool.ruff.lint.pydocstyle] +convention = "pep257" diff --git a/scrapy_crawl_maps/__init__.py b/scrapy_crawl_maps/__init__.py new file mode 100644 index 0000000..dc18a8b --- /dev/null +++ b/scrapy_crawl_maps/__init__.py @@ -0,0 +1,49 @@ +from ._addon import Addon +from ._maps import CrawlMap +from ._nodes import ( + FetchNode, + ItemFollowNode, + ItemFollowNodeParams, + ItemsNode, + ItemsNodeParams, + NodeArgs, + ProcessorNode, + SelectorParserNode, + SelectorParserNodeParams, + SpiderNode, + UrlsFileNode, + UrlsFileNodeParams, + UrlsNode, + UrlsNodeParams, +) +from .spiders import ( + CrawlMapBaseSpider, + CrawlMapSpider, + CrawlMapSpiderCrawlMap, + CrawlMapSpiderParams, + ResponseData, +) + +__all__ = [ + "Addon", + "CrawlMap", + "CrawlMapBaseSpider", + "CrawlMapSpider", + "CrawlMapSpiderCrawlMap", + "CrawlMapSpiderParams", + "FetchNode", + "ItemFollowNode", + "ItemFollowNodeParams", + "ItemsNode", + "ItemsNodeParams", + "NodeArgs", + "ProcessorNode", + "ResponseData", + "SelectorParserNode", + "SelectorParserNodeParams", + "SpiderNode", + "UrlsFileNode", + "UrlsFileNodeParams", + "UrlsNode", + "UrlsNodeParams", +] diff --git a/scrapy_crawl_maps/_addon.py b/scrapy_crawl_maps/_addon.py new file mode 100644 index 0000000..8554f03 --- /dev/null +++ b/scrapy_crawl_maps/_addon.py @@ -0,0 +1,7 @@ +from scrapy.settings import BaseSettings + + +class Addon: + @classmethod + def update_pre_crawler_settings(cls, settings: BaseSettings) -> None: + settings.add_to_list("SPIDER_MODULES", "scrapy_crawl_maps.spiders") diff --git a/scrapy_crawl_maps/_maps.py b/scrapy_crawl_maps/_maps.py new file mode 100644 index 0000000..946d3fd --- /dev/null +++ b/scrapy_crawl_maps/_maps.py @@ -0,0 +1,719 @@ +from __future__ import annotations + +import json +from asyncio import FIRST_COMPLETED, Queue, Task, create_task, gather, wait +from collections import defaultdict, deque +from copy import copy +from logging import getLogger +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + NamedTuple, + cast, +) + +from pydantic_core import CoreSchema +from pydantic_core import core_schema as cs +from scrapy import Request +from scrapy.signals import spider_closed +from scrapy.utils.python import global_object_name + +from ._nodes import NodeArgs, ProcessorNode, SpiderNode, _Node + +if TYPE_CHECKING: + from collections.abc import AsyncIterator, Collection + + from pydantic import GetCoreSchemaHandler + from pydantic.annotated_handlers import GetJsonSchemaHandler + from pydantic.json_schema import JsonSchemaValue + from scrapy.crawler import Crawler + + from .spiders import ResponseData + +logger = getLogger(__name__) + +# To be replaced with Queue.shutdown() when requiring Python 3.13+. 
+_QUEUE_SHUTDOWN = object() + + +async def _await_next(iterator: _OutputIterator | _SpiderInputIterator) -> Any: + return await iterator.__anext__() + + +def _next_task(iterator: _OutputIterator | _SpiderInputIterator) -> Task[Any]: + return create_task(_await_next(iterator), name=str(iterator)) + + +async def _merge_iterators( + iterators: Collection[_OutputIterator | _SpiderInputIterator], +) -> AsyncIterator[Any]: + next_tasks = {iterator: _next_task(iterator) for iterator in iterators} + while next_tasks: + done, _ = await wait(next_tasks.values(), return_when=FIRST_COMPLETED) + for task in done: + iterator = next(it for it, t in next_tasks.items() if t == task) + try: + yield task.result(), iterator + except StopAsyncIteration: + del next_tasks[iterator] + except Exception as exception: + del next_tasks[iterator] + logger.error( + f"Error while processing a node: {exception}", exc_info=True + ) + else: + next_tasks[iterator] = _next_task(iterator) + + +class GraphLoc(NamedTuple): + node: ProcessorNode | SpiderNode + port: str + + +class _SpiderInputIterator: + def __init__(self, node: SpiderNode, inputs: dict[str, AsyncIterator]) -> None: + self.node = node + self._iterator = node.process_input(inputs=inputs) + + async def __anext__(self): + return await self._iterator.__anext__() + + def __str__(self) -> str: + return f"{self.node} → spider" + + +class _OutputIterator: + def __init__( + self, + initial: list[Any], + queue: Queue[Any], + source: ProcessorNode | SpiderNode, + target: ProcessorNode | SpiderNode | None, + ): + self._initial = deque(initial) + self._queue = queue + self._source = source + self._target = target + + def __str__(self) -> str: + return f"{self._source} → {self._target}" + + async def __anext__(self) -> Any: + if self._initial: + return self._initial.popleft() + next = await self._queue.get() + self._queue.task_done() + if next is _QUEUE_SHUTDOWN: + raise StopAsyncIteration + return next + + def __aiter__(self): + return self + + +class _OutputReader: + def __init__( + self, + queue: Queue | None = None, + log_output: Callable[[], None] | None = None, + *, + node: ProcessorNode | SpiderNode, + ): + self._input_queue = queue + self._output_queues: list[Queue] = [] + self._read: list[Any] = [] + self._log_output = log_output + self._read_finished = queue is None + self.node = node + + async def read(self): + if self._input_queue is not None: + tasks: list[Task] = [] + assert self._log_output is not None + while True: + item = await self._input_queue.get() + self._input_queue.task_done() + if item is _QUEUE_SHUTDOWN: + self._read_finished = True + break + self._log_output() + self._read.append(item) + tasks.extend( + create_task(queue.put(item)) for queue in self._output_queues + ) + await gather(*tasks) + tasks = [] + for queue in self._output_queues: + tasks.append(create_task(queue.put(_QUEUE_SHUTDOWN))) + await gather(*tasks) + + def get_iterator( + self, node: ProcessorNode | SpiderNode | None = None + ) -> _OutputIterator: + queue: Queue = Queue() + if self._read_finished: + queue.put_nowait(_QUEUE_SHUTDOWN) + else: + self._output_queues.append(queue) + return _OutputIterator(copy(self._read), queue, source=self.node, target=node) + + +async def _iterate_with_node( + iterator: AsyncIterator[Any], node: SpiderNode | None = None +) -> AsyncIterator[Any]: + async for item in iterator: + yield item, node + + +class CrawlMap: + """:ref:`Crawl map ` handler. + + A subclass should be used as type for the crawl map parameter of a + crawl-map spider (e.g. 
:attr:`CrawlMapSpiderParams.map`). + + It defines a supported :ref:`crawl map schema ` through + its :attr:`node_types` and :attr:`node_groups` attributes, allows loading a + compatible crawl map from a JSON object, a Python dict or a file path, and + provides methods (:meth:`load`, :meth:`start`, :meth:`parse`) for + crawl-map spiders like :class:`CrawlMapSpider` to follow the loaded crawl + map. + """ + + #: Groups for :attr:`node_types` defined according to the :ref:`node group + #: spec `. + #: + #: :ref:`Crawl map builders ` can use groups, for example, to + #: organize a “node palette”. + node_groups: ClassVar[dict[str, Any]] + + #: Declares supported :ref:`node types `. + #: + #: See :ref:`custom-node` for an example on how to create a new spider that + #: declares support for additional node types. + node_types: ClassVar[set[type[ProcessorNode | SpiderNode]]] + + # This makes CrawlMap a valid Pydantic type that can take a JSON object in + # 3 different forms: as a Python dict, as a Python string or as a file + # path. + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: Any, handler: GetCoreSchemaHandler + ) -> CoreSchema: + def validate_crawl_map(value: Any, info: cs.ValidationInfo) -> dict[str, Any]: + if isinstance(value, dict): + return value + if not isinstance(value, str): + raise ValueError( + f"Expected a JSON object as a string, as a dictionary or " + f"as a file path, got {value!r}." + ) + if value.lstrip().startswith("{"): + return json.loads(value) + file_path = Path(value) + if file_path.is_file(): + try: + with file_path.open() as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError) as e: + raise ValueError( + f"Could not load or parse JSON from path {value!r}: {e}" + ) from e + else: + raise ValueError(f"{value!r} is not a valid file path.") + + try: + validator_fn = cs.with_info_plain_validator_function + except AttributeError: + validator_fn = cs.general_plain_validator_function + + return cs.no_info_after_validator_function( + cls, validator_fn(validate_crawl_map) + ) + + @classmethod + def __get_pydantic_json_schema__( + cls, core_schema: cs.CoreSchema, handler: GetJsonSchemaHandler + ) -> JsonSchemaValue: + return { + "type": "object", + "properties": { + "nodes": { + "type": "object", + "properties": { + # TODO: Improve this typing based on node_types. 
+ "type": {"type": "string"}, + "args": {"type": "object"}, + }, + "required": ["type"], + }, + "edges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "from": { + "oneOf": [ + {"type": "string"}, + { + "type": "array", + "items": {"type": "string"}, + "minItems": 2, + "maxItems": 2, + }, + ] + }, + "to": { + "oneOf": [ + {"type": "string"}, + { + "type": "array", + "items": {"type": "string"}, + "minItems": 2, + "maxItems": 2, + }, + ], + }, + "required": ["from", "to"], + }, + }, + }, + "required": ["nodes"], + }, + "node_groups": cls.node_groups, + "node_types": cls._get_node_type_metadata(), + } + + @classmethod + def _get_node_type_metadata(cls) -> dict[str, Any]: + result = {} + for node_type in cls.node_types: + node_data = node_type.spec.copy() + for k in ("group", "title", "description"): + if hasattr(node_type, k): + node_data[k] = getattr(node_type, k) + if "group" in node_data and node_data["group"] not in getattr( + cls, "node_groups", {} + ): + node_type_ref = f"{node_type.type!r} ({global_object_name(node_type)})" + logger.warning( + f"Node type {node_type_ref} is in an unknown group: {node_data['group']}" + ) + if issubclass(node_type, NodeArgs): + node_data["param_schema"] = node_type.get_param_schema(normalize=True) + result[node_type.type] = node_data + return result + + def __init__(self, data: dict[str, Any], /): + self._data = data + self._dep_ids: defaultdict[str, set[str]] = defaultdict(set) + self._sources: defaultdict[str, defaultdict[str, set[GraphLoc]]] = defaultdict( + lambda: defaultdict(set) + ) + self._targets: defaultdict[str, defaultdict[str, set[GraphLoc]]] = defaultdict( + lambda: defaultdict(set) + ) + # Tasks to be canceled once the spider is closing. + self._tasks: list[Task] = [] + self._nodes: dict[str, ProcessorNode | SpiderNode] = {} + + def _resolve_ref(self, link: dict[str, Any], direction: str) -> GraphLoc: + ref: str | tuple[str, str] = link[direction] + if isinstance(ref, str): + node_id = ref + node = self._nodes[node_id] + io = "inputs" if direction == "to" else "outputs" + port = next(iter(node.spec.get(io, {}))) + else: + node_id, port = ref + node = self._nodes[node_id] + return GraphLoc(node, port) + + def _load_edges(self): + for edge in self._data.get("edges", []): + source = self._resolve_ref(edge, "from") + target = self._resolve_ref(edge, "to") + + source_type = source.node.spec["outputs"][source.port]["type"] + target_type = target.node.spec["inputs"][target.port]["type"] + if source_type != target_type: + raise ValueError( + f"Type mismatch in link {edge!r}: cannot link {source_type!r} output to {target_type!r} input." + ) + + self._dep_ids[target.node.id].add(source.node.id) + self._sources[target.node.id][target.port].add(source) + self._targets[source.node.id][source.port].add(target) + + def _get_inputless_nodes(self) -> list[ProcessorNode | SpiderNode]: + return [ + node for node in self._nodes.values() if not node.spec.get("inputs", {}) + ] + + def _get_spiderless_node_ids(self) -> set[str]: + node_ids = set(self._nodes) + spider_nodes = [ + node for node in self._nodes.values() if isinstance(node, SpiderNode) + ] + seen_node_ids = {node.id for node in spider_nodes} + pending_nodes: deque[ProcessorNode | SpiderNode] = deque(spider_nodes) + + # Discard all spider nodes, as well as any node that is connected to + # their output. 
+ while pending_nodes: + node = pending_nodes.popleft() + node_ids.discard(node.id) + for port in node.spec["outputs"]: + for target in self._targets[node.id][port]: + if target.node.id not in seen_node_ids: + seen_node_ids.add(target.node.id) + pending_nodes.append(target.node) + + return node_ids + + async def _input_iterator( + self, node, port, spiderfull_output_readers, allow_spiderless=True + ): + iterators = [] + for source in self._sources[node.id][port]: + if allow_spiderless and source.node.id in self._spiderless_node_ids: + reader = self._spiderless_output_readers[source.node.id][source.port] + else: + try: + reader = spiderfull_output_readers[source.node.id][source.port] + except KeyError: + # Empty reader for circular dependency or disallowed spiderless. + reader = _OutputReader(node=source.node) + iterators.append(reader.get_iterator(node)) + async for item, _ in _merge_iterators(iterators): + self._track_input(node, port) + yield item + + def _prepare_outputs( + self, + node: ProcessorNode | SpiderNode, + output_readers: defaultdict[str, dict[str, _OutputReader]], + ): + tasks = [] + outputs: dict[str, Queue] = {port: Queue() for port in node.spec["outputs"]} + for port, queue in outputs.items(): + + def log_output(node=node, port=port): + self._track_output(node, port) + + reader = _OutputReader(queue, log_output, node=node) + output_readers[node.id][port] = reader + tasks.append(create_task(reader.read())) + return outputs, tasks + + def _is_circular_dep(self, node_id: str, dep_id: str) -> bool: + pending_dep_ids = deque([dep_id]) + seen_dep_ids = set() + while pending_dep_ids: + current_dep_id = pending_dep_ids.popleft() + if current_dep_id == node_id: + return True + if current_dep_id not in seen_dep_ids: + seen_dep_ids.add(current_dep_id) + pending_dep_ids.extend(self._dep_ids[current_dep_id]) + return False + + def _resolve_deps(self, start_nodes: list[SpiderNode | ProcessorNode]): + pending_nodes: deque[ProcessorNode | SpiderNode] = deque(start_nodes) + start_node_ids = {node.id for node in start_nodes} + seen_node_ids = start_node_ids.copy() + previous_pending_nodes = None + + while pending_nodes: + if pending_nodes == previous_pending_nodes: + start = ", ".join(sorted(start_node_ids)) + pending = ", ".join(sorted(node.id for node in pending_nodes)) + raise ValueError( + f"While resolving the downstream nodes from ({start}), it " + f"was not possible to resolve the following nodes: " + f"{pending}. Is it possible that you have an unsupported " + f"circular flow?" 
+ ) + previous_pending_nodes = pending_nodes.copy() + + node = pending_nodes.popleft() + if node.id not in start_node_ids: + deps_fullfilled = True + for dep_id in self._dep_ids[node.id]: + if ( + dep_id in seen_node_ids + or dep_id in self._spiderless_node_ids + or self._is_circular_dep(node.id, dep_id) + ): + continue + pending_nodes.append(node) + deps_fullfilled = False + break + if not deps_fullfilled: + continue + for targets in self._targets[node.id].values(): + for target in targets: + if target.node.id not in seen_node_ids: + seen_node_ids.add(target.node.id) + pending_nodes.append(target.node) + yield node + + async def _iter_output( # noqa: PLR0912, PLR0915 + self, response_data: ResponseData | None = None + ) -> AsyncIterator[Any]: + if response_data is not None: + node_id = response_data.meta["_crawl_map_node"] + source_node: SpiderNode | None = cast("SpiderNode", self._nodes[node_id]) + assert source_node is not None + start_nodes: list[ProcessorNode | SpiderNode] = [source_node] + else: + source_node = None + start_nodes = self._get_inputless_nodes() + source_node_is_also_target = False + + tasks = [] + iterators: list[_OutputIterator | _SpiderInputIterator] = [] + spiderful_output_readers: defaultdict[str, dict[str, _OutputReader]] = ( + defaultdict(dict) + ) + pending_nodes: deque[ProcessorNode | SpiderNode] = deque(start_nodes) + seen_node_ids = {node.id for node in pending_nodes} + previous_pending_nodes = None + + while pending_nodes: + if pending_nodes == previous_pending_nodes: + pending = ", ".join(sorted(node.id for node in pending_nodes)) + raise ValueError( + f"Cannot resolve the following crawl map dependencies: " + f"{pending}. Is it possible that you have a circular " + f"dependency?" + ) + previous_pending_nodes = pending_nodes.copy() + + node = pending_nodes.popleft() + if node is not source_node: + deps_fullfilled = True + for dep_id in self._dep_ids[node.id]: + if ( + dep_id in seen_node_ids + or dep_id in self._spiderless_node_ids + or self._is_circular_dep(node.id, dep_id) + ): + continue + pending_nodes.append(node) + deps_fullfilled = False + break + if not deps_fullfilled: + continue + if isinstance(node, SpiderNode) and node is not source_node: + inputs: dict[str, AsyncIterator] = { + port: self._input_iterator( + node, + port, + spiderful_output_readers, + allow_spiderless=source_node is None, + ) + for port in node.spec["inputs"] + } + iterator = _SpiderInputIterator(node, inputs) + iterators.append(iterator) + continue + if node.id in self._spiderless_node_ids: + if node.id not in self._spiderless_outputs: + self._spiderless_outputs[node.id], subtasks = self._prepare_outputs( + node, self._spiderless_output_readers + ) + self._tasks.extend(subtasks) + outputs = self._spiderless_outputs[node.id] + output_readers = self._spiderless_output_readers + else: + outputs, subtasks = self._prepare_outputs( + node, spiderful_output_readers + ) + tasks.extend(subtasks) + output_readers = spiderful_output_readers + for port in outputs: + if self._targets[node.id][port]: + for target in self._targets[node.id][port]: + if target.node.id not in seen_node_ids: + seen_node_ids.add(target.node.id) + pending_nodes.append(target.node) + elif ( + source_node is not None and target.node.id == source_node.id + ): + source_node_is_also_target = True + elif node.spec["outputs"][port]["type"] == "item": + reader = output_readers[node.id][port] + iterators.append(reader.get_iterator()) + if node is source_node: + assert isinstance(node, SpiderNode) + assert response_data is 
not None + node_processor = node.process_output( + response_data=response_data, outputs=outputs + ) + else: + # TODO: Most likely, we need to allow spiderless nodes + # regardless of response_data for inputs from nodes that are + # not processed during the start requests, but only as part of + # non-None response processing. + inputs = { + port: self._input_iterator( + node, + port, + spiderful_output_readers, + allow_spiderless=response_data is None, + ) + for port in node.spec.get("inputs", {}) + } + assert isinstance(node, ProcessorNode) + node_processor = node.process( + inputs=inputs, outputs=outputs, response_data=response_data + ) + + async def process(node_processor, outputs): + try: + await node_processor + except Exception as exception: + logger.error( + f"Error while processing a node: {exception}", exc_info=True + ) + await gather( + *( + create_task(queue.put(_QUEUE_SHUTDOWN)) + for queue in outputs.values() + ) + ) + + tasks.append(create_task(process(node_processor, outputs))) + + if source_node_is_also_target: + assert source_node is not None + inputs = { + port: self._input_iterator( + source_node, port, spiderful_output_readers, allow_spiderless=False + ) + for port in source_node.spec["inputs"] + } + iterator = _SpiderInputIterator(source_node, inputs) + iterators.append(iterator) + + async for item_or_request, iterator in _merge_iterators(iterators): + if isinstance(item_or_request, Request): + if response_data: + for k, v in response_data.meta.items(): + if k.startswith(_Node._persistent_meta_prefix): + item_or_request.meta[k] = v + assert isinstance(iterator, _SpiderInputIterator) + item_or_request.meta["_crawl_map_node"] = iterator.node.id + for dep in self._resolve_deps([iterator.node]): + if not isinstance(dep, SpiderNode): + item_or_request = dep.process_request( # noqa: PLW2901 + item_or_request, response_data=response_data + ) + yield item_or_request + + await gather(*tasks) + + def prepare(self, /, *, crawler: Crawler): + """Prepare the crawl map. + + This method must be called before any call to :meth:`start` or + :meth:`parse`. + + *crawler* is the running crawler. A spider using this crawl map can + read it from :attr:`Spider.crawler ` and pass it + to this method. + """ + node_type_map = {cls.type: cls for cls in self.node_types} + for id, data in self._data["nodes"].items(): + node_type = data["type"] + try: + node_cls = node_type_map[node_type] + except KeyError: + raise ValueError( + f"Unknown node type: {node_type!r}. Supported node types: " + f"{', '.join(sorted(node_type_map))}. See the reference " + f"docs of CrawlMap.node_types for more information." + ) from None + self._nodes[id] = node_cls( + id=id, + map=self, + crawler=crawler, + args=data.get("args", {}), + ) + + self._load_edges() + + # Nodes that are neither spider nodes nor nodes that have spider nodes + # as input, directly or indirectly, i.e. nodes that run independent of + # spider interaction, not conditioned by responses. + self._spiderless_node_ids = self._get_spiderless_node_ids() + self._spiderless_outputs: dict[str, dict[str, Queue]] = {} + self._spiderless_output_readers: defaultdict[str, dict[str, _OutputReader]] = ( + defaultdict(dict) + ) + + crawler.signals.connect(self._close, signal=spider_closed) + self._crawler = crawler + + async def start(self) -> AsyncIterator[Any]: + """Yield :ref:`items ` and :class:`~scrapy.Request` + objects that should be yielded by :meth:`Spider.start() + `.""" + if not self._nodes: + raise ValueError("Crawl map not prepared. 
Call prepare() before start().") + async for item_or_request in self._iter_output(): + yield item_or_request + + async def parse(self, *, response_data: ResponseData) -> AsyncIterator[Any]: + """Yield :ref:`items ` and :class:`~scrapy.Request` + objects that should be yielded by a :attr:`~scrapy.Request.callback`. + + *response_data* usually contains the callback response. For example: + + .. code-block:: python + + async def callback(self, response): + response_data = ResponseData.from_objects(response) + async for item_or_request in self.map.parse(response_data=response_data): + yield item_or_request + + But it can contain additional data. For example, when using + :doc:`scrapy-poet `: + + .. code-block:: python + + async def callback(self, response: Response, deps: DynamicDeps): + deps[response.__class__] = response + response_data = ResponseData.from_objects(*deps.values()) + async for item_or_request in self.map.parse(response_data=response_data): + yield item_or_request + """ + if not self._nodes: + raise ValueError("Crawl map not prepared. Call prepare() before parse().") + async for item_or_request in self._iter_output(response_data): + yield item_or_request + + def _track_io( + self, node: ProcessorNode | SpiderNode, type: str, port: str, count: int = 1 + ) -> None: + assert self._crawler.stats is not None + self._crawler.stats.inc_value( + f"crawl_maps/nodes/{node.id}/{type}/{port}", count + ) + + def _track_input( + self, node: ProcessorNode | SpiderNode, port: str, count: int = 1 + ) -> None: + self._track_io(node, "inputs", port, count) + + def _track_output( + self, node: ProcessorNode | SpiderNode, port: str, count: int = 1 + ) -> None: + self._track_io(node, "outputs", port, count) + + async def _close(self): + for task in self._tasks: + task.cancel("spider_closed") diff --git a/scrapy_crawl_maps/_nodes.py b/scrapy_crawl_maps/_nodes.py new file mode 100644 index 0000000..4acb728 --- /dev/null +++ b/scrapy_crawl_maps/_nodes.py @@ -0,0 +1,692 @@ +from __future__ import annotations + +import re +from abc import abstractmethod +from logging import getLogger +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Generic, + TypeVar, + cast, +) + +import jmespath +from itemadapter import ItemAdapter +from pydantic import BaseModel, Field, ValidationError, field_validator +from scrapy import Request +from scrapy.http.response import Response +from scrapy.utils.defer import deferred_to_future +from scrapy.utils.python import global_object_name + +# TODO: If we are OK with this approach, make an upstream release publishing +# these methods. +from scrapy_spider_metadata._utils import get_generic_param, normalize_param_schema + +if TYPE_CHECKING: + import builtins + from asyncio import Queue + from collections.abc import AsyncIterator + + from scrapy.crawler import Crawler + + from ._maps import CrawlMap + from .spiders import ResponseData + +logger = getLogger(__name__) + +# Validation #----------------------------------------------------------------# + +URL_PATTERN = r"^(?:https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?|data:.*?)$" + + +def validate_url_list(value: list[str] | str) -> list[str]: + """Validate a list of URLs. + + If a string is received as input, it is split into multiple strings + on new lines. + + List items that do not match a URL pattern trigger a warning and are + removed from the list. If all URLs are invalid, validation fails. 
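    For example, given the rules above (an illustrative call, not a doctest):

    .. code-block:: python

        validate_url_list("https://toscrape.com/\nfoo\n")
        # Returns ["https://toscrape.com/"] and logs a warning about "foo".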
+ """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + v = v.strip() # noqa: PLW2901 + if not v: + continue + if not re.search(URL_PATTERN, v): + logger.warning( + f"{v!r}, from the 'urls' spider argument, is not a " + f"valid URL and will be ignored." + ) + continue + result.append(v) + if not result: + raise ValueError(f"No valid URL found in {value!r}") + return result + + +# Base classes #--------------------------------------------------------------# + + +class _Node: + #: :ref:`Node type ` used in :ref:`crawl maps ` to + #: instantiate nodes of this type. + type: ClassVar[str] + + #: Supported :ref:`input and output ports `. + spec: ClassVar[dict[str, Any]] + + _persistent_meta_prefix = "_crawl_map_presistent_" + + def __init__(self, *, id: str, map: CrawlMap, crawler: Crawler, args: Any): + #: ID of this instance of the node type, as :ref:`defined ` in + #: the :ref:`crawl map `. + self.id: str = id + + #: :ref:`Crawl map ` that this node belongs to. + self.map: CrawlMap = map + + #: Running :class:`~scrapy.crawler.Crawler`. + #: + #: Can be useful to access :ref:`settings `. + self.crawler = crawler + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(id={self.id!r})" + + @property + def meta_key(self) -> str: + """Key that can be set in :attr:`Request.meta + ` to store sticky, node-specific metadata. + + The key is guaranteed to be unique for this node instance. It is also + copied from responses to follow-up requests, even in subsets of the + crawl map where this node is not present. + + It can be useful, for example, to limit how many times a request and + its follow-up requests can go through this node: + + .. code-block:: python + + class FollowNextNode(Node): + def process_request(self, request, /, *, response_data=None): + request = super().process_request(request, response_data=response_data) + meta = request.meta.get(self.meta_key, {}) + meta["depth"] = meta.get("depth", -1) + 1 + request.meta[self.meta_key] = meta + + async def process(self, inputs, outputs): + async for response_data in inputs["main"]: + meta = response_data.meta.get(self.meta_key, {}) + if meta["depth"] > 0: + continue + request = Request(response_data.response.css("next::attr(href)").get()) + await outputs["main"].put(request) + """ + return f"{self._persistent_meta_prefix}_{self.id}" + + +class ProcessorNode(_Node): + """Base class for most crawl map nodes. + + Subclasses must define :attr:`type` and :attr:`spec` and implement the + :meth:`process` method. They can also optionally implement the + process_request() method to process requests whose response will affect + them. + """ + + deps: ClassVar[set[type]] = set() + + @abstractmethod + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + """Process *inputs* into *outputs*. + + *inputs* is a dictionary of input port iterators, where keys are input + port IDs and values are asynchronous iterators that yield input data. + + *outputs* is a dictionary of output port queues, where keys are output + port IDs and values are queues where you should put any output data. + + This method may be called multiple times. 
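        For illustration, a hypothetical pass-through node (not one of the
        built-in node types) could implement it like this:

        .. code-block:: python

            class PassThroughNode(ProcessorNode):
                type = "pass-through"
                spec = {
                    "inputs": {"main": {"type": "item"}},
                    "outputs": {"main": {"type": "item"}},
                }

                async def process(self, *, inputs, outputs, response_data):
                    # Forward every input item to the output queue unchanged.
                    async for item in inputs["main"]:
                        await outputs["main"].put(item)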
+ """ + raise NotImplementedError( + f"{global_object_name(self.__class__)} does not implement a " + f"process() method" + ) + + def process_request( + self, request: Request, /, *, response_data: ResponseData | None = None + ) -> Request: + """Process a request whose response will affect this node. + + This method is called for each request yielded by the closest earlier + :class:`SpiderNode` connected to this node. It can be used, for + example, to set metadata on the request. See :meth:`meta_key`. + + *response_data* is the response data that triggered this request, not + the response to *request* itself. It is ``None`` for start requests. + + It must return the original request (e.g. with in-place modifications) + or a new one (e.g. created with :meth:`~scrapy.Request.replace`). + """ + if not self.deps: + return request + deps = request.meta.setdefault("inject", []) + for dep in self.deps: + if dep not in deps: + deps.append(dep) + return request + + +class SpiderNode(_Node): + """Base class for crawl map nodes that can yield items and requests and + process the response to requests that it yields. + + Subclasses must define :attr:`type` and :attr:`spec` and implement the + :meth:`process_input` and :meth:`process_output` methods. + """ + + @abstractmethod + async def process_input( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + ) -> AsyncIterator[Any]: + """Process *inputs* and yield :ref:`items ` and :ref:`requests + `. + + .. + TODO: Switch the reference to "requests" after the release of + Scrapy #6715. + + *inputs* is a dictionary of input port iterators, where keys are input + port IDs and values are asynchronous iterators that yield input data. + + This method may be called multiple times. + """ + raise NotImplementedError( + f"{global_object_name(self.__class__)} does not implement a " + f"process_input() method" + ) + yield + + @abstractmethod + async def process_output( + self, + *, + response_data: ResponseData, + outputs: dict[str, Queue[Any]], + ) -> None: + """Process the *response_data* input *outputs*. + + *response_data* is response data received for a request yielded from + :meth:`process_input`. + + *outputs* is a dictionary of output port queues, where keys are output + port IDs and values are queues where you should put any output data. + + This method is called once per *response_data*. That is usually once per request + yielded from :meth:`process_input`, but it may be fewer times if a + request is dropped (e.g. duplicate requests). + """ + raise NotImplementedError( + f"{global_object_name(self.__class__)} does not implement a " + f"process_output() method" + ) + + +ParamSpecT = TypeVar("ParamSpecT", bound=BaseModel) + + +class NodeArgs(Generic[ParamSpecT]): + """Validates and type-converts :ref:`node arguments ` into the + :attr:`args` instance attribute according to the :ref:`parameter + specification `.""" + + def __init__(self, *pargs: Any, args, **kwargs: Any): + param_model = get_generic_param(self.__class__, NodeArgs) + assert param_model is not None + try: + #: Node arguments, as :ref:`defined ` in the :ref:`crawl + #: map `. + self.args: ParamSpecT = param_model(**args) + except ValidationError as e: + logger.error(f"Node parameter validation failed: {e}") + raise + kwargs["args"] = args + super().__init__(*pargs, **kwargs) + + @classmethod + def get_param_schema(cls, normalize: bool = False) -> dict[Any, Any]: + """Return a :class:`dict` with the :ref:`parameter definition + ` as `JSON Schema`_. + + .. 
_JSON Schema: https://json-schema.org/ + + If *normalize* is ``True``, the returned schema will be the same + regardless of whether you are using Pydantic 1.x or Pydantic 2.x. The + normalized schema may not match the output of any Pydantic version, but + it will be functionally equivalent where possible. + """ + param_model = get_generic_param(cls, NodeArgs) + assert param_model is not None + assert issubclass(param_model, BaseModel) + try: + param_schema = param_model.model_json_schema() + except AttributeError: # pydantic 1.x + param_schema = param_model.schema() + if normalize: + normalize_param_schema(param_schema) + return param_schema + + +# Input nodes #---------------------------------------------------------------# +# +# Nodes without input ports. + + +class ItemsNodeParams(BaseModel): + items: list[dict[str, Any]] = Field( + title="Items", + description=( + "JSON array of JSON objects to output as items.\n" + "\n" + "For example:\n" + "\n" + ".. code-block:: json\n" + "\n" + ' [{"foo": "bar"}]\n' + ), + json_schema_extra={ + # Trigger consistent output when using older Pydantic versions. + "items": {"additionalProperties": True, "type": "object"}, + }, + ) + + +class ItemsNode(NodeArgs[ItemsNodeParams], ProcessorNode): + """Items to output.""" + + type = "items" + spec: ClassVar[dict[str, Any]] = { + "outputs": { + "main": { + "type": "item", + }, + }, + } + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + for item in self.args.items: + await outputs["main"].put(item) + + +class UrlsNodeUrlsParam(BaseModel): + urls: list[str] = Field( + title="URLs", + description=( + "Input URLs.\n" + "\n" + "Define 1 absolute URL (e.g. including the http(s) prefix) per " + "line.\n" + "\n" + "Example: https://toscrape.com/" + ), + ) + + @field_validator("urls", mode="before") + @classmethod + def validate_url_list(cls, value: list[str] | str) -> list[str]: + return validate_url_list(value) + + +class UrlsNodeParams( + UrlsNodeUrlsParam, +): + pass + + +class UrlsNode(NodeArgs[UrlsNodeParams], ProcessorNode): + """URLs to output as requests.""" + + type = "urls" + spec: ClassVar[dict[str, Any]] = { + "outputs": { + "main": { + "type": "request", + }, + }, + } + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + for url in cast("list[str]", self.args.urls): + await outputs["main"].put(Request(url)) + + +class UrlsFileNodeUrlsFileParam(BaseModel): + urls_file: str = Field( + title="URLs file", + description=( + "URL that point to a plain-text file with a list of URLs to " + "crawl, e.g. https://example.com/url-list.txt. The linked file " + "must contain 1 URL per line." 
+ ), + pattern=URL_PATTERN, + ) + + +class UrlsFileNodeParams( + UrlsFileNodeUrlsFileParam, +): + pass + + +class UrlsFileNode(NodeArgs[UrlsFileNodeParams], ProcessorNode): + """URL to a file with URLs to output as requests.""" + + type = "urls_file" + spec: ClassVar[dict[str, Any]] = { + "outputs": { + "main": { + "type": "request", + }, + }, + } + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + request = Request(self.args.urls_file) + assert self.crawler.engine is not None + response = await deferred_to_future(self.crawler.engine.download(request)) + for url in response.text.splitlines(): + if not (url := url.strip()): + continue + if not re.search(URL_PATTERN, url): + logger.error( + f"Ignoring bad URL {url!r} from URLs file " + f"{self.args.urls_file!r} loaded by node {self!r}" + ) + continue + try: + request = Request(url) + except ValueError: + logger.error( + f"Ignoring bad URL {url!r} from URLs file " + f"{self.args.urls_file!r} loaded by node {self!r}", + exc_info=True, + ) + continue + await outputs["main"].put(request) + + +# Spider nodes #--------------------------------------------------------------# +# +# Nodes that subclass SpiderNode. They usually send requests and process their +# responses. + + +class FetchNode(SpiderNode): + """Fetch requests and outputs their responses.""" + + type = "fetch" + spec: ClassVar[dict[str, Any]] = { + "inputs": { + "main": { + "type": "request", + }, + }, + "outputs": { + "main": { + "type": "response", + }, + }, + } + + async def process_input( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + ) -> AsyncIterator[Any]: + async for request in inputs["main"]: + yield request + + async def process_output( + self, + *, + response_data: ResponseData, + outputs: dict[str, Queue[Any]], + ) -> None: + await outputs["main"].put(response_data) + + +# Parser nodes #--------------------------------------------------------------# +# +# Nodes that yield items from responses. + + +class SelectorParserNodeParams(BaseModel): + map: dict[str, dict[str, str]] = Field( + title="Selector map", + description=( + "JSON object mapping field names to JSON objects that define a " + 'selector type and value. For example: {"name": {"type": "css", ' + '"value": ".name::text"}.\n' + "\n" + 'Each field query can also define a "getter" key, which is the ' + 'name of the selector method to use: "get" (default) or "getall".' + ), + ) + + +class SelectorParserNode(NodeArgs[SelectorParserNodeParams], ProcessorNode): + type = "selector-parser" + spec: ClassVar[dict[str, Any]] = { + "inputs": { + "main": { + "type": "response", + }, + }, + "outputs": { + "main": { + "type": "item", + }, + }, + } + deps: ClassVar[set[builtins.type]] = {Response} + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + async for data in inputs["main"]: + assert data is not None + try: + response = data.response + except Exception: # noqa: S112 + continue + item = {} + for field, query in self.args.map.items(): + select_fn = getattr(response, query["type"]) + selector = select_fn(query["value"]) + value = getattr(selector, query.get("getter", "get"))() + item[field] = value + await outputs["main"].put(item) + + +# Follow nodes #--------------------------------------------------------------# +# +# Nodes that yield requests from responses or items. 
+ + +class SelectorFollowNodeParams(BaseModel): + selectors: list[dict[str, str]] = Field( + title="Selector list", + description=( + "JSON array of JSON objects that each define a selector type and " + 'value. For example: [{"type": "css", ' + '"value": "a::attr(href)"}].\n' + "\n" + 'Each field query can also define a "getter" key, which is the ' + 'name of the selector method to use: "getall" (default) or "get".' + ), + ) + + +class SelectorFollowNode(NodeArgs[SelectorFollowNodeParams], ProcessorNode): + type = "selector-follow" + spec: ClassVar[dict[str, Any]] = { + "inputs": { + "main": { + "type": "response", + }, + }, + "outputs": { + "main": { + "type": "request", + }, + }, + } + deps: ClassVar[set[builtins.type]] = {Response} + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + async for data in inputs["main"]: + assert data is not None + try: + response = data.response + except Exception: # noqa: S112 + continue + assert response is not None + for query in self.args.selectors: + select_fn = getattr(response, query["type"]) + selector = select_fn(query["value"]) + urls = getattr(selector, query.get("getter", "getall"))() + if not urls: + continue + if isinstance(urls, str): + urls = [urls] + for url in urls: + request = response.follow(url) + await outputs["main"].put(request) + + +class ItemFollowNodeParams(BaseModel): + url_jmes: str = Field( + title="URL JMESPath query", + description=( + "Determines which URLs to follow.\n" + "\n" + "It is a JMESPath_ query that matches a URL or a list of URLs.\n" + "\n" + ".. _JMESPath: https://jmespath.org/" + ), + ) + max_recursion: int = Field( + title="Maximum recursion", + description=( + "Limits how many times a given chain of requests can go through " + "this node.\n" + "\n" + "-1 means no limit.\n" + "\n" + "It is useful, for example, to limit pagination to the first few " + "pages." 
+ ), + default=-1, + ) + + +class ItemFollowNode(NodeArgs[ItemFollowNodeParams], ProcessorNode): + """Parse URLs from items and yield requests.""" + + type = "item-follow" + spec: ClassVar[dict[str, Any]] = { + "inputs": { + "main": { + "type": "item", + }, + }, + "outputs": { + "main": { + "type": "request", + }, + }, + } + deps: ClassVar[set[builtins.type]] = {Response} + + def process_request( + self, request: Request, /, *, response_data: ResponseData | None = None + ) -> Request: + request = super().process_request(request, response_data=response_data) + if self.args.max_recursion == -1: + return request + meta = request.meta.get(self.meta_key, {}) + meta["recursion"] = meta.get("recursion", 0) + 1 + request.meta[self.meta_key] = meta + return request + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + recursion = ( + response_data.meta.get(self.meta_key, {}).get("recursion", 0) + if response_data + else 0 + ) + async for item in inputs["main"]: + adapter = ItemAdapter(item) + if self.args.max_recursion != -1 and recursion > self.args.max_recursion: + continue + urls = jmespath.search(self.args.url_jmes, adapter) + if urls is None: + continue + if isinstance(urls, str): + urls = [urls] + for url in urls: + if response_data is not None: + assert response_data.response is not None + request = response_data.response.follow(url) + else: + request = Request(url) + await outputs["main"].put(request) diff --git a/scrapy_crawl_maps/spiders.py b/scrapy_crawl_maps/spiders.py new file mode 100644 index 0000000..999023a --- /dev/null +++ b/scrapy_crawl_maps/spiders.py @@ -0,0 +1,197 @@ +from __future__ import annotations + +from collections.abc import ( + AsyncIterator, # noqa: TC003 (needed for dependency injection) +) +from typing import TYPE_CHECKING, Any, ClassVar + +from pydantic import BaseModel, Field +from scrapy import Request, Spider +from scrapy.exceptions import CloseSpider +from scrapy.http.response import Response +from scrapy.utils.python import global_object_name +from scrapy_poet import ( # type: ignore[import-untyped] + DummyResponse, # noqa: TC002 (needed for dependency injection) + DynamicDeps, # noqa: TC002 (needed for dependency injection) +) +from scrapy_spider_metadata import Args +from twisted.python.failure import Failure + +from ._maps import CrawlMap +from ._nodes import ( + FetchNode, + ItemFollowNode, + ItemsNode, + ProcessorNode, + SelectorFollowNode, + SelectorParserNode, + SpiderNode, + UrlsNode, +) + +if TYPE_CHECKING: + from typing_extensions import Self # Python 3.11+ + + +class ResponseData(dict): + """Dictionary of response data where keys are data types (e.g. + :class:`HtmlResponse`) and values are the corresponding objects.""" + + @classmethod + def from_objects(cls, *objects) -> Self: + """Build a :class:`ResponseData` instance from *objects*. + + See :meth:`CrawlMap.parse` for examples. + """ + response_data = cls() + for object in objects: + response_data[object.__class__] = object + return response_data + + @property + def response(self) -> Response | None: + """:class:`Response` object, if any. + + Raises an exception if no :class:`Response` class or subclass is found + but a :exc:`~twisted.python.failure.Failure` class or subclass is found + instead, which may happen e.g. if an unhandled exception is raised + during request processing (see :attr:`~scrapy.Request.errback`). 
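        For example, assuming ``response`` is a Scrapy response object and
        ``failure`` is a Twisted failure (an illustrative sketch):

        .. code-block:: python

            data = ResponseData.from_objects(response)
            assert data.response is response

            failed = ResponseData.from_objects(failure)
            failed.response  # raises failure.value

            assert ResponseData().response is None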
+ """ + for type, value in self.items(): + if issubclass(type, Response): + return value + for type, value in self.items(): + if issubclass(type, Failure): + raise value.value + return None + + @property + def meta(self) -> dict[str, Any]: + """:attr:`~scrapy.Request.meta` from the source request.""" + for type, value in self.items(): + if issubclass(type, (Failure, Response)): + return value.request.meta + raise AssertionError("There should always be a meta attribute") + + +class CrawlMapSpiderCrawlMap(CrawlMap): # noqa: D101 + node_groups: ClassVar[dict[str, Any]] = { + "input": {"title": "Input", "order": 0}, + "fetch": {"title": "Fetch", "order": 1}, + "parse": {"title": "Parse", "order": 2}, + "follow": {"title": "Follow", "order": 3}, + } + node_types: ClassVar[set[type[ProcessorNode | SpiderNode]]] = { + FetchNode, + ItemFollowNode, + ItemsNode, + SelectorFollowNode, + SelectorParserNode, + UrlsNode, + } + + +class CrawlMapSpiderMapParam(BaseModel): # noqa: D101 + map: CrawlMapSpiderCrawlMap = Field( + title="Crawl map", + description="Definition of the steps that the spider must follow.", + ) + + +class CrawlMapSpiderParams( # noqa: D101 + CrawlMapSpiderMapParam, +): + pass + + +class CrawlMapBaseSpider(Spider): + """Base class for spiders that follow a :ref:`crawl map `. + + Subclasses must :ref:`define a spider parameter ` with + :class:`CrawlMap` or a subclass of it as type. + """ + + def _load_crawl_map(self) -> None: + if not isinstance(self, Args): + raise TypeError( + f"{global_object_name(self.__class__)} is a subclass of " + f"{global_object_name(CrawlMapBaseSpider)}, so it must also " + f"subclass scrapy_spider_metadata.Args to specify its spider " + f"parameters, including one with CrawlMap or a subclass as " + f"type." + ) + for v in self.args.model_dump().values(): + if isinstance(v, CrawlMap): + try: + v.prepare(crawler=self.crawler) + except Exception as exception: + self.logger.error( + f"Error while loading the crawl map: {exception}", exc_info=True + ) + raise CloseSpider("bad_crawl_map") from exception + self._crawl_map = v + break + else: + self.logger.error( + f"No crawl-map parameter found in {global_object_name(self.__class__)}" + ) + raise CloseSpider("no_crawl_map_param") + + def _process_item_or_request(self, item_or_request): + if isinstance(item_or_request, Request): + deps = item_or_request.meta.get("inject", []) + for dep in list(deps): + if issubclass(dep, Response): + deps.remove(dep) + callback = self.callback + break + else: + callback = self.callback_for_dummy_response + item_or_request = item_or_request.replace( + callback=callback, + errback=self.errback, + ) + return item_or_request + + async def start(self) -> AsyncIterator[Any]: # noqa: D102 + try: + self._load_crawl_map() + except CloseSpider: + assert self.crawler.engine is not None + self.crawler.engine.close_spider(self, reason="bad_crawl_map") + return + async for item_or_request in self._crawl_map.start(): + yield self._process_item_or_request(item_or_request) + + async def callback( # noqa: D102 + self, response: Response, /, *, deps: DynamicDeps + ) -> AsyncIterator[Any]: + deps[response.__class__] = response + response_data = ResponseData.from_objects(*deps.values()) + async for item_or_request in self._crawl_map.parse(response_data=response_data): + yield self._process_item_or_request(item_or_request) + + async def callback_for_dummy_response( # noqa: D102 + self, response: DummyResponse, /, *, deps: DynamicDeps + ) -> AsyncIterator[Any]: + async for item_or_request in 
self.callback(response, deps=deps): + yield item_or_request + + async def errback(self, failure: Failure, /) -> AsyncIterator[Any]: # noqa: D102 + response_data = ResponseData.from_objects(failure) + async for item_or_request in self._crawl_map.parse(response_data=response_data): + yield self._process_item_or_request(item_or_request) + + +class CrawlMapSpider(Args[CrawlMapSpiderParams], CrawlMapBaseSpider): + """Template spider for spiders that follow a :ref:`crawl map `. + + Supports all :ref:`built-in node types `. + """ + + name: str = "map" + metadata: ClassVar[dict[str, Any]] = { + "title": "Crawl Map Spider", + "description": "Template for spiders that follow a crawl map.", + "template": True, + } diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..b38723b --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +from scrapy_crawl_maps.spiders import CrawlMapSpider + +if TYPE_CHECKING: + from scrapy import Spider + +SETTINGS = {"ADDONS": {"scrapy_poet.Addon": 300}} + + +# scrapy.utils.test.get_crawler alternative that does not freeze settings. +def get_crawler( + *, + settings: dict[str, Any] | None = None, + spider_cls: type[Spider] = CrawlMapSpider, +): + from scrapy.crawler import CrawlerRunner + + settings = settings or SETTINGS + runner = CrawlerRunner(settings) + return runner.create_crawler(spider_cls) + + +def assertEqualsParamOrder(actual, expected): + for k in actual: + if k not in expected: + continue + if k == "param_schema": + assert tuple(actual[k]["properties"]) == tuple(expected[k]["properties"]) + continue + if not isinstance(actual[k], dict) or not isinstance(expected[k], dict): + continue + assertEqualsParamOrder(actual[k], expected[k]) + + +def assertEqualSpiderMetadata(actual, expected): + """Compare 2 JSON schemas of spider metadata. + + The parameter order in the parameter schema is taken into account, given + how it affects the UI, while the order of other object keys may be + different. + + It also generates a better diff in pytest output when enums are involved, + e.g. geolocation values. + """ + assertEqualsParamOrder(actual, expected) + actual_json = json.dumps(actual, indent=2, sort_keys=True) + expected_json = json.dumps(expected, indent=2, sort_keys=True) + assert actual_json == expected_json diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..8d23e26 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,982 @@ +from __future__ import annotations + +import html +import json +from collections.abc import ( + AsyncIterator, # noqa: TC003 (needed for dependency injection) +) +from typing import TYPE_CHECKING, Any, ClassVar +from urllib.parse import urlparse + +import attrs +import pytest +from pydantic import BaseModel +from pydantic.fields import Field +from pytest_twisted import ensureDeferred # type: ignore[import-untyped] +from scrapy import signals +from scrapy_spider_metadata import Args, get_spider_metadata +from web_poet import ItemPage +from web_poet.fields import field +from web_poet.page_inputs.url import ( + RequestUrl, # noqa: TC002 (needed for dependency injection) +) +from web_poet.rules import RulesRegistry + +from scrapy_crawl_maps import ( + CrawlMapSpider, + CrawlMapSpiderCrawlMap, + ResponseData, +) +from scrapy_crawl_maps._nodes import ( + FetchNode, + ItemsNode, + ProcessorNode, + SpiderNode, +) + +from . 
import SETTINGS, assertEqualSpiderMetadata, get_crawler + +if TYPE_CHECKING: + import builtins + from asyncio import Queue + + +def serialize_items(items: list[dict[Any, Any]]) -> set[str]: + return {json.dumps(item, sort_keys=True) for item in items} + + +def assert_same_items(items1: list[dict[Any, Any]], items2: list[dict[Any, Any]]): + assert serialize_items(items1) == serialize_items(items2) + + +@pytest.mark.parametrize( + ("map", "expected"), + ( + # Shortest. + ( + { + "nodes": { + "items": { + "type": "items", + "args": {"items": [{"foo": "bar"}]}, + }, + }, + }, + { + "items": [{"foo": "bar"}], + "crawl_map_stats": { + "items/outputs/main": 1, + }, + }, + ), + # Basic. + ( + { + "nodes": { + "input": { + "type": "urls", + "args": {"urls": ["data:text/html,
<h1>Foo</h1>
"]}, + }, + "fetch": {"type": "fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + }, + "edges": [ + {"from": "input", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + ], + }, + { + "items": [{"title": "Foo"}], + "crawl_map_stats": { + "fetch/inputs/main": 1, + "fetch/outputs/main": 1, + "input/outputs/main": 1, + "item-parser/inputs/main": 1, + "item-parser/outputs/main": 1, + }, + }, + ), + # You can link multiple inputs to a single output. + ( + { + "nodes": { + "input-1": { + "type": "urls", + "args": {"urls": ["data:text/html,
<h1>Foo</h1>
"]}, + }, + "input-2": { + "type": "urls", + "args": {"urls": ["data:text/html,
<h1>Bar</h1>
"]}, + }, + "fetch": {"type": "fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + }, + "edges": [ + {"from": "input-1", "to": "fetch"}, + {"from": "input-2", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + ], + }, + { + "items": [{"title": "Foo"}, {"title": "Bar"}], + "crawl_map_stats": { + "fetch/inputs/main": 2, + "fetch/outputs/main": 2, + "input-1/outputs/main": 1, + "input-2/outputs/main": 1, + "item-parser/inputs/main": 2, + "item-parser/outputs/main": 2, + }, + }, + ), + # You can define a map that performs no download and yields no items, + # however pointless. + ( + { + "nodes": { + "urls": {"type": "urls", "args": {"urls": ["data:,"]}}, + }, + "edges": [], + }, + { + "items": [], + "crawl_map_stats": { + "urls/outputs/main": 1, + }, + }, + ), + # You can use item-follow. + ( + { + "nodes": { + "items": { + "type": "items", + "args": { + "items": [ + { + "items": ["data:text/html,
<h1>Bar</h1>
"], + "nextPage": "data:text/html,
<h1>Foo</h1>
", + } + ] + }, + }, + "follow-items": { + "type": "item-follow", + "args": {"url_jmes": "items"}, + }, + "follow-next": { + "type": "item-follow", + "args": {"url_jmes": "nextPage"}, + }, + "fetch": {"type": "fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + }, + "edges": [ + {"from": "items", "to": "follow-items"}, + {"from": "items", "to": "follow-next"}, + {"from": "follow-items", "to": "fetch"}, + {"from": "follow-next", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + ], + }, + { + "items": [{"title": "Foo"}, {"title": "Bar"}], + "crawl_map_stats": { + "items/outputs/main": 1, + "follow-items/inputs/main": 1, + "follow-items/outputs/main": 1, + "follow-next/inputs/main": 1, + "follow-next/outputs/main": 1, + "fetch/inputs/main": 2, + "fetch/outputs/main": 2, + "item-parser/inputs/main": 2, + "item-parser/outputs/main": 2, + }, + }, + ), + # Defining links between incompatible components causes a ValueError + # and for the spider to close. + ( + { + "nodes": { + "input": {"type": "urls", "args": {"urls": ["data:,"]}}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + }, + "edges": [{"from": "input", "to": "item-parser"}], + }, + { + "finish_reason": "bad_crawl_map", + "items": [], + "crawl_map_stats": {}, + }, + ), + # Support recursivity. + ( + { + "nodes": { + "input": { + "type": "urls", + "args": {"urls": ['data:,']}, + }, + "fetch": {"type": "fetch"}, + "follow": { + "type": "selector-follow", + "args": { + "selectors": [{"type": "css", "value": "a::attr(href)"}] + }, + }, + }, + "edges": [ + {"from": "input", "to": "fetch"}, + {"from": "fetch", "to": "follow"}, + {"from": "follow", "to": "fetch"}, + ], + }, + { + "items": [], + "crawl_map_stats": { + "input/outputs/main": 1, + "fetch/inputs/main": 2, + "fetch/outputs/main": 2, + "follow/inputs/main": 2, + "follow/outputs/main": 1, + }, + }, + ), + # item-follow accepts a recursion limit, implemented with + # ProcessorNode.process_request(). + # + # Without recursion limit: + ( + { + "nodes": { + "items": { + "type": "items", + "args": { + "items": [ + { + "next": 'data:text/html,
<h1>Foo</h1><a href="{data}"></a>
'.format( + data=html.escape("data:text/html,
<h1>Bar</h1>
") + ), + } + ] + }, + }, + "follow-next": { + "type": "item-follow", + "args": {"url_jmes": "next"}, + }, + "fetch": {"type": "fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + "next-parser": { + "type": "selector-parser", + "args": { + "map": {"next": {"type": "css", "value": "a::attr(href)"}} + }, + }, + }, + "edges": [ + {"from": "items", "to": "follow-next"}, + {"from": "follow-next", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + {"from": "fetch", "to": "next-parser"}, + {"from": "next-parser", "to": "follow-next"}, + ], + }, + { + "items": [{"title": "Foo"}, {"title": "Bar"}], + "crawl_map_stats": { + "items/outputs/main": 1, + "follow-next/inputs/main": 3, + "follow-next/outputs/main": 2, + "fetch/inputs/main": 2, + "fetch/outputs/main": 2, + "item-parser/inputs/main": 2, + "item-parser/outputs/main": 2, + "next-parser/inputs/main": 2, + "next-parser/outputs/main": 2, + }, + }, + ), + # With recursion limit: + ( + { + "nodes": { + "items": { + "type": "items", + "args": { + "items": [ + { + "next": 'data:text/html,
<h1>Foo</h1><a href="{data}"></a>
'.format( + data=html.escape("data:text/html,
<h1>Bar</h1>
") + ), + } + ] + }, + }, + "follow-next": { + "type": "item-follow", + "args": {"url_jmes": "next", "max_recursion": 0}, + }, + "fetch": {"type": "fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + "next-parser": { + "type": "selector-parser", + "args": { + "map": {"next": {"type": "css", "value": "a::attr(href)"}} + }, + }, + }, + "edges": [ + {"from": "items", "to": "follow-next"}, + {"from": "follow-next", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + {"from": "fetch", "to": "next-parser"}, + {"from": "next-parser", "to": "follow-next"}, + ], + }, + { + "items": [{"title": "Foo"}], + "crawl_map_stats": { + "items/outputs/main": 1, + "follow-next/inputs/main": 2, + "follow-next/outputs/main": 1, + "fetch/inputs/main": 1, + "fetch/outputs/main": 1, + "item-parser/inputs/main": 1, + "item-parser/outputs/main": 1, + "next-parser/inputs/main": 1, + "next-parser/outputs/main": 1, + }, + }, + ), + # With a high-enough recursion limit: + ( + { + "nodes": { + "items": { + "type": "items", + "args": { + "items": [ + { + "next": 'data:text/html,
<h1>Foo</h1><a href="{data}"></a>
'.format( + data=html.escape("data:text/html,
<h1>Bar</h1>
") + ), + } + ] + }, + }, + "follow-next": { + "type": "item-follow", + "args": {"url_jmes": "next", "max_recursion": 1}, + }, + "fetch": {"type": "fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + "next-parser": { + "type": "selector-parser", + "args": { + "map": {"next": {"type": "css", "value": "a::attr(href)"}} + }, + }, + }, + "edges": [ + {"from": "items", "to": "follow-next"}, + {"from": "follow-next", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + {"from": "fetch", "to": "next-parser"}, + {"from": "next-parser", "to": "follow-next"}, + ], + }, + { + "items": [{"title": "Foo"}, {"title": "Bar"}], + "crawl_map_stats": { + "items/outputs/main": 1, + "follow-next/inputs/main": 3, + "follow-next/outputs/main": 2, + "fetch/inputs/main": 2, + "fetch/outputs/main": 2, + "item-parser/inputs/main": 2, + "item-parser/outputs/main": 2, + "next-parser/inputs/main": 2, + "next-parser/outputs/main": 2, + }, + }, + ), + ), +) +@ensureDeferred +async def test_main(map, expected): + crawler = get_crawler() + items = [] + + def track_item(item, response, spider): + items.append(item) + + crawler.signals.connect(track_item, signal=signals.item_scraped) + await crawler.crawl(map=json.dumps(map)) + + assert_same_items(items, expected["items"]) + stat_prefix = "crawl_maps/nodes/" + assert crawler.stats is not None + stats = crawler.stats.get_stats() + crawl_map_stats = { + k[len(stat_prefix) :]: v for k, v in stats.items() if k.startswith(stat_prefix) + } + assert crawl_map_stats == expected["crawl_map_stats"] + + expected_finish_reason = expected.get("finish_reason", "finished") + assert stats["finish_reason"] == expected_finish_reason + assert "log_count/WARNING" not in stats + if expected_finish_reason == "finished": + assert "log_count/ERROR" not in stats + else: + assert stats["log_count/ERROR"] == 1 + + +@pytest.mark.parametrize( + ("map", "expected"), + ( + # Unknown node type. + ( + { + "nodes": { + "items": { + "type": "unknown", + }, + }, + }, + { + "error": "Unknown node type: 'unknown'", + }, + ), + ), +) +@ensureDeferred +async def test_bad_maps(map, expected, caplog): + crawler = get_crawler() + caplog.clear() + await crawler.crawl(map=json.dumps(map)) + assert expected["error"] in caplog.text + assert crawler.stats is not None + stats = crawler.stats.get_stats() + assert stats["finish_reason"] == "bad_crawl_map" + + +EXCEPTION = ValueError("foo") + + +class BadItemsNode(ItemsNode): + type = "bad-items" + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + raise EXCEPTION + + +class BadInputFetchNode(FetchNode): + type = "bad-input-fetch" + + async def process_input( + self, *, inputs: dict[str, AsyncIterator[Any]] + ) -> AsyncIterator[Any]: + raise EXCEPTION + yield + + +class BadOutputFetchNode(FetchNode): + type = "bad-output-fetch" + + async def process_output( + self, *, response_data: ResponseData, outputs: dict[str, Queue[Any]] + ) -> None: + raise EXCEPTION + + +@pytest.mark.parametrize( + ("extra_node_types", "map", "expected"), + ( + # Node.process() + ( + {BadItemsNode}, + { + "nodes": { + "items": { + "type": "bad-items", + "args": {"items": [{"foo": "bar"}]}, + }, + }, + }, + { + "exception": EXCEPTION, + }, + ), + # SpiderNode.process_input() + ( + {BadInputFetchNode}, + { + "nodes": { + "input": { + "type": "urls", + "args": {"urls": ["data:text/html,
<h1>Foo</h1>
"]}, + }, + "fetch": {"type": "bad-input-fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + }, + "edges": [ + {"from": "input", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + ], + }, + { + "exception": EXCEPTION, + }, + ), + # SpiderNode.process_output() + ( + {BadOutputFetchNode}, + { + "nodes": { + "input": { + "type": "urls", + "args": {"urls": ["data:text/html,
<h1>Foo</h1>
"]}, + }, + "fetch": {"type": "bad-output-fetch"}, + "item-parser": { + "type": "selector-parser", + "args": { + "map": {"title": {"type": "css", "value": "h1::text"}} + }, + }, + }, + "edges": [ + {"from": "input", "to": "fetch"}, + {"from": "fetch", "to": "item-parser"}, + ], + }, + { + "exception": EXCEPTION, + }, + ), + ), +) +@ensureDeferred +async def test_node_exceptions( + extra_node_types: set[type[ProcessorNode | SpiderNode]], map, expected, caplog +): + """Unhandled exceptions in node methods must not make the crawler hang, and + must be reported with a traceback in an error log message.""" + + class TestSpiderCrawlMap(CrawlMapSpiderCrawlMap): + node_types = CrawlMapSpiderCrawlMap.node_types | extra_node_types + + class TestSpiderParams(BaseModel): + map: TestSpiderCrawlMap = Field() + + class TestSpider(CrawlMapSpider, Args[TestSpiderParams]): + pass + + crawler = get_crawler(spider_cls=TestSpider) + caplog.clear() + await crawler.crawl(map=json.dumps(map)) + exception = expected["exception"] + assert f"Error while processing a node: {exception}" in caplog.text # message + assert f"{exception.__class__.__name__}: {exception}" in caplog.text # traceback + + +@ensureDeferred +async def test_poet(): + """Test a crawl map that uses a page object and does not require a response + (i.e. uses DummyResponse).""" + + url = "https://toscrape.com" + registry = RulesRegistry() + + @attrs.define + class Object: + url: str + + @registry.handle_urls(urlparse(url).netloc) + @attrs.define + class ObjectPage(ItemPage[Object]): + request_url: RequestUrl + + @field + def url(self) -> str: + return str(self.request_url) + + class ObjectNode(ProcessorNode): + type = "object" + spec: ClassVar[dict[str, Any]] = { + "inputs": { + "main": { + "type": "response", + }, + }, + "outputs": { + "main": { + "type": "item", + }, + }, + } + deps: ClassVar[set[builtins.type]] = {Object} + + async def process( + self, + *, + inputs: dict[str, AsyncIterator[Any]], + outputs: dict[str, Queue[Any]], + response_data: ResponseData | None, + ) -> None: + async for data in inputs["main"]: + assert data is not None + await outputs["main"].put(data[Object]) + + class TestSpiderCrawlMap(CrawlMapSpiderCrawlMap): + node_types: ClassVar[set[type[ProcessorNode | SpiderNode]]] = ( + CrawlMapSpiderCrawlMap.node_types | {ObjectNode} + ) + + class TestSpiderParams(BaseModel): + map: TestSpiderCrawlMap = Field() + + class TestSpider(CrawlMapSpider, Args[TestSpiderParams]): + pass + + settings = { + **SETTINGS, + "SCRAPY_POET_RULES": registry.get_rules(), + } + map = { + "nodes": { + "input": { + "type": "urls", + "args": {"urls": [url]}, + }, + "fetch": {"type": "fetch"}, + "object": { + "type": "object", + }, + }, + "edges": [ + {"from": "input", "to": "fetch"}, + {"from": "fetch", "to": "object"}, + ], + } + crawler = get_crawler(spider_cls=TestSpider, settings=settings) + items = [] + + def track_item(item, response, spider): + items.append(item) + + crawler.signals.connect(track_item, signal=signals.item_scraped) + await crawler.crawl(map=json.dumps(map)) + + assert len(items) == 1 + assert items[0] == Object(url=url) + + assert crawler.stats is not None + stats = crawler.stats.get_stats() + assert "log_count/ERROR" not in stats + assert "downloader/request_count" not in stats + + +@pytest.mark.parametrize( + ("spider_cls", "metadata"), + ( + ( + CrawlMapSpider, + { + "description": "Template for spiders that follow a crawl map.", + "param_schema": { + "properties": { + "map": { + "description": "Definition of the steps that the 
spider must follow.", + "node_groups": { + "input": {"order": 0, "title": "Input"}, + "fetch": {"order": 1, "title": "Fetch"}, + "parse": {"order": 2, "title": "Parse"}, + "follow": {"order": 3, "title": "Follow"}, + }, + "node_types": { + "fetch": { + "inputs": {"main": {"type": "request"}}, + "outputs": {"main": {"type": "response"}}, + }, + "item-follow": { + "inputs": {"main": {"type": "item"}}, + "outputs": {"main": {"type": "request"}}, + "param_schema": { + "properties": { + "max_recursion": { + "default": -1, + "description": ( + "Limits how many times a given chain of " + "requests can go through this node.\n" + "\n" + "-1 means no limit.\n" + "\n" + "It is useful, for example, to limit " + "pagination to the first few pages." + ), + "title": "Maximum recursion", + "type": "integer", + }, + "url_jmes": { + "description": ( + "Determines which URLs to follow.\n" + "\n" + "It is a JMESPath_ query that matches a " + "URL or a list of URLs.\n" + "\n" + ".. _JMESPath: https://jmespath.org/" + ), + "title": "URL JMESPath query", + "type": "string", + }, + }, + "required": [ + "url_jmes", + ], + "title": "ItemFollowNodeParams", + "type": "object", + }, + }, + "items": { + "outputs": {"main": {"type": "item"}}, + "param_schema": { + "properties": { + "items": { + "description": ( + "JSON array of JSON objects to output as items.\n" + "\n" + "For example:\n" + "\n" + ".. code-block:: json\n" + "\n" + ' [{"foo": "bar"}]\n' + ), + "items": { + "additionalProperties": True, + "type": "object", + }, + "title": "Items", + "type": "array", + } + }, + "required": ["items"], + "title": "ItemsNodeParams", + "type": "object", + }, + }, + "selector-follow": { + "inputs": {"main": {"type": "response"}}, + "outputs": {"main": {"type": "request"}}, + "param_schema": { + "properties": { + "selectors": { + "description": ( + "JSON array of JSON " + "objects that each define " + "a selector type and " + "value. For example: " + '[{"type": "css", ' + '"value": ' + '"a::attr(href)"}].\n' + "\n" + "Each field query can " + 'also define a "getter" ' + "key, which is the name " + "of the selector method " + 'to use: "getall" ' + '(default) or "get".' + ), + "items": { + "additionalProperties": { + "type": "string" + }, + "type": "object", + }, + "title": "Selector list", + "type": "array", + } + }, + "required": ["selectors"], + "title": "SelectorFollowNodeParams", + "type": "object", + }, + }, + "selector-parser": { + "inputs": {"main": {"type": "response"}}, + "outputs": {"main": {"type": "item"}}, + "param_schema": { + "properties": { + "map": { + "additionalProperties": { + "additionalProperties": { + "type": "string" + }, + "type": "object", + }, + "description": ( + "JSON object " + "mapping field " + "names to JSON " + "objects that " + "define a selector " + "type and value. " + "For example: " + '{"name": {"type": "css", ' + '"value": ' + '".name::text"}.\n' + "\n" + "Each field query " + "can also define a " + '"getter" key, ' + "which is the name " + "of the selector " + "method to use: " + '"get" (default) ' + 'or "getall".' + ), + "title": "Selector map", + "type": "object", + } + }, + "required": ["map"], + "title": "SelectorParserNodeParams", + "type": "object", + }, + }, + "urls": { + "outputs": {"main": {"type": "request"}}, + "param_schema": { + "properties": { + "urls": { + "description": ( + "Input URLs.\n" + "\n" + "Define 1 absolute URL " + "(e.g. 
including the " + "http(s) prefix) per " + "line.\n" + "\n" + "Example: " + "https://toscrape.com/" + ), + "items": { + "type": "string", + }, + "title": "URLs", + "type": "array", + }, + }, + "required": ["urls"], + "title": "UrlsNodeParams", + "type": "object", + }, + }, + }, + "properties": { + "edges": { + "items": { + "properties": { + "from": { + "oneOf": [ + {"type": "string"}, + { + "items": {"type": "string"}, + "maxItems": 2, + "minItems": 2, + "type": "array", + }, + ] + }, + "required": ["from", "to"], + "to": { + "oneOf": [ + {"type": "string"}, + { + "items": {"type": "string"}, + "maxItems": 2, + "minItems": 2, + "type": "array", + }, + ] + }, + }, + "type": "object", + }, + "type": "array", + }, + "nodes": { + "properties": { + "args": {"type": "object"}, + "type": {"type": "string"}, + }, + "required": ["type"], + "type": "object", + }, + "required": ["nodes"], + }, + "title": "Crawl map", + "type": "object", + } + }, + "required": ["map"], + "title": "CrawlMapSpiderParams", + "type": "object", + }, + "template": True, + "title": "Crawl Map Spider", + }, + ), + ), +) +@ensureDeferred +async def test_metadata(spider_cls, metadata): + assertEqualSpiderMetadata(get_spider_metadata(spider_cls, normalize=True), metadata) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..c8478d9 --- /dev/null +++ b/tox.ini @@ -0,0 +1,55 @@ +[tox] +envlist = pre-commit,docs,mypy,twine,min,py39,py310,py311,py312,py313 + +[testenv] +deps = + pytest==8.3.5 + pytest-cov==6.1.1 + pytest-twisted==1.14.3 +commands = + pytest \ + {posargs: \ + --cov scrapy_crawl_maps \ + -vv \ + scrapy_crawl_maps \ + tests \ + } + +[testenv:min] +basepython = python3.9 +deps = + {[testenv]deps} + itemadapter==0.8.0 + jmespath==0.9.5 + pydantic==2.4.0 + scrapy==2.13.0 + scrapy-poet==0.26.0 + scrapy-spider-metadata==0.1.1 + +[testenv:mypy] +deps = + {[testenv]deps} + mypy==1.15.0 + types-jmespath==1.0.2.20240106 + typing-extensions==4.13.2 +commands = mypy scrapy_crawl_maps tests + +[testenv:pre-commit] +deps = + pre-commit==4.2.0 +commands = pre-commit run --all-files --show-diff-on-failure + +[testenv:twine] +deps = + twine==6.1.0 + build==1.2.2.post1 +commands = + python -m build --sdist + twine check dist/* + +[testenv:docs] +changedir = docs +deps = + -rdocs/requirements.txt +commands = + sphinx-build -W -b html . {envtmpdir}/html
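A possible end-to-end usage sketch for the package added above (illustrative
only: the map, node IDs, and target URL are examples, and the scrapy-poet
add-on is enabled the same way as in ``tests/__init__.py``):

.. code-block:: python

    import json

    from scrapy.crawler import CrawlerProcess

    from scrapy_crawl_maps import CrawlMapSpider

    crawl_map = {
        "nodes": {
            "input": {"type": "urls", "args": {"urls": ["https://toscrape.com/"]}},
            "fetch": {"type": "fetch"},
            "title": {
                "type": "selector-parser",
                "args": {"map": {"title": {"type": "css", "value": "h1::text"}}},
            },
        },
        "edges": [
            {"from": "input", "to": "fetch"},
            {"from": "fetch", "to": "title"},
        ],
    }

    process = CrawlerProcess(settings={"ADDONS": {"scrapy_poet.Addon": 300}})
    process.crawl(CrawlMapSpider, map=json.dumps(crawl_map))
    process.start()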