Commit 8718a5f

Merge pull request #28 from zytedata/fix-dupe-requests

support AnyResponse

kmike authored Feb 9, 2024
2 parents 7df8e6c + 6a6f09a commit 8718a5f
Showing 8 changed files with 49 additions and 50 deletions.
6 changes: 3 additions & 3 deletions docs/reference/index.rst
@@ -10,6 +10,9 @@ Base classes
 
 .. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
 
+.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
+   :noindex:
+
 .. autoenum:: zyte_spider_templates.spiders.base.Geolocation
    :noindex:
 
@@ -23,9 +26,6 @@ E-commerce
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
    :noindex:
 
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-   :noindex:
-
 .. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
    :noindex:
2 changes: 1 addition & 1 deletion docs/templates/e-commerce.rst
@@ -19,6 +19,6 @@ Parameters
 
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
+.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
 
 .. autoenum:: zyte_spider_templates.spiders.base.Geolocation
4 changes: 2 additions & 2 deletions setup.py
@@ -14,9 +14,9 @@
     install_requires=[
         "pydantic>=2",
         "scrapy>=2.11.0",
-        "scrapy-poet>=0.20.1",
+        "scrapy-poet>=0.21.0",
         "scrapy-spider-metadata>=0.1.2",
-        "scrapy-zyte-api[provider]>=0.15.0",
+        "scrapy-zyte-api[provider]>=0.16.0",
         "zyte-common-items>=0.13.0",
     ],
     classifiers=[
6 changes: 3 additions & 3 deletions tests/pages/test_product_navigation_heuristics.py
@@ -1,6 +1,6 @@
 import pytest
 from pytest_twisted import ensureDeferred
-from web_poet import HttpResponse, PageParams, RequestUrl
+from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl
 from zyte_common_items import ProbabilityRequest, ProductNavigation
 
 from zyte_spider_templates.pages.product_navigation_heuristics import (
@@ -38,7 +38,7 @@ async def test_unknown_product_page():
     </body>
 </html>
 """
-    response = HttpResponse("https://example.com", body)
+    response = AnyResponse(HttpResponse("https://example.com", body))
     navigation = ProductNavigation.from_dict(
         {
             "url": "https://example.com",
@@ -118,7 +118,7 @@ async def test_crawl_nofollow_links():
 </html>
 """
     url = "https://example.com"
-    response = HttpResponse(url, body)
+    response = AnyResponse(HttpResponse(url, body))
     request_url = RequestUrl(response.url)
     navigation = ProductNavigation(url=url)
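The test changes above show the new input type in action: web_poet's AnyResponse wraps whichever concrete response object is available and exposes a response-agnostic interface on top of it. A minimal sketch of the wrapping, assuming a web_poet version that ships AnyResponse and relying only on the url and text accessors used elsewhere in this commit:

    from web_poet import AnyResponse, HttpResponse

    # Wrap a raw HTTP response; AnyResponse can also wrap a
    # browser-rendered response, which is what makes it useful here.
    http = HttpResponse("https://example.com", b"<html><body>hi</body></html>")
    wrapped = AnyResponse(http)

    # Accessors that work regardless of the wrapped response type.
    print(wrapped.url)   # https://example.com
    print(wrapped.text)  # the decoded body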
6 changes: 3 additions & 3 deletions tox.ini
@@ -22,9 +22,9 @@ deps =
     {[testenv]deps}
     pydantic==2
     scrapy==2.11.0
-    scrapy-poet==0.20.1
+    scrapy-poet==0.21.0
     scrapy-spider-metadata==0.1.2
-    scrapy-zyte-api[provider]==0.15.0
+    scrapy-zyte-api[provider]==0.16.0
     zyte-common-items==0.13.0
 
 [testenv:mypy]
@@ -51,4 +51,4 @@ changedir = docs
 deps =
     -rdocs/requirements.txt
 commands =
-    sphinx-build -W -b html . {envtmpdir}/html
\ No newline at end of file
+    sphinx-build -W -b html . {envtmpdir}/html
13 changes: 5 additions & 8 deletions zyte_spider_templates/pages/product_navigation_heuristics.py
@@ -3,7 +3,7 @@
 import attrs
 from scrapy.http import TextResponse
 from scrapy.linkextractors import LinkExtractor
-from web_poet import HttpResponse, PageParams, field, handle_urls
+from web_poet import AnyResponse, PageParams, field, handle_urls
 from zyte_common_items import AutoProductNavigationPage, ProbabilityRequest
 
 from zyte_spider_templates.heuristics import might_be_category
@@ -12,12 +12,7 @@
 @handle_urls("")
 @attrs.define
 class HeuristicsProductNavigationPage(AutoProductNavigationPage):
-    # TODO: swap with BrowserResponse after evaluating it.
-    # Also after when the following issue has been fixed:
-    # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/91#issuecomment-1744305554
-    # NOTE: Even with BrowserResponse, it would still send separate
-    # requests for it and productNavigation.
-    response: HttpResponse
+    response: AnyResponse
    page_params: PageParams
 
     @field
@@ -55,7 +50,9 @@ def _probably_category_links(self) -> List[ProbabilityRequest]:
         ignore_urls = set(self._urls_for_category())
 
         links = []
-        response = TextResponse(url=str(self.response.url), body=self.response.body)
+        response = TextResponse(
+            url=str(self.response.url), body=self.response.text.encode()
+        )
         for link in link_extractor.extract_links(response):
             if link.url in ignore_urls:
                 continue
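The TextResponse change above follows from the new input type: an AnyResponse may wrap a browser-rendered response that only carries decoded HTML, so the page object re-encodes response.text instead of reading raw bytes from a body attribute. A rough standalone sketch of that link-extraction step, with url and html as hypothetical stand-ins for self.response.url and self.response.text:

    from scrapy.http import TextResponse
    from scrapy.linkextractors import LinkExtractor

    url = "https://example.com"
    html = '<html><body><a href="/category/shoes">Shoes</a></body></html>'

    # Scrapy's LinkExtractor operates on Scrapy responses, so rebuild
    # one from the decoded text of the web-poet response.
    response = TextResponse(url=url, body=html.encode())
    for link in LinkExtractor().extract_links(response):
        print(link.url)  # https://example.com/category/shoes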
32 changes: 32 additions & 0 deletions zyte_spider_templates/spiders/base.py
@@ -1,3 +1,4 @@
+from enum import Enum
 from importlib.metadata import version
 from typing import Any, Dict, Optional
 
@@ -9,11 +10,22 @@
     GEOLOCATION_OPTIONS_WITH_CODE,
     Geolocation,
 )
+from zyte_spider_templates.documentation import document_enum
 
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
 
+@document_enum
+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    """Use HTTP responses. Cost-efficient and fast extraction method, which
+    works well on many websites."""
+
+    browserHtml: str = "browserHtml"
+    """Use browser rendering. Often provides the best quality."""
+
+
 class BaseSpiderParams(BaseModel):
     url: str = Field(
         title="URL",
@@ -48,6 +60,26 @@ class BaseSpiderParams(BaseModel):
             "widget": "request-limit",
         },
     )
+    extract_from: Optional[ExtractFrom] = Field(
+        title="Extraction source",
+        description=(
+            "Whether to perform extraction using a browser request "
+            "(browserHtml) or an HTTP request (httpResponseBody)."
+        ),
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                ExtractFrom.browserHtml: {
+                    "title": "browserHtml",
+                    "description": "Use browser rendering. Often provides the best quality.",
+                },
+                ExtractFrom.httpResponseBody: {
+                    "title": "httpResponseBody",
+                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
+                },
+            },
+        },
+    )
 
 
 class BaseSpider(scrapy.Spider):
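Since extract_from now lives on BaseSpiderParams, every spider template inheriting from it accepts the parameter. A hedged sketch of how the Pydantic model resolves it, assuming the model's remaining fields all have defaults and relying on pydantic 2's standard coercion of strings to str-based enum members:

    from zyte_spider_templates.spiders.base import BaseSpiderParams, ExtractFrom

    params = BaseSpiderParams(url="https://example.com", extract_from="browserHtml")
    assert params.extract_from is ExtractFrom.browserHtml

    # extract_from is optional and defaults to None (no explicit preference).
    default = BaseSpiderParams(url="https://example.com")
    assert default.extract_from is None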
30 changes: 0 additions & 30 deletions zyte_spider_templates/spiders/ecommerce.py
@@ -33,16 +33,6 @@ class EcommerceCrawlStrategy(str, Enum):
     ML-extraction."""
 
 
-@document_enum
-class ExtractFrom(str, Enum):
-    httpResponseBody: str = "httpResponseBody"
-    """Use HTTP responses. Cost-efficient and fast extraction method, which
-    works well on many websites."""
-
-    browserHtml: str = "browserHtml"
-    """Use browser rendering. Often provides the best quality."""
-
-
 class EcommerceSpiderParams(BaseSpiderParams):
     crawl_strategy: EcommerceCrawlStrategy = Field(
         title="Crawl strategy",
@@ -68,26 +58,6 @@ class EcommerceSpiderParams(BaseSpiderParams):
             },
         },
     )
-    extract_from: Optional[ExtractFrom] = Field(
-        title="Extraction source",
-        description=(
-            "Whether to perform extraction using a browser request "
-            "(browserHtml) or an HTTP request (httpResponseBody)."
-        ),
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                ExtractFrom.browserHtml: {
-                    "title": "browserHtml",
-                    "description": "Use browser rendering. Often provides the best quality.",
-                },
-                ExtractFrom.httpResponseBody: {
-                    "title": "httpResponseBody",
-                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
-                },
-            },
-        },
-    )
 
 
 class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
