Commit 8718a5f

Merge pull request #28 from zytedata/fix-dupe-requests

support AnyResponse

kmike authored Feb 9, 2024
2 parents 7df8e6c + 6a6f09a commit 8718a5f
Showing 8 changed files with 49 additions and 50 deletions.
6 changes: 3 additions & 3 deletions docs/reference/index.rst
@@ -10,6 +10,9 @@ Base classes
 
 .. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
 
+.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
+   :noindex:
+
 .. autoenum:: zyte_spider_templates.spiders.base.Geolocation
    :noindex:
 
@@ -23,9 +26,6 @@ E-commerce
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
    :noindex:
 
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-   :noindex:
-
 .. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
    :noindex:
2 changes: 1 addition & 1 deletion docs/templates/e-commerce.rst
@@ -19,6 +19,6 @@ Parameters
 
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
+.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
 
 .. autoenum:: zyte_spider_templates.spiders.base.Geolocation
4 changes: 2 additions & 2 deletions setup.py
@@ -14,9 +14,9 @@
     install_requires=[
         "pydantic>=2",
         "scrapy>=2.11.0",
-        "scrapy-poet>=0.20.1",
+        "scrapy-poet>=0.21.0",
         "scrapy-spider-metadata>=0.1.2",
-        "scrapy-zyte-api[provider]>=0.15.0",
+        "scrapy-zyte-api[provider]>=0.16.0",
         "zyte-common-items>=0.13.0",
     ],
     classifiers=[
6 changes: 3 additions & 3 deletions tests/pages/test_product_navigation_heuristics.py
@@ -1,6 +1,6 @@
 import pytest
 from pytest_twisted import ensureDeferred
-from web_poet import HttpResponse, PageParams, RequestUrl
+from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl
 from zyte_common_items import ProbabilityRequest, ProductNavigation
 
 from zyte_spider_templates.pages.product_navigation_heuristics import (
@@ -38,7 +38,7 @@ async def test_unknown_product_page():
     </body>
 </html>
 """
-    response = HttpResponse("https://example.com", body)
+    response = AnyResponse(HttpResponse("https://example.com", body))
     navigation = ProductNavigation.from_dict(
         {
             "url": "https://example.com",
@@ -118,7 +118,7 @@ async def test_crawl_nofollow_links():
 </html>
 """
     url = "https://example.com"
-    response = HttpResponse(url, body)
+    response = AnyResponse(HttpResponse(url, body))
     request_url = RequestUrl(response.url)
     navigation = ProductNavigation(url=url)
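The test changes above show the new input type in action: web_poet's AnyResponse wraps whichever concrete response object is available and exposes a response-agnostic interface on top of it. A minimal sketch of the wrapping, assuming a web_poet version that ships AnyResponse and relying only on the url and text accessors used elsewhere in this commit:

    from web_poet import AnyResponse, HttpResponse

    # Wrap a raw HTTP response; AnyResponse can also wrap a
    # browser-rendered response, which is what makes it useful here.
    http = HttpResponse("https://example.com", b"<html><body>hi</body></html>")
    wrapped = AnyResponse(http)

    # Accessors that work regardless of the wrapped response type.
    print(wrapped.url)   # https://example.com
    print(wrapped.text)  # the decoded body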
6 changes: 3 additions & 3 deletions tox.ini
@@ -22,9 +22,9 @@ deps =
     {[testenv]deps}
     pydantic==2
     scrapy==2.11.0
-    scrapy-poet==0.20.1
+    scrapy-poet==0.21.0
     scrapy-spider-metadata==0.1.2
-    scrapy-zyte-api[provider]==0.15.0
+    scrapy-zyte-api[provider]==0.16.0
     zyte-common-items==0.13.0
 
 [testenv:mypy]
@@ -51,4 +51,4 @@ changedir = docs
 deps =
     -rdocs/requirements.txt
 commands =
-    sphinx-build -W -b html . {envtmpdir}/html
\ No newline at end of file
+    sphinx-build -W -b html . {envtmpdir}/html
13 changes: 5 additions & 8 deletions zyte_spider_templates/pages/product_navigation_heuristics.py
@@ -3,7 +3,7 @@
 import attrs
 from scrapy.http import TextResponse
 from scrapy.linkextractors import LinkExtractor
-from web_poet import HttpResponse, PageParams, field, handle_urls
+from web_poet import AnyResponse, PageParams, field, handle_urls
 from zyte_common_items import AutoProductNavigationPage, ProbabilityRequest
 
 from zyte_spider_templates.heuristics import might_be_category
@@ -12,12 +12,7 @@
 @handle_urls("")
 @attrs.define
 class HeuristicsProductNavigationPage(AutoProductNavigationPage):
-    # TODO: swap with BrowserResponse after evaluating it.
-    # Also after when the following issue has been fixed:
-    # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/91#issuecomment-1744305554
-    # NOTE: Even with BrowserResponse, it would still send separate
-    # requests for it and productNavigation.
-    response: HttpResponse
+    response: AnyResponse
    page_params: PageParams
 
     @field
@@ -55,7 +50,9 @@ def _probably_category_links(self) -> List[ProbabilityRequest]:
         ignore_urls = set(self._urls_for_category())
 
         links = []
-        response = TextResponse(url=str(self.response.url), body=self.response.body)
+        response = TextResponse(
+            url=str(self.response.url), body=self.response.text.encode()
+        )
         for link in link_extractor.extract_links(response):
             if link.url in ignore_urls:
                 continue
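The TextResponse change above follows from the new input type: an AnyResponse may wrap a browser-rendered response that only carries decoded HTML, so the page object re-encodes response.text instead of reading raw bytes from a body attribute. A rough standalone sketch of that link-extraction step, with url and html as hypothetical stand-ins for self.response.url and self.response.text:

    from scrapy.http import TextResponse
    from scrapy.linkextractors import LinkExtractor

    url = "https://example.com"
    html = '<html><body><a href="/category/shoes">Shoes</a></body></html>'

    # Scrapy's LinkExtractor operates on Scrapy responses, so rebuild
    # one from the decoded text of the web-poet response.
    response = TextResponse(url=url, body=html.encode())
    for link in LinkExtractor().extract_links(response):
        print(link.url)  # https://example.com/category/shoes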
32 changes: 32 additions & 0 deletions zyte_spider_templates/spiders/base.py
@@ -1,3 +1,4 @@
+from enum import Enum
 from importlib.metadata import version
 from typing import Any, Dict, Optional
 
@@ -9,11 +10,22 @@
     GEOLOCATION_OPTIONS_WITH_CODE,
     Geolocation,
 )
+from zyte_spider_templates.documentation import document_enum
 
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
 
+@document_enum
+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    """Use HTTP responses. Cost-efficient and fast extraction method, which
+    works well on many websites."""
+
+    browserHtml: str = "browserHtml"
+    """Use browser rendering. Often provides the best quality."""
+
+
 class BaseSpiderParams(BaseModel):
     url: str = Field(
         title="URL",
@@ -48,6 +60,26 @@ class BaseSpiderParams(BaseModel):
             "widget": "request-limit",
         },
     )
+    extract_from: Optional[ExtractFrom] = Field(
+        title="Extraction source",
+        description=(
+            "Whether to perform extraction using a browser request "
+            "(browserHtml) or an HTTP request (httpResponseBody)."
+        ),
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                ExtractFrom.browserHtml: {
+                    "title": "browserHtml",
+                    "description": "Use browser rendering. Often provides the best quality.",
+                },
+                ExtractFrom.httpResponseBody: {
+                    "title": "httpResponseBody",
+                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
+                },
+            },
+        },
+    )
 
 
 class BaseSpider(scrapy.Spider):
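Since extract_from now lives on BaseSpiderParams, every spider template inheriting from it accepts the parameter. A hedged sketch of how the Pydantic model resolves it, assuming the model's remaining fields all have defaults and relying on pydantic 2's standard coercion of strings to str-based enum members:

    from zyte_spider_templates.spiders.base import BaseSpiderParams, ExtractFrom

    params = BaseSpiderParams(url="https://example.com", extract_from="browserHtml")
    assert params.extract_from is ExtractFrom.browserHtml

    # extract_from is optional and defaults to None (no explicit preference).
    default = BaseSpiderParams(url="https://example.com")
    assert default.extract_from is None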
30 changes: 0 additions & 30 deletions zyte_spider_templates/spiders/ecommerce.py
@@ -33,16 +33,6 @@ class EcommerceCrawlStrategy(str, Enum):
     ML-extraction."""
 
 
-@document_enum
-class ExtractFrom(str, Enum):
-    httpResponseBody: str = "httpResponseBody"
-    """Use HTTP responses. Cost-efficient and fast extraction method, which
-    works well on many websites."""
-
-    browserHtml: str = "browserHtml"
-    """Use browser rendering. Often provides the best quality."""
-
-
 class EcommerceSpiderParams(BaseSpiderParams):
     crawl_strategy: EcommerceCrawlStrategy = Field(
         title="Crawl strategy",
@@ -68,26 +58,6 @@ class EcommerceSpiderParams(BaseSpiderParams):
             },
         },
     )
-    extract_from: Optional[ExtractFrom] = Field(
-        title="Extraction source",
-        description=(
-            "Whether to perform extraction using a browser request "
-            "(browserHtml) or an HTTP request (httpResponseBody)."
-        ),
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                ExtractFrom.browserHtml: {
-                    "title": "browserHtml",
-                    "description": "Use browser rendering. Often provides the best quality.",
-                },
-                ExtractFrom.httpResponseBody: {
-                    "title": "httpResponseBody",
-                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
-                },
-            },
-        },
-    )
 
 
 class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
