Merge branch 'main' into add/failed-deployment-tips

WordPress · Jun 25, 2023 · 06260d5 · 06260d5
2 parents 1f4b454 + 9dcb4e3
commit 06260d5
Show file tree

Hide file tree

Showing 39 changed files with 1,011 additions and 364 deletions.
diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml
@@ -565,13 +565,13 @@ jobs:
         name:
           - playwright_vr
           - playwright_e2e
-          - storybook_vr
+          - storybook
         include:
           - name: playwright_vr
             script: "test:playwright visual-regression"
           - name: playwright_e2e
             script: "test:playwright e2e"
-          - name: storybook_vr
+          - name: storybook
             script: "test:storybook"
 
     steps:
@@ -609,7 +609,7 @@ jobs:
         name:
           - playwright_vr
           - playwright_e2e
-          - storybook_vr
+          - storybook
 
     steps:
       - name: Pass
@@ -949,6 +949,22 @@ jobs:
           wait_time: 60 # check every minute
           max_time: 1800 # allow up to 30 minutes for a deployment
 
+      - name: Deploy staging thumbnails
+        uses: felixp8/[email protected]
+        with:
+          owner: WordPress
+          repo: openverse-infrastructure
+          token: ${{ secrets.ACCESS_TOKEN }}
+          event_type: deploy_staging_api_thumbnails
+          client_payload: |
+            {
+              "actor": "${{ github.actor }}",
+              "tag": "${{ needs.get-image-tag.outputs.image_tag }}",
+              "run_name": "${{ steps.commit.outputs.commit_message }}"
+            }
+          wait_time: 60 # check every minute
+          max_time: 1800 # allow up to 30 minutes for a deployment
+
   ################
   # Notification #
   ################

diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
@@ -205,10 +205,26 @@ def _post_process_results(
             end = 90 + 45
             ```
             """
+            if end >= search_results.hits.total.value:
+                # Total available hits already exhausted in previous iteration
+                return results
+
             end += int(end / 2)
-            if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
+            query_size = start + end
+            if query_size > ELASTICSEARCH_MAX_RESULT_WINDOW:
                 return results
 
+            # subtract start to account for the records skipped
+            # and which should not count towards the total
+            # available hits for the query
+            total_available_hits = search_results.hits.total.value - start
+            if query_size > total_available_hits:
+                # Clamp the query size to last available hit. On the next
+                # iteration, if results are still insufficient, the earlier
+                # guard comparing `end` to `search_results.hits.total.value`
+                # will prevent further query attempts
+                end = search_results.hits.total.value
+
             s = s[start:end]
             search_response = s.execute()
 

diff --git a/api/api/utils/dead_link_mask.py b/api/api/utils/dead_link_mask.py
@@ -1,5 +1,5 @@
+import django_redis
 from deepdiff import DeepHash
-from django_redis import get_redis_connection
 from elasticsearch_dsl import Search
 
 
@@ -32,7 +32,7 @@ def get_query_mask(query_hash: str) -> list[int]:
     :param query_hash: Unique value for a particular query.
     :return: Boolean mask as a list of integers (0 or 1).
     """
-    redis = get_redis_connection("default")
+    redis = django_redis.get_redis_connection("default")
     key = f"{query_hash}:dead_link_mask"
     return list(map(int, redis.lrange(key, 0, -1)))
 
@@ -44,7 +44,7 @@ def save_query_mask(query_hash: str, mask: list):
     :param mask: Boolean mask as a list of integers (0 or 1).
     :param query_hash: Unique value to be used as key.
     """
-    redis_pipe = get_redis_connection("default").pipeline()
+    redis_pipe = django_redis.get_redis_connection("default").pipeline()
     key = f"{query_hash}:dead_link_mask"
 
     redis_pipe.delete(key)

diff --git a/api/api/utils/image_proxy/__init__.py b/api/api/utils/image_proxy/__init__.py
@@ -0,0 +1,128 @@
+import logging
+from typing import Literal
+from urllib.parse import urlparse
+
+from django.conf import settings
+from django.http import HttpResponse
+from rest_framework.exceptions import UnsupportedMediaType
+
+import django_redis
+import requests
+import sentry_sdk
+
+from api.utils.image_proxy.exception import UpstreamThumbnailException
+from api.utils.image_proxy.extension import get_image_extension
+from api.utils.image_proxy.photon import get_photon_request_params
+from api.utils.tallies import get_monthly_timestamp
+
+
+# Module-level logger; ``get`` derives a child logger from it.
+parent_logger = logging.getLogger(__name__)
+
+# Base headers merged into the upstream request headers built by ``get``.
+HEADERS = {
+    "User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format(
+        purpose="ThumbnailGeneration"
+    )
+}
+
+# Extensions that ``get_request_params_for_extension`` routes through Photon.
+PHOTON_TYPES = {"gif", "jpg", "jpeg", "png", "webp"}
+# Extensions that are requested from the original image URL unchanged.
+ORIGINAL_TYPES = {"svg"}
+
+# NOTE(review): ``PHOTON`` is "photon" while the ``Literal`` below spells
+# "photon_proxy". None of these three names is used elsewhere in this module;
+# confirm which spelling external users expect before relying on either.
+PHOTON = "photon"
+ORIGINAL = "original"
+THUMBNAIL_STRATEGY = Literal["photon_proxy", "original"]
+
+
+def get_request_params_for_extension(
+    ext: str,
+    headers: dict[str, str],
+    image_url: str,
+    parsed_image_url: urlparse,
+    is_full_size: bool,
+    is_compressed: bool,
+) -> tuple[str, dict[str, str], dict[str, str]]:
+    """
+    Build the ``(url, params, headers)`` triple for the upstream thumbnail request.
+
+    Extensions in ``PHOTON_TYPES`` are delegated to ``get_photon_request_params``.
+    Extensions in ``ORIGINAL_TYPES`` (SVG) are requested from ``image_url`` as is,
+    with no query params and the given headers. Anything else is rejected.
+
+    :raises UnsupportedMediaType: if ``ext`` is in neither set.
+    """
+    if ext in PHOTON_TYPES:
+        photon_request = get_photon_request_params(
+            parsed_image_url, is_full_size, is_compressed, headers
+        )
+        return photon_request
+
+    if ext in ORIGINAL_TYPES:
+        no_params = {}
+        return image_url, no_params, headers
+
+    message = f"Image extension {ext} is not supported by the thumbnail proxy."
+    raise UnsupportedMediaType(message)
+
+
+def get(
+    image_url: str,
+    media_identifier: str,
+    accept_header: str = "image/*",
+    is_full_size: bool = False,
+    is_compressed: bool = True,
+) -> HttpResponse:
+    """
+    Proxy an image through Photon if its file type is supported, else return the
+    original image if the file type is SVG. Otherwise, raise an exception.
+
+    Every upstream response status is tallied in the ``tallies`` Redis
+    connection, both globally and per domain, under keys that include the
+    current monthly timestamp. Failures are tallied per exception type and
+    domain, and are reported to Sentry for the first
+    ``THUMBNAIL_ERROR_INITIAL_ALERT_THRESHOLD`` occurrences of a key and then
+    once every ``THUMBNAIL_ERROR_REPEATED_ALERT_FREQUENCY`` occurrences.
+
+    :param image_url: URL of the image to proxy.
+    :param media_identifier: identifier forwarded to ``get_image_extension``.
+    :param accept_header: value sent upstream as the ``Accept`` header.
+    :param is_full_size: forwarded to ``get_request_params_for_extension``.
+    :param is_compressed: forwarded to ``get_request_params_for_extension``.
+    :return: response carrying the upstream body, status code and content type.
+    :raises UnsupportedMediaType: if the image extension is not supported.
+    :raises UpstreamThumbnailException: if anything in the upstream request
+        block fails, including a non-success upstream status.
+    """
+    logger = parent_logger.getChild("get")
+    tallies = django_redis.get_redis_connection("tallies")
+    month = get_monthly_timestamp()
+
+    image_extension = get_image_extension(image_url, media_identifier)
+
+    # ``HEADERS`` is the right-hand operand, so its entries win on key clashes.
+    headers = {"Accept": accept_header} | HEADERS
+
+    parsed_image_url = urlparse(image_url)
+    domain = parsed_image_url.netloc
+
+    upstream_url, params, headers = get_request_params_for_extension(
+        image_extension,
+        headers,
+        image_url,
+        parsed_image_url,
+        is_full_size,
+        is_compressed,
+    )
+
+    try:
+        upstream_response = requests.get(
+            upstream_url,
+            timeout=15,
+            params=params,
+            headers=headers,
+        )
+        # Tally the status before ``raise_for_status`` so error statuses are
+        # counted as well as successful ones.
+        tallies.incr(f"thumbnail_response_code:{month}:{upstream_response.status_code}")
+        tallies.incr(
+            f"thumbnail_response_code_by_domain:{domain}:"
+            f"{month}:{upstream_response.status_code}"
+        )
+        upstream_response.raise_for_status()
+    except Exception as exc:
+        exception_name = f"{exc.__class__.__module__}.{exc.__class__.__name__}"
+        key = f"thumbnail_error:{exception_name}:{domain}:{month}"
+        count = tallies.incr(key)
+        # Throttle Sentry reports: always report the first few occurrences,
+        # afterwards only every Nth one.
+        if count <= settings.THUMBNAIL_ERROR_INITIAL_ALERT_THRESHOLD or (
+            count % settings.THUMBNAIL_ERROR_REPEATED_ALERT_FREQUENCY == 0
+        ):
+            sentry_sdk.capture_exception(exc)
+        if isinstance(exc, requests.exceptions.HTTPError):
+            # NOTE(review): the response body is part of the key, so the number
+            # of distinct keys depends on what upstream returns.
+            tallies.incr(
+                f"thumbnail_http_error:{domain}:{month}:{exc.response.status_code}:{exc.response.text}"
+            )
+        # Chain explicitly so the original failure is recorded as the cause.
+        raise UpstreamThumbnailException(f"Failed to render thumbnail. {exc}") from exc
+
+    res_status = upstream_response.status_code
+    content_type = upstream_response.headers.get("Content-Type")
+    logger.debug(
+        f"Image proxy response status: {res_status}, content-type: {content_type}"
+    )
+
+    return HttpResponse(
+        upstream_response.content,
+        status=res_status,
+        content_type=content_type,
+    )
diff --git a/api/api/utils/image_proxy/exception.py b/api/api/utils/image_proxy/exception.py
@@ -0,0 +1,8 @@
+from rest_framework import status
+from rest_framework.exceptions import APIException
+
+
+class UpstreamThumbnailException(APIException):
+    """
+    Signal that a thumbnail could not be rendered because of an upstream error.
+
+    NOTE(review): ``default_code`` names Photon, but nothing in this class
+    restricts it to Photon failures; confirm whether API clients depend on
+    the code before renaming it.
+    """
+
+    # HTTP 424 Failed Dependency.
+    status_code = status.HTTP_424_FAILED_DEPENDENCY
+    default_detail = "Could not render thumbnail due to upstream provider error."
+    default_code = "upstream_photon_failure"
diff --git a/api/api/utils/image_proxy/extension.py b/api/api/utils/image_proxy/extension.py
@@ -0,0 +1,58 @@
+from os.path import splitext
+from urllib.parse import urlparse
+
+import django_redis
+import requests
+import sentry_sdk
+
+from api.utils.image_proxy.exception import UpstreamThumbnailException
+
+
+def get_image_extension(image_url: str, media_identifier: str) -> str | None:
+    """
+    Determine the file extension of an image, trying progressively costlier sources.
+
+    The sources are tried in this order:
+
+    1. the extension in the path of ``image_url``;
+    2. the value cached in the ``default`` Redis connection under
+       ``media:{media_identifier}:thumb_type``;
+    3. the ``Content-Type`` header of a ``HEAD`` request to ``image_url``.
+
+    The outcome of step 3 is written to the cache. When no extension could be
+    derived, the string ``"unknown"`` is cached instead, so the call that made
+    the ``HEAD`` request returns ``None`` while later calls return the cached
+    ``"unknown"``.
+
+    :param image_url: URL of the image.
+    :param media_identifier: identifier used to build the cache key.
+    :return: the extension, or ``None`` if it could not be determined.
+    :raises UpstreamThumbnailException: if the ``HEAD`` request fails.
+    """
+    cache = django_redis.get_redis_connection("default")
+    key = f"media:{media_identifier}:thumb_type"
+
+    ext = _get_file_extension_from_url(image_url)
+
+    if not ext:
+        # If the extension is not present in the URL, try to get it from the redis cache
+        ext = cache.get(key)
+        # The cached value is decoded because it is read back as bytes.
+        ext = ext.decode("utf-8") if ext else None
+
+    if not ext:
+        # If the extension is still not present, try getting it from the content type
+        try:
+            response = requests.head(image_url, timeout=10)
+            response.raise_for_status()
+        except Exception as exc:
+            sentry_sdk.capture_exception(exc)
+            # Chain explicitly so the original failure is recorded as the cause.
+            raise UpstreamThumbnailException(
+                "Failed to render thumbnail due to inability to check media "
+                f"type. {exc}"
+            ) from exc
+        else:
+            if response.headers and "Content-Type" in response.headers:
+                content_type = response.headers["Content-Type"]
+                ext = _get_file_extension_from_content_type(content_type)
+            else:
+                ext = None
+
+            # Cache a sentinel for misses too, so the HEAD request is not
+            # repeated for the same media item.
+            cache.set(key, ext if ext else "unknown")
+    return ext
+
+
+def _get_file_extension_from_url(image_url: str) -> str:
+    """
+    Extract the lowercased file extension from the path of ``image_url``.
+
+    Only the path component is inspected, so the query string and fragment
+    are ignored. An empty string is returned when the path has no extension.
+    """
+    url_path = urlparse(image_url).path
+    suffix = splitext(url_path)[1]
+    # ``splitext`` keeps the leading dot on the suffix; drop it.
+    return suffix.lower().removeprefix(".")
+
+
+def _get_file_extension_from_content_type(content_type: str) -> str | None:
+    """
+    Return the image extension derived from a ``Content-Type`` header value.
+
+    Only the media subtype is used: parameters such as ``; charset=utf-8`` are
+    dropped and the result is lowercased, so ``"image/JPEG; q=1"`` yields
+    ``"jpeg"``.
+
+    :param content_type: raw header value; may be empty or ``None``.
+    :return: the subtype, or ``None`` when no subtype can be extracted.
+    """
+    if not content_type:
+        return None
+    # Parameters follow the first ";" and are not part of the subtype.
+    media_type = content_type.split(";", 1)[0]
+    _, _, subtype = media_type.partition("/")
+    return subtype.strip().lower() or None
diff --git a/api/api/utils/image_proxy/photon.py b/api/api/utils/image_proxy/photon.py
@@ -0,0 +1,43 @@
+from django.conf import settings
+
+
+def get_photon_request_params(
+    parsed_image_url,
+    is_full_size: bool,
+    is_compressed: bool,
+    headers: dict,
+):
+    """
+    Build the ``(url, params, headers)`` triple for a Photon thumbnail request.
+
+    Photon options documented here:
+    https://developer.wordpress.com/docs/photon/api/
+
+    :param parsed_image_url: result of ``urlparse`` on the image URL; its
+        ``scheme``, ``netloc``, ``path`` and ``query`` attributes are read.
+    :param is_full_size: when false, the ``w`` param is set to
+        ``settings.THUMBNAIL_WIDTH_PX``.
+    :param is_compressed: when true, the ``quality`` param is set to
+        ``settings.THUMBNAIL_QUALITY``.
+    :param headers: base request headers. This dict is not modified; a copy
+        is returned when the Photon authentication header has to be added.
+    :return: the Photon URL, the query params and the headers to send.
+    """
+    params = {}
+
+    if not is_full_size:
+        params["w"] = settings.THUMBNAIL_WIDTH_PX
+
+    if is_compressed:
+        params["quality"] = settings.THUMBNAIL_QUALITY
+
+    if parsed_image_url.query:
+        # No need to URL encode this string because requests will already
+        # pass the `params` object to `urlencode` before it appends it to the
+        # request URL.
+        params["q"] = parsed_image_url.query
+
+    if parsed_image_url.scheme == "https":
+        # Photon defaults to HTTP without this parameter
+        # which will cause some providers to fail (if they
+        # do not serve over HTTP and do not have a redirect)
+        params["ssl"] = "true"
+
+    # Photon excludes the protocol, so we need to reconstruct the url + port + path
+    # to send as the "path" of the Photon request
+    domain = parsed_image_url.netloc
+    path = parsed_image_url.path
+    upstream_url = f"{settings.PHOTON_ENDPOINT}{domain}{path}"
+
+    if settings.PHOTON_AUTH_KEY:
+        # Build a new dict instead of mutating the caller's, so the auth key
+        # cannot end up in a dict the caller reuses for non-Photon requests.
+        headers = {**headers, "X-Photon-Authentication": settings.PHOTON_AUTH_KEY}
+
+    return upstream_url, params, headers