diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml
index 3691c94330d..b72ea5901b3 100644
--- a/.github/workflows/ci_cd.yml
+++ b/.github/workflows/ci_cd.yml
@@ -565,13 +565,13 @@ jobs:
         name:
           - playwright_vr
           - playwright_e2e
-          - storybook_vr
+          - storybook
         include:
           - name: playwright_vr
             script: "test:playwright visual-regression"
           - name: playwright_e2e
             script: "test:playwright e2e"
-          - name: storybook_vr
+          - name: storybook
             script: "test:storybook"

     steps:
@@ -609,7 +609,7 @@ jobs:
         name:
           - playwright_vr
           - playwright_e2e
-          - storybook_vr
+          - storybook

     steps:
       - name: Pass
@@ -949,6 +949,22 @@ jobs:
           wait_time: 60 # check every minute
           max_time: 1800 # allow up to 30 minutes for a deployment

+      - name: Deploy staging thumbnails
+        uses: felixp8/dispatch-and-wait@v0.1.0
+        with:
+          owner: WordPress
+          repo: openverse-infrastructure
+          token: ${{ secrets.ACCESS_TOKEN }}
+          event_type: deploy_staging_api_thumbnails
+          client_payload: |
+            {
+              "actor": "${{ github.actor }}",
+              "tag": "${{ needs.get-image-tag.outputs.image_tag }}",
+              "run_name": "${{ steps.commit.outputs.commit_message }}"
+            }
+          wait_time: 60 # check every minute
+          max_time: 1800 # allow up to 30 minutes for a deployment
+
   ################
   # Notification #
   ################
diff --git a/api/api/controllers/search_controller.py b/api/api/controllers/search_controller.py
index def66bb344d..a723e52f9d6 100644
--- a/api/api/controllers/search_controller.py
+++ b/api/api/controllers/search_controller.py
@@ -205,10 +205,26 @@ def _post_process_results(
         end = 90 + 45
         ```
     """
+    if end >= search_results.hits.total.value:
+        # Total available hits already exhausted in previous iteration
+        return results
+
     end += int(end / 2)
-    if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
+    query_size = start + end
+    if query_size > ELASTICSEARCH_MAX_RESULT_WINDOW:
         return results

+    # Subtract start to account for the records skipped, which should
+    # not count towards the total available hits for the query
+    total_available_hits = search_results.hits.total.value - start
+    if query_size > total_available_hits:
+        # Clamp the query size to the last available hit. On the next
+        # iteration, if results are still insufficient, the check above
+        # comparing `end` against the total available hits will prevent
+        # further query attempts
+        end = search_results.hits.total.value
+
     s = s[start:end]
     search_response = s.execute()

diff --git a/api/api/utils/dead_link_mask.py b/api/api/utils/dead_link_mask.py
index 4d0cc281da2..fdd1034b0fc 100644
--- a/api/api/utils/dead_link_mask.py
+++ b/api/api/utils/dead_link_mask.py
@@ -1,5 +1,5 @@
+import django_redis
 from deepdiff import DeepHash
-from django_redis import get_redis_connection
 from elasticsearch_dsl import Search

@@ -32,7 +32,7 @@ def get_query_mask(query_hash: str) -> list[int]:
     :param query_hash: Unique value for a particular query.
     :return: Boolean mask as a list of integers (0 or 1).
     """
-    redis = get_redis_connection("default")
+    redis = django_redis.get_redis_connection("default")
     key = f"{query_hash}:dead_link_mask"
     return list(map(int, redis.lrange(key, 0, -1)))
@@ -44,7 +44,7 @@ def save_query_mask(query_hash: str, mask: list):
     :param mask: Boolean mask as a list of integers (0 or 1).
     :param query_hash: Unique value to be used as key.
""" - redis_pipe = get_redis_connection("default").pipeline() + redis_pipe = django_redis.get_redis_connection("default").pipeline() key = f"{query_hash}:dead_link_mask" redis_pipe.delete(key) diff --git a/api/api/utils/image_proxy/__init__.py b/api/api/utils/image_proxy/__init__.py new file mode 100644 index 00000000000..2b83eda904d --- /dev/null +++ b/api/api/utils/image_proxy/__init__.py @@ -0,0 +1,128 @@ +import logging +from typing import Literal +from urllib.parse import urlparse + +from django.conf import settings +from django.http import HttpResponse +from rest_framework.exceptions import UnsupportedMediaType + +import django_redis +import requests +import sentry_sdk + +from api.utils.image_proxy.exception import UpstreamThumbnailException +from api.utils.image_proxy.extension import get_image_extension +from api.utils.image_proxy.photon import get_photon_request_params +from api.utils.tallies import get_monthly_timestamp + + +parent_logger = logging.getLogger(__name__) + +HEADERS = { + "User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format( + purpose="ThumbnailGeneration" + ) +} + +PHOTON_TYPES = {"gif", "jpg", "jpeg", "png", "webp"} +ORIGINAL_TYPES = {"svg"} + +PHOTON = "photon" +ORIGINAL = "original" +THUMBNAIL_STRATEGY = Literal["photon_proxy", "original"] + + +def get_request_params_for_extension( + ext: str, + headers: dict[str, str], + image_url: str, + parsed_image_url: urlparse, + is_full_size: bool, + is_compressed: bool, +) -> tuple[str, dict[str, str], dict[str, str]]: + """ + Get the request params (url, params, headers) for the thumbnail proxy. + If the image type is supported by photon, we use photon, and compute the necessary + request params, if the file can be cached and returned as is (SVG), we do that, + otherwise we raise UnsupportedMediaType exception. + """ + if ext in PHOTON_TYPES: + return get_photon_request_params( + parsed_image_url, is_full_size, is_compressed, headers + ) + elif ext in ORIGINAL_TYPES: + return image_url, {}, headers + raise UnsupportedMediaType( + f"Image extension {ext} is not supported by the thumbnail proxy." + ) + + +def get( + image_url: str, + media_identifier: str, + accept_header: str = "image/*", + is_full_size: bool = False, + is_compressed: bool = True, +) -> HttpResponse: + """ + Proxy an image through Photon if its file type is supported, else return the + original image if the file type is SVG. Otherwise, raise an exception. 
+    """
+    logger = parent_logger.getChild("get")
+    tallies = django_redis.get_redis_connection("tallies")
+    month = get_monthly_timestamp()
+
+    image_extension = get_image_extension(image_url, media_identifier)
+
+    headers = {"Accept": accept_header} | HEADERS
+
+    parsed_image_url = urlparse(image_url)
+    domain = parsed_image_url.netloc
+
+    upstream_url, params, headers = get_request_params_for_extension(
+        image_extension,
+        headers,
+        image_url,
+        parsed_image_url,
+        is_full_size,
+        is_compressed,
+    )
+
+    try:
+        upstream_response = requests.get(
+            upstream_url,
+            timeout=15,
+            params=params,
+            headers=headers,
+        )
+        tallies.incr(f"thumbnail_response_code:{month}:{upstream_response.status_code}")
+        tallies.incr(
+            f"thumbnail_response_code_by_domain:{domain}:"
+            f"{month}:{upstream_response.status_code}"
+        )
+        upstream_response.raise_for_status()
+    except Exception as exc:
+        exception_name = f"{exc.__class__.__module__}.{exc.__class__.__name__}"
+        key = f"thumbnail_error:{exception_name}:{domain}:{month}"
+        count = tallies.incr(key)
+        if count <= settings.THUMBNAIL_ERROR_INITIAL_ALERT_THRESHOLD or (
+            count % settings.THUMBNAIL_ERROR_REPEATED_ALERT_FREQUENCY == 0
+        ):
+            sentry_sdk.capture_exception(exc)
+        if isinstance(exc, requests.exceptions.HTTPError):
+            tallies.incr(
+                f"thumbnail_http_error:{domain}:{month}:{exc.response.status_code}:{exc.response.text}"
+            )
+        raise UpstreamThumbnailException(f"Failed to render thumbnail. {exc}")
+
+    res_status = upstream_response.status_code
+    content_type = upstream_response.headers.get("Content-Type")
+    logger.debug(
+        f"Image proxy response status: {res_status}, content-type: {content_type}"
+    )
+
+    return HttpResponse(
+        upstream_response.content,
+        status=res_status,
+        content_type=content_type,
+    )
diff --git a/api/api/utils/image_proxy/exception.py b/api/api/utils/image_proxy/exception.py
new file mode 100644
index 00000000000..903de58e99c
--- /dev/null
+++ b/api/api/utils/image_proxy/exception.py
@@ -0,0 +1,8 @@
+from rest_framework import status
+from rest_framework.exceptions import APIException
+
+
+class UpstreamThumbnailException(APIException):
+    status_code = status.HTTP_424_FAILED_DEPENDENCY
+    default_detail = "Could not render thumbnail due to upstream provider error."
+    default_code = "upstream_photon_failure"
diff --git a/api/api/utils/image_proxy/extension.py b/api/api/utils/image_proxy/extension.py
new file mode 100644
index 00000000000..ae74d7f0963
--- /dev/null
+++ b/api/api/utils/image_proxy/extension.py
@@ -0,0 +1,58 @@
+from os.path import splitext
+from urllib.parse import urlparse
+
+import django_redis
+import requests
+import sentry_sdk
+
+from api.utils.image_proxy.exception import UpstreamThumbnailException
+
+
+def get_image_extension(image_url: str, media_identifier: str) -> str | None:
+    cache = django_redis.get_redis_connection("default")
+    key = f"media:{media_identifier}:thumb_type"
+
+    ext = _get_file_extension_from_url(image_url)
+
+    if not ext:
+        # If the extension is not present in the URL, try to get it from the redis cache
+        ext = cache.get(key)
+        ext = ext.decode("utf-8") if ext else None
+
+    if not ext:
+        # If the extension is still not present, try getting it from the content type
+        try:
+            response = requests.head(image_url, timeout=10)
+            response.raise_for_status()
+        except Exception as exc:
+            sentry_sdk.capture_exception(exc)
+            raise UpstreamThumbnailException(
+                "Failed to render thumbnail due to inability to check media "
+                f"type. {exc}"
+            )
{exc}" + ) + else: + if response.headers and "Content-Type" in response.headers: + content_type = response.headers["Content-Type"] + ext = _get_file_extension_from_content_type(content_type) + else: + ext = None + + cache.set(key, ext if ext else "unknown") + return ext + + +def _get_file_extension_from_url(image_url: str) -> str: + """Return the image extension if present in the URL.""" + parsed = urlparse(image_url) + _, ext = splitext(parsed.path) + return ext[1:].lower() # remove the leading dot + + +def _get_file_extension_from_content_type(content_type: str) -> str | None: + """ + Return the image extension if present in the Response's content type + header. + """ + if content_type and "/" in content_type: + return content_type.split("/")[1] + return None diff --git a/api/api/utils/image_proxy/photon.py b/api/api/utils/image_proxy/photon.py new file mode 100644 index 00000000000..639839c85a8 --- /dev/null +++ b/api/api/utils/image_proxy/photon.py @@ -0,0 +1,43 @@ +from django.conf import settings + + +def get_photon_request_params( + parsed_image_url, + is_full_size: bool, + is_compressed: bool, + headers: dict, +): + """ + Photon options documented here: + https://developer.wordpress.com/docs/photon/api/ + """ + params = {} + + if not is_full_size: + params["w"] = settings.THUMBNAIL_WIDTH_PX + + if is_compressed: + params["quality"] = settings.THUMBNAIL_QUALITY + + if parsed_image_url.query: + # No need to URL encode this string because requests will already + # pass the `params` object to `urlencode` before it appends it to the + # request URL. + params["q"] = parsed_image_url.query + + if parsed_image_url.scheme == "https": + # Photon defaults to HTTP without this parameter + # which will cause some providers to fail (if they + # do not serve over HTTP and do not have a redirect) + params["ssl"] = "true" + + # Photon excludes the protocol, so we need to reconstruct the url + port + path + # to send as the "path" of the Photon request + domain = parsed_image_url.netloc + path = parsed_image_url.path + upstream_url = f"{settings.PHOTON_ENDPOINT}{domain}{path}" + + if settings.PHOTON_AUTH_KEY: + headers["X-Photon-Authentication"] = settings.PHOTON_AUTH_KEY + + return upstream_url, params, headers diff --git a/api/api/utils/photon.py b/api/api/utils/photon.py deleted file mode 100644 index 1e161736cd6..00000000000 --- a/api/api/utils/photon.py +++ /dev/null @@ -1,178 +0,0 @@ -import logging -from os.path import splitext -from urllib.parse import urlparse - -from django.conf import settings -from django.http import HttpResponse -from rest_framework import status -from rest_framework.exceptions import APIException, UnsupportedMediaType - -import django_redis -import requests -import sentry_sdk - -from api.utils.tallies import get_monthly_timestamp - - -parent_logger = logging.getLogger(__name__) - - -class UpstreamThumbnailException(APIException): - status_code = status.HTTP_424_FAILED_DEPENDENCY - default_detail = "Could not render thumbnail due to upstream provider error." 
-    default_code = "upstream_photon_failure"
-
-
-ALLOWED_TYPES = {"gif", "jpg", "jpeg", "png", "webp"}
-
-HEADERS = {
-    "User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format(
-        purpose="ThumbnailGeneration"
-    )
-}
-
-
-def _get_file_extension_from_url(image_url: str) -> str:
-    """Return the image extension if present in the URL."""
-    parsed = urlparse(image_url)
-    _, ext = splitext(parsed.path)
-    return ext[1:].lower()  # remove the leading dot
-
-
-def _get_file_extension_from_content_type(content_type: str) -> str | None:
-    """
-    Return the image extension if present in the Response's content type
-    header.
-    """
-    if content_type and "/" in content_type:
-        return content_type.split("/")[1]
-    return None
-
-
-def check_image_type(image_url: str, media_obj) -> None:
-    cache = django_redis.get_redis_connection("default")
-    key = f"media:{media_obj.identifier}:thumb_type"
-
-    ext = _get_file_extension_from_url(image_url)
-
-    if not ext:
-        # If the extension is not present in the URL, try to get it from the redis cache
-        ext = cache.get(key)
-        ext = ext.decode("utf-8") if ext else None
-
-    if not ext:
-        # If the extension is still not present, try getting it from the content type
-        try:
-            response = requests.head(image_url, timeout=10)
-            response.raise_for_status()
-        except Exception as exc:
-            sentry_sdk.capture_exception(exc)
-            raise UpstreamThumbnailException(
-                "Failed to render thumbnail due to inability to check media "
-                f"type. {exc}"
-            )
-        else:
-            if response.headers and "Content-Type" in response.headers:
-                content_type = response.headers["Content-Type"]
-                ext = _get_file_extension_from_content_type(content_type)
-            else:
-                ext = None
-
-    cache.set(key, ext if ext else "unknown")
-
-    if ext not in ALLOWED_TYPES:
-        raise UnsupportedMediaType(ext)
-
-
-def _get_photon_params(image_url, is_full_size, is_compressed):
-    """
-    Photon options documented here:
-    https://developer.wordpress.com/docs/photon/api/
-    """
-    params = {}
-
-    if not is_full_size:
-        params["w"] = settings.THUMBNAIL_WIDTH_PX
-
-    if is_compressed:
-        params["quality"] = settings.THUMBNAIL_QUALITY
-
-    parsed_image_url = urlparse(image_url)
-
-    if parsed_image_url.query:
-        # No need to URL encode this string because requests will already
-        # pass the `params` object to `urlencode` before it appends it to the
-        # request URL.
-        params["q"] = parsed_image_url.query
-
-    if parsed_image_url.scheme == "https":
-        # Photon defaults to HTTP without this parameter
-        # which will cause some providers to fail (if they
-        # do not serve over HTTP and do not have a redirect)
-        params["ssl"] = "true"
-
-    return params, parsed_image_url
-
-
-def get(
-    image_url: str,
-    accept_header: str = "image/*",
-    is_full_size: bool = False,
-    is_compressed: bool = True,
-) -> HttpResponse:
-    logger = parent_logger.getChild("get")
-    tallies = django_redis.get_redis_connection("tallies")
-    month = get_monthly_timestamp()
-
-    params, parsed_image_url = _get_photon_params(
-        image_url, is_full_size, is_compressed
-    )
-
-    # Photon excludes the protocol, so we need to reconstruct the url + port + path
-    # to send as the "path" of the Photon request
-    domain = parsed_image_url.netloc
-    path = parsed_image_url.path
-    upstream_url = f"{settings.PHOTON_ENDPOINT}{domain}{path}"
-
-    headers = {"Accept": accept_header} | HEADERS
-    if settings.PHOTON_AUTH_KEY:
-        headers["X-Photon-Authentication"] = settings.PHOTON_AUTH_KEY
-
-    try:
-        upstream_response = requests.get(
-            upstream_url,
-            timeout=15,
-            params=params,
-            headers=headers,
-        )
-        tallies.incr(f"thumbnail_response_code:{month}:{upstream_response.status_code}")
-        tallies.incr(
-            f"thumbnail_response_code_by_domain:{domain}:"
-            f"{month}:{upstream_response.status_code}"
-        )
-        upstream_response.raise_for_status()
-    except Exception as exc:
-        exception_name = f"{exc.__class__.__module__}.{exc.__class__.__name__}"
-        key = f"thumbnail_error:{exception_name}:{domain}:{month}"
-        count = tallies.incr(key)
-        if count <= settings.THUMBNAIL_ERROR_INITIAL_ALERT_THRESHOLD or (
-            count % settings.THUMBNAIL_ERROR_REPEATED_ALERT_FREQUENCY == 0
-        ):
-            sentry_sdk.capture_exception(exc)
-        if isinstance(exc, requests.exceptions.HTTPError):
-            tallies.incr(
-                f"thumbnail_http_error:{domain}:{month}:{exc.response.status_code}:{exc.response.text}"
-            )
-        raise UpstreamThumbnailException(f"Failed to render thumbnail. {exc}")
{exc}") - - res_status = upstream_response.status_code - content_type = upstream_response.headers.get("Content-Type") - logger.debug( - f"Image proxy response status: {res_status}, content-type: {content_type}" - ) - - return HttpResponse( - upstream_response.content, - status=res_status, - content_type=content_type, - ) diff --git a/api/api/utils/watermark.py b/api/api/utils/watermark.py index 6352ca5e58f..b111ab0d9aa 100644 --- a/api/api/utils/watermark.py +++ b/api/api/utils/watermark.py @@ -79,7 +79,9 @@ def _fit_in_width(text, font, max_width): """ char_length = font.getlength("x") # x has the closest to average width - max_chars = max_width // char_length + max_chars = int( + max_width // char_length + ) # Must be an integer to be used with `wrap` below text = "\n".join(["\n".join(wrap(line, max_chars)) for line in text.split("\n")]) diff --git a/api/api/views/media_views.py b/api/api/views/media_views.py index 6a6d69fea92..def37d283e5 100644 --- a/api/api/views/media_views.py +++ b/api/api/views/media_views.py @@ -9,7 +9,7 @@ from api.controllers import search_controller from api.models import ContentProvider from api.serializers.provider_serializers import ProviderSerializer -from api.utils import photon +from api.utils import image_proxy from api.utils.pagination import StandardPagination @@ -164,10 +164,9 @@ def thumbnail(self, request, media_obj, image_url): serializer = self.get_serializer(data=request.query_params) serializer.is_valid(raise_exception=True) - photon.check_image_type(image_url, media_obj) - - return photon.get( + return image_proxy.get( image_url, + media_obj.identifier, accept_header=request.headers.get("Accept", "image/*"), **serializer.validated_data, ) diff --git a/api/test/factory/es_http.py b/api/test/factory/es_http.py new file mode 100644 index 00000000000..f80de2e0680 --- /dev/null +++ b/api/test/factory/es_http.py @@ -0,0 +1,81 @@ +from uuid import uuid4 + + +MOCK_LIVE_RESULT_URL_PREFIX = "https://example.com/openverse-live-image-result-url" +MOCK_DEAD_RESULT_URL_PREFIX = "https://example.com/openverse-dead-image-result-url" + + +def create_mock_es_http_image_hit(_id: str, index: str, live: bool = True): + return { + "_index": index, + "_type": "_doc", + "_id": _id, + "_score": 7.607353, + "_source": { + "thumbnail": None, + "aspect_ratio": "wide", + "extension": "jpg", + "size": "large", + "authority_boost": 85, + "max_boost": 85, + "min_boost": 1, + "id": _id, + "identifier": str(uuid4()), + "title": "Bird Nature Photo", + "foreign_landing_url": "https://example.com/photo/LYTN21EBYO", + "creator": "Nature's Beauty", + "creator_url": "https://example.com/author/121424", + "url": f"{MOCK_LIVE_RESULT_URL_PREFIX}/{_id}" + if live + else f"{MOCK_DEAD_RESULT_URL_PREFIX}/{_id}", + "license": "cc0", + "license_version": "1.0", + "license_url": "https://creativecommons.org/publicdomain/zero/1.0/", + "provider": "example", + "source": "example", + "category": "photograph", + "created_on": "2022-02-26T08:48:33+00:00", + "tags": [{"name": "bird"}], + "mature": False, + }, + } + + +def create_mock_es_http_image_search_response( + index: str, + total_hits: int, + hit_count: int, + live_hit_count: int | None = None, + base_hits=None, +): + live_hit_count = live_hit_count if live_hit_count is not None else hit_count + base_hits = base_hits or [] + + live_hits = [ + create_mock_es_http_image_hit( + _id=len(base_hits) + i, + index=index, + live=True, + ) + for i in range(live_hit_count) + ] + + dead_hits = [ + create_mock_es_http_image_hit( + _id=len(live_hits) + 
+            index=index,
+            live=False,
+        )
+        for i in range(hit_count - live_hit_count)
+    ]
+
+    return {
+        "took": 3,
+        "timed_out": False,
+        "_shards": {"total": 18, "successful": 18, "skipped": 0, "failed": 0},
+        "hits": {
+            "total": {"value": total_hits, "relation": "eq"},
+            "max_score": 11.0007305,
+            "hits": base_hits + live_hits + dead_hits,
+        },
+    }
diff --git a/api/test/factory/models/media.py b/api/test/factory/models/media.py
index 8f47d12c304..8fc6505d602 100644
--- a/api/test/factory/models/media.py
+++ b/api/test/factory/models/media.py
@@ -156,7 +156,7 @@ def _save_model_to_es(
         source_document = cls._create_es_source_document(media, mature)
         es.create(
             index=origin_index,
-            id=media.pk,
+            id=str(media.pk),
             document=source_document,
             refresh=True,
         )
@@ -172,7 +172,7 @@ def _save_model_to_es(
             {
                 "_index": origin_index,
                 "_score": 1.0,
-                "_id": media.pk,
+                "_id": str(media.pk),
                 "_source": source_document,
             }
         )
diff --git a/api/test/test_dead_link_filter.py b/api/test/test_dead_link_filter.py
index 6c3d4fc0371..4c682dcc687 100644
--- a/api/test/test_dead_link_filter.py
+++ b/api/test/test_dead_link_filter.py
@@ -18,9 +18,6 @@ def redis(monkeypatch) -> FakeRedis:
     def get_redis_connection(*args, **kwargs):
         return fake_redis

-    monkeypatch.setattr(
-        "api.utils.dead_link_mask.get_redis_connection", get_redis_connection
-    )
     monkeypatch.setattr("django_redis.get_redis_connection", get_redis_connection)

     yield fake_redis
diff --git a/api/test/unit/conftest.py b/api/test/unit/conftest.py
index 6f41dc40b68..f838fb61961 100644
--- a/api/test/unit/conftest.py
+++ b/api/test/unit/conftest.py
@@ -7,6 +7,7 @@
 import pytest
 from elasticsearch import Elasticsearch
+from fakeredis import FakeRedis

 from api.serializers.audio_serializers import (
     AudioSearchRequestSerializer,
@@ -22,6 +23,19 @@
 )


+@pytest.fixture()
+def redis(monkeypatch) -> FakeRedis:
+    fake_redis = FakeRedis()
+
+    def get_redis_connection(*args, **kwargs):
+        return fake_redis
+
+    monkeypatch.setattr("django_redis.get_redis_connection", get_redis_connection)
+
+    yield fake_redis
+    fake_redis.client().close()
+
+
 @pytest.fixture
 def api_client():
     return APIClient()
@@ -54,8 +68,8 @@ class MediaTypeConfig:
     model_serializer: MediaSerializer


-MEDIA_TYPE_CONFIGS = (
-    MediaTypeConfig(
+MEDIA_TYPE_CONFIGS = {
+    "image": MediaTypeConfig(
         media_type="image",
         url_prefix="images",
         origin_index="image",
@@ -65,7 +79,7 @@ class MediaTypeConfig:
         search_request_serializer=ImageSearchRequestSerializer,
         model_serializer=ImageSerializer,
     ),
-    MediaTypeConfig(
+    "audio": MediaTypeConfig(
         media_type="audio",
         url_prefix="audio",
         origin_index="audio",
@@ -75,11 +89,22 @@ class MediaTypeConfig:
         search_request_serializer=AudioSearchRequestSerializer,
         model_serializer=AudioSerializer,
     ),
-)
+}
+
+
+@pytest.fixture
+def image_media_type_config():
+    return MEDIA_TYPE_CONFIGS["image"]
+
+
+@pytest.fixture
+def audio_media_type_config():
+    return MEDIA_TYPE_CONFIGS["audio"]


 @pytest.fixture(
-    params=MEDIA_TYPE_CONFIGS, ids=lambda x: f"{x.media_type}_media_type_config"
+    params=MEDIA_TYPE_CONFIGS.values(),
+    ids=lambda x: f"{x.media_type}_media_type_config",
 )
 def media_type_config(request: pytest.FixtureRequest) -> MediaTypeConfig:
     return request.param
diff --git a/api/test/unit/controllers/test_search_controller.py b/api/test/unit/controllers/test_search_controller.py
index bc38c0940a8..21feb62d187 100644
--- a/api/test/unit/controllers/test_search_controller.py
+++ b/api/test/unit/controllers/test_search_controller.py
@@ -1,9 +1,16 @@
 import random
+import re
 from collections.abc import Callable
 from enum import Enum, auto
+from test.factory.es_http import (
+    MOCK_DEAD_RESULT_URL_PREFIX,
+    MOCK_LIVE_RESULT_URL_PREFIX,
+    create_mock_es_http_image_search_response,
+)
 from unittest import mock
 from uuid import uuid4

+import pook
 import pytest
 from django_redis import get_redis_connection
 from elasticsearch_dsl import Search
@@ -11,6 +18,7 @@
 from api.controllers import search_controller
 from api.utils import tallies
 from api.utils.dead_link_mask import get_query_hash, save_query_mask
+from api.utils.search_context import SearchContext


 pytestmark = pytest.mark.django_db
@@ -539,3 +547,212 @@ def test_resolves_index(
     )

     search_class.assert_called_once_with(index=searched_index)
+
+
+@mock.patch(
+    "api.controllers.search_controller._post_process_results",
+    wraps=search_controller._post_process_results,
+)
+@mock.patch("api.controllers.search_controller.SearchContext")
+@pook.on
+def test_no_post_process_results_recursion(
+    mock_search_context,
+    wrapped_post_process_results,
+    image_media_type_config,
+    settings,
+    # request the redis mock to auto-clean Redis between each test run
+    # otherwise the dead link query mask causes test details to leak
+    # between each run
+    redis,
+):
+    # Search context does not matter for this test, so we can mock it
+    # to avoid needing to account for additional ES requests
+    mock_search_context.build.return_value = SearchContext(set(), set())
+
+    hit_count = 5
+    mock_es_response = create_mock_es_http_image_search_response(
+        index=image_media_type_config.origin_index,
+        total_hits=45,
+        hit_count=hit_count,
+    )
+
+    es_host = settings.ES.transport.kwargs["host"]
+    es_port = settings.ES.transport.kwargs["port"]
+
+    # `origin_index` enforced by passing `exact_index=True` below.
+    es_endpoint = (
+        f"http://{es_host}:{es_port}/{image_media_type_config.origin_index}/_search"
+    )
+
+    mock_search = pook.post(es_endpoint).times(1).reply(200).json(mock_es_response).mock
+
+    # Ensure dead link filtering does not remove any results
+    pook.head(
+        pook.regex(rf"{MOCK_LIVE_RESULT_URL_PREFIX}/\d"),
+    ).times(
+        hit_count
+    ).reply(200)
+
+    serializer = image_media_type_config.search_request_serializer(
+        # This query string does not matter, ultimately, as pook is mocking
+        # the ES response regardless of the input
+        data={"q": "bird perched"}
+    )
+    serializer.is_valid()
+    results, _, _, _ = search_controller.search(
+        search_params=serializer,
+        ip=0,
+        origin_index=image_media_type_config.origin_index,
+        exact_index=True,
+        page=3,
+        page_size=20,
+        filter_dead=True,
+    )
+
+    assert {r["_source"]["identifier"] for r in mock_es_response["hits"]["hits"]} == {
+        r.identifier for r in results
+    }
+
+    assert wrapped_post_process_results.call_count == 1
+    assert mock_search.total_matches == 1
+
+
+@pytest.mark.parametrize(
+    # both scenarios force `post_process_results`
+    # to recurse to fill the page due to the dead link
+    # configuration present in the test body
+    "page, page_size, mock_total_hits",
+    # Note the following
+    # - DEAD_LINK_RATIO causes all query sizes to start at double the page size
+    # - The test function is configured so that each request only returns 2 live
+    #   results
+    # - We clear the redis cache between each test, meaning there is no query-based
+    #   dead link mask. This forces `from` to 0 for each case.
+    # - Recursion should only continue while results still exist that could fulfill
+    #   the requested page size. Once the available hits are exhausted, the function
+    #   should stop recursing. This is why exceeding or matching the available
+    #   hits is significant.
+    (
+        # First request: from: 0, size: 10
+        # Second request: from: 0, size: 15, exceeds max results
+        pytest.param(1, 5, 12, id="first_page"),
+        # First request: from: 0, size: 22
+        # Second request: from: 0, size: 33, exceeds max results
+        pytest.param(3, 4, 32, id="last_page"),
+        # First request: from: 0, size: 22
+        # Second request: from: 0, size: 33, matches max results
+        pytest.param(3, 4, 33, id="last_page_with_exact_max_results"),
+    ),
+)
+@mock.patch(
+    "api.controllers.search_controller._post_process_results",
+    wraps=search_controller._post_process_results,
+)
+@mock.patch("api.controllers.search_controller.SearchContext")
+@pook.on
+def test_post_process_results_recurses_as_needed(
+    mock_search_context,
+    wrapped_post_process_results,
+    image_media_type_config,
+    settings,
+    page,
+    page_size,
+    mock_total_hits,
+    # request the redis mock to auto-clean Redis between each test run
+    # otherwise the dead link query mask causes test details to leak
+    # between each run
+    redis,
+):
+    # Search context does not matter for this test, so we can mock it
+    # to avoid needing to account for additional ES requests
+    mock_search_context.build.return_value = SearchContext(set(), set())
+
+    mock_es_response_1 = create_mock_es_http_image_search_response(
+        index=image_media_type_config.origin_index,
+        total_hits=mock_total_hits,
+        hit_count=10,
+        live_hit_count=2,
+    )
+
+    mock_es_response_2 = create_mock_es_http_image_search_response(
+        index=image_media_type_config.origin_index,
+        total_hits=mock_total_hits,
+        hit_count=4,
+        live_hit_count=2,
+        base_hits=mock_es_response_1["hits"]["hits"],
+    )
+
+    es_host = settings.ES.transport.kwargs["host"]
+    es_port = settings.ES.transport.kwargs["port"]

+    # `origin_index` enforced by passing `exact_index=True` below.
+    es_endpoint = (
+        f"http://{es_host}:{es_port}/{image_media_type_config.origin_index}/_search"
+    )
+
+    # `from` is always 0 if there is no query mask
+    # see `_paginate_with_dead_link_mask` branch 1
+    # Testing this with a query mask would introduce far more complexity
+    # with no significant benefit
+
+    mock_first_es_request = (
+        pook.post(es_endpoint)
+        # The dead link ratio causes the initial query size to double
+        .body(re.compile(f'size":{(page_size * page) * 2}'))
+        .body(re.compile('from":0'))
+        .times(1)
+        .reply(200)
+        .json(mock_es_response_1)
+        .mock
+    )
+
+    mock_second_es_request = (
+        pook.post(es_endpoint)
+        # Size is clamped to the total number of available hits
+        .body(re.compile(f'size":{mock_total_hits}'))
+        .body(re.compile('from":0'))
+        .times(1)
+        .reply(200)
+        .json(mock_es_response_2)
+        .mock
+    )
+
+    live_results = [
+        r
+        for r in mock_es_response_2["hits"]["hits"]
+        if r["_source"]["url"].startswith(MOCK_LIVE_RESULT_URL_PREFIX)
+    ]
+
+    pook.head(pook.regex(rf"{MOCK_LIVE_RESULT_URL_PREFIX}/\d")).times(
+        len(live_results)
+    ).reply(200)
+
+    pook.head(pook.regex(rf"{MOCK_DEAD_RESULT_URL_PREFIX}/\d")).times(
+        len(mock_es_response_2["hits"]["hits"]) - len(live_results)
+    ).reply(400)
+
+    serializer = image_media_type_config.search_request_serializer(
+        # This query string does not matter, ultimately, as pook is mocking
+        # the ES response regardless of the input
+        data={"q": "bird perched"}
+    )
+    serializer.is_valid()
+    results, _, _, _ = search_controller.search(
+        search_params=serializer,
+        ip=0,
+        origin_index=image_media_type_config.origin_index,
+        exact_index=True,
+        page=page,
+        page_size=page_size,
+        filter_dead=True,
+    )
+
+    assert mock_first_es_request.total_matches == 1
+    assert mock_second_es_request.total_matches == 1
+
+    assert {r["_source"]["identifier"] for r in live_results} == {
+        r.identifier for r in results
+    }
+
+    assert wrapped_post_process_results.call_count == 2
diff --git a/api/test/unit/utils/test_photon.py b/api/test/unit/utils/test_image_proxy.py
similarity index 83%
rename from api/test/unit/utils/test_photon.py
rename to api/test/unit/utils/test_image_proxy.py
index 7ea41252b2b..655c84f4925 100644
--- a/api/test/unit/utils/test_photon.py
+++ b/api/test/unit/utils/test_image_proxy.py
@@ -10,18 +10,14 @@
 import pytest
 import requests

-from api.utils.photon import (
-    HEADERS,
-    UpstreamThumbnailException,
-    _get_file_extension_from_url,
-    check_image_type,
-)
-from api.utils.photon import get as photon_get
+from api.utils.image_proxy import HEADERS, UpstreamThumbnailException, extension
+from api.utils.image_proxy import get as photon_get
 from api.utils.tallies import get_monthly_timestamp

 PHOTON_URL_FOR_TEST_IMAGE = f"{settings.PHOTON_ENDPOINT}subdomain.example.com/path_part1/part2/image_dot_jpg.jpg"
 TEST_IMAGE_URL = PHOTON_URL_FOR_TEST_IMAGE.replace(settings.PHOTON_ENDPOINT, "http://")
+TEST_MEDIA_IDENTIFIER = "123"

 UA_HEADER = HEADERS["User-Agent"]

@@ -31,6 +27,11 @@
 # this will get the tests working.
 MOCK_BODY = "mock response body"

+SVG_BODY = """<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><rect width="100" height="100" /></svg>
+"""
+

 @pytest.fixture
 def auth_key():
@@ -59,13 +60,31 @@ def test_get_successful_no_auth_key_default_args(mock_image_data):
         .mock
     )

-    res = photon_get(TEST_IMAGE_URL)
+    res = photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER)

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
     assert mock_get.matched


+@pook.on
+def test_get_successful_original_svg_no_auth_key_default_args(mock_image_data):
+    mock_get: pook.Mock = (
+        pook.get(TEST_IMAGE_URL.replace(".jpg", ".svg"))
+        .header("User-Agent", UA_HEADER)
+        .header("Accept", "image/*")
+        .reply(200)
+        .body(SVG_BODY)
+        .mock
+    )
+
+    res = photon_get(TEST_IMAGE_URL.replace(".jpg", ".svg"), TEST_MEDIA_IDENTIFIER)
+
+    assert res.content == SVG_BODY.encode()
+    assert res.status_code == 200
+    assert mock_get.matched
+
+
 @pook.on
 def test_get_successful_with_auth_key_default_args(mock_image_data, auth_key):
     mock_get: pook.Mock = (
@@ -84,7 +103,7 @@ def test_get_successful_with_auth_key_default_args(mock_image_data, auth_key):
         .mock
     )

-    res = photon_get(TEST_IMAGE_URL)
+    res = photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER)

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
@@ -107,7 +126,7 @@ def test_get_successful_no_auth_key_not_compressed(mock_image_data):
         .mock
     )

-    res = photon_get(TEST_IMAGE_URL, is_compressed=False)
+    res = photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER, is_compressed=False)

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
@@ -130,7 +149,7 @@ def test_get_successful_no_auth_key_full_size(mock_image_data):
         .mock
     )

-    res = photon_get(TEST_IMAGE_URL, is_full_size=True)
+    res = photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER, is_full_size=True)

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
@@ -148,7 +167,9 @@ def test_get_successful_no_auth_key_full_size_not_compressed(mock_image_data):
         .mock
     )

-    res = photon_get(TEST_IMAGE_URL, is_full_size=True, is_compressed=False)
+    res = photon_get(
+        TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER, is_full_size=True, is_compressed=False
+    )

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
@@ -172,7 +193,7 @@ def test_get_successful_no_auth_key_png_only(mock_image_data):
         .mock
     )

-    res = photon_get(TEST_IMAGE_URL, accept_header="image/png")
+    res = photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER, accept_header="image/png")

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
@@ -200,7 +221,7 @@ def test_get_successful_forward_query_params(mock_image_data):

     url_with_params = f"{TEST_IMAGE_URL}?{params}"

-    res = photon_get(url_with_params)
+    res = photon_get(url_with_params, TEST_MEDIA_IDENTIFIER)

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
@@ -235,7 +256,7 @@ def test_get_successful_records_response_code(mock_image_data, redis):
         .mock
     )

-    photon_get(TEST_IMAGE_URL)
+    photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER)
     month = get_monthly_timestamp()
     assert redis.get(f"thumbnail_response_code:{month}:200") == b"1"
     assert (
@@ -290,7 +311,7 @@ def test_get_exception_handles_error(
     redis.set(key, count_start)

     with pytest.raises(UpstreamThumbnailException):
-        photon_get(TEST_IMAGE_URL)
+        photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER)

     assert_func = (
         capture_exception.assert_called_once
@@ -331,7 +352,7 @@ def test_get_http_exception_handles_error(
     redis.set(key, count_start)

     with pytest.raises(UpstreamThumbnailException):
-        photon_get(TEST_IMAGE_URL)
+        photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER)
     assert_func = (
         capture_exception.assert_called_once
@@ -369,7 +390,7 @@ def test_get_successful_https_image_url_sends_ssl_parameter(mock_image_data):
         .mock
     )

-    res = photon_get(https_url)
+    res = photon_get(https_url, TEST_MEDIA_IDENTIFIER)

     assert res.content == MOCK_BODY.encode()
     assert res.status_code == 200
@@ -383,7 +404,7 @@ def test_get_unsuccessful_request_raises_custom_exception():
     with pytest.raises(
         UpstreamThumbnailException, match=r"Failed to render thumbnail."
     ):
-        photon_get(TEST_IMAGE_URL)
+        photon_get(TEST_IMAGE_URL, TEST_MEDIA_IDENTIFIER)

     assert mock_get.matched

@@ -404,34 +425,34 @@ def test_get_unsuccessful_request_raises_custom_exception():
     ],
 )
 def test__get_extension_from_url(image_url, expected_ext):
-    assert _get_file_extension_from_url(image_url) == expected_ext
+    assert extension._get_file_extension_from_url(image_url) == expected_ext


 @pytest.mark.django_db
-@pytest.mark.parametrize("image_type", ["apng", "svg", "bmp"])
-def test_check_image_type_raises_by_not_allowed_types(image_type):
+@pytest.mark.parametrize("image_type", ["apng", "tiff", "bmp"])
+def test_photon_get_raises_by_not_allowed_types(image_type):
     image_url = TEST_IMAGE_URL.replace(".jpg", f".{image_type}")
     image = ImageFactory.create(url=image_url)

     with pytest.raises(UnsupportedMediaType):
-        check_image_type(image_url, image)
+        photon_get(image_url, image.identifier)


 @pytest.mark.django_db
 @pytest.mark.parametrize(
     "headers, expected_cache_val",
     [
-        ({"Content-Type": "image/svg+xml"}, b"svg+xml"),
+        ({"Content-Type": "image/tiff"}, b"tiff"),
         (None, b"unknown"),
     ],
 )
-def test_check_image_type_saves_image_type_to_cache(redis, headers, expected_cache_val):
+def test_photon_get_saves_image_type_to_cache(redis, headers, expected_cache_val):
     image_url = TEST_IMAGE_URL.replace(".jpg", "")
     image = ImageFactory.create(url=image_url)

     with mock.patch("requests.head") as mock_head, pytest.raises(UnsupportedMediaType):
         mock_head.return_value.headers = headers
-        check_image_type(image_url, image)
+        photon_get(image_url, image.identifier)

     key = f"media:{image.identifier}:thumb_type"
     assert redis.get(key) == expected_cache_val
diff --git a/api/test/unit/utils/test_watermark.py b/api/test/unit/utils/test_watermark.py
index f9a8c994d44..0cb5b7f87e7 100644
--- a/api/test/unit/utils/test_watermark.py
+++ b/api/test/unit/utils/test_watermark.py
@@ -51,3 +51,18 @@ def test_sends_UA_header(requests):
     assert len(requests.requests) > 0
     for r in requests.requests:
         assert r.headers == HEADERS
+
+
+# Previously, wrapped titles would throw a TypeError:
+# slice indices must be integers or None or have an __index__ method.
+# See: https://github.com/WordPress/openverse/issues/2466
+def test_long_title_wraps_correctly(requests):
+    # Make the title 400 chars long
+    _MOCK_IMAGE_INFO_LONG_TITLE = dict(_MOCK_IMAGE_INFO)
+    _MOCK_IMAGE_INFO_LONG_TITLE["title"] = "a" * 400
+
+    watermark("http://example.com/", _MOCK_IMAGE_INFO_LONG_TITLE)
+
+    assert len(requests.requests) > 0
+    for r in requests.requests:
+        assert r.headers == HEADERS
diff --git a/catalog/dags/common/constants.py b/catalog/dags/common/constants.py
index dace6d3b246..6cdef64b26f 100644
--- a/catalog/dags/common/constants.py
+++ b/catalog/dags/common/constants.py
@@ -36,3 +36,4 @@
 )
 AWS_CONN_ID = os.getenv("AWS_CONN_ID", "aws_conn_id")
 AWS_RDS_CONN_ID = os.environ.get("AWS_RDS_CONN_ID", AWS_CONN_ID)
+ES_PROD_HTTP_CONN_ID = "elasticsearch_http_production"
diff --git a/catalog/dags/common/ingestion_server.py b/catalog/dags/common/ingestion_server.py
index 07043466d67..f3ac3eeff47 100644
--- a/catalog/dags/common/ingestion_server.py
+++ b/catalog/dags/common/ingestion_server.py
@@ -8,13 +8,15 @@
 from airflow.providers.http.sensors.http import HttpSensor
 from requests import Response

-from common.constants import XCOM_PULL_TEMPLATE
+from common.constants import ES_PROD_HTTP_CONN_ID, XCOM_PULL_TEMPLATE


 logger = logging.getLogger(__name__)

 POKE_INTERVAL = int(os.getenv("DATA_REFRESH_POKE_INTERVAL", 60 * 15))
+# Minimum number of records we expect to get back from ES when querying an index.
+THRESHOLD_RESULT_COUNT = int(os.getenv("ES_INDEX_READINESS_RECORD_COUNT", 10_000))


 def response_filter_stat(response: Response) -> str:
@@ -65,6 +67,22 @@ def response_check_wait_for_completion(response: Response) -> bool:
     return True


+def response_check_index_readiness_check(response: Response) -> bool:
+    """
+    Handle the response for the `index_readiness_check` sensor, used to await
+    a healthy Elasticsearch index. We expect to retrieve at least
+    THRESHOLD_RESULT_COUNT results.
+    """
+    data = response.json()
+    hits = data.get("hits", {}).get("total", {}).get("value", 0)
+    logger.info(
+        f"Retrieved {hits} records from Elasticsearch using the new index."
+        f" Checking against threshold of {THRESHOLD_RESULT_COUNT}."
+    )
+
+    return hits >= THRESHOLD_RESULT_COUNT
+
+
 def get_current_index(target_alias: str) -> SimpleHttpOperator:
     return SimpleHttpOperator(
         task_id="get_current_index",
@@ -125,3 +143,25 @@ def trigger_and_wait_for_task(
     waiter = wait_for_task(action, trigger, timeout, poke_interval)
     trigger >> waiter
     return trigger, waiter
+
+
+def index_readiness_check(
+    media_type: str,
+    index_suffix: str,
+    timeout: timedelta = timedelta(days=1),
+    poke_interval: int = POKE_INTERVAL,
+) -> HttpSensor:
+    """
+    Poll the Elasticsearch index, returning True only when the query returns
+    at least THRESHOLD_RESULT_COUNT results.
+    """
+    return HttpSensor(
+        task_id="index_readiness_check",
+        http_conn_id=ES_PROD_HTTP_CONN_ID,
+        endpoint=f"{media_type}-{index_suffix}/_search",
+        method="GET",
+        response_check=response_check_index_readiness_check,
+        mode="reschedule",
+        poke_interval=poke_interval,
+        timeout=timeout.total_seconds(),
+    )
diff --git a/catalog/dags/data_refresh/create_filtered_index_dag.py b/catalog/dags/data_refresh/create_filtered_index_dag.py
index b2978376b2c..4597540e9fe 100644
--- a/catalog/dags/data_refresh/create_filtered_index_dag.py
+++ b/catalog/dags/data_refresh/create_filtered_index_dag.py
@@ -206,6 +206,12 @@ def point_alias(destination_index_suffix: str):
             destination_index_suffix=destination_index_suffix,
         )

+        # Await healthy results from the newly created elasticsearch index.
+        index_readiness_check = ingestion_server.index_readiness_check(
+            media_type=media_type,
+            index_suffix=destination_index_suffix,
+        )
+
         do_point_alias = point_alias(destination_index_suffix=destination_index_suffix)

         delete_old_index = ingestion_server.trigger_task(
@@ -235,7 +241,7 @@ def point_alias(destination_index_suffix: str):
         )

         get_current_index_if_exists >> continue_if_no_current_index >> do_create
-        await_create >> do_point_alias
+        await_create >> index_readiness_check >> do_point_alias

         [get_current_index_if_exists, do_point_alias] >> delete_old_index

diff --git a/catalog/dags/data_refresh/data_refresh_task_factory.py b/catalog/dags/data_refresh/data_refresh_task_factory.py
index abbae1b780d..9ac2fa2bf87 100644
--- a/catalog/dags/data_refresh/data_refresh_task_factory.py
+++ b/catalog/dags/data_refresh/data_refresh_task_factory.py
@@ -152,25 +152,48 @@ def create_data_refresh_task_group(
     )
     tasks.append(generate_index_suffix)

-    action_data_map: dict[str, dict] = {
-        "ingest_upstream": {},
-        "promote": {"alias": target_alias},
-    }
-    for action, action_post_data in action_data_map.items():
-        with TaskGroup(group_id=action) as task_group:
-            ingestion_server.trigger_and_wait_for_task(
-                action=action,
-                model=data_refresh.media_type,
-                data={
-                    "index_suffix": XCOM_PULL_TEMPLATE.format(
-                        generate_index_suffix.task_id, "return_value"
-                    ),
-                }
-                | action_post_data,
-                timeout=data_refresh.data_refresh_timeout,
-            )
-
-        tasks.append(task_group)
+    # Trigger the 'ingest_upstream' task on the ingestion server and await its
+    # completion. This task copies the media table for the given model from the
+    # Catalog into the API DB and builds the elasticsearch index. The new table
+    # and index are not promoted until a later step.
+    with TaskGroup(group_id="ingest_upstream") as ingest_upstream_tasks:
+        ingestion_server.trigger_and_wait_for_task(
+            action="ingest_upstream",
+            model=data_refresh.media_type,
+            data={
+                "index_suffix": XCOM_PULL_TEMPLATE.format(
+                    generate_index_suffix.task_id, "return_value"
+                ),
+            },
+            timeout=data_refresh.data_refresh_timeout,
+        )
+    tasks.append(ingest_upstream_tasks)
+
+    # Await healthy results from the newly created elasticsearch index.
+    index_readiness_check = ingestion_server.index_readiness_check(
+        media_type=data_refresh.media_type,
+        index_suffix=XCOM_PULL_TEMPLATE.format(
+            generate_index_suffix.task_id, "return_value"
+        ),
+        timeout=data_refresh.index_readiness_timeout,
+    )
+    tasks.append(index_readiness_check)
+
+    # Trigger the `promote` task on the ingestion server and await its completion.
+    # This task promotes the newly created API DB table and elasticsearch index.
+    with TaskGroup(group_id="promote") as promote_tasks:
+        ingestion_server.trigger_and_wait_for_task(
+            action="promote",
+            model=data_refresh.media_type,
+            data={
+                "index_suffix": XCOM_PULL_TEMPLATE.format(
+                    generate_index_suffix.task_id, "return_value"
+                ),
+                "alias": target_alias,
+            },
+            timeout=data_refresh.data_refresh_timeout,
+        )
+    tasks.append(promote_tasks)

     # Delete the alias' previous target index, now unused.
     delete_old_index = ingestion_server.trigger_task(
diff --git a/catalog/dags/data_refresh/data_refresh_types.py b/catalog/dags/data_refresh/data_refresh_types.py
index 4426453cad8..f54e7c4d33f 100644
--- a/catalog/dags/data_refresh/data_refresh_types.py
+++ b/catalog/dags/data_refresh/data_refresh_types.py
@@ -38,6 +38,8 @@ class DataRefresh:
                                       may take
     create_materialized_view_timeout: timedelta expressing amount of time the creation
                                       of the matview may take
+    index_readiness_timeout: timedelta expressing amount of time it may take
+                                      to await a healthy ES index after reindexing
     doc_md: str used for the DAG's documentation markdown
     """
@@ -52,6 +54,7 @@ class DataRefresh:
     create_pop_constants_view_timeout: timedelta = timedelta(hours=1)
     create_materialized_view_timeout: timedelta = timedelta(hours=1)
     create_filtered_index_timeout: timedelta = timedelta(days=1)
+    index_readiness_timeout: timedelta = timedelta(days=1)

     def __post_init__(self):
         self.dag_id = f"{self.media_type}_data_refresh"
diff --git a/catalog/docker-compose.yml b/catalog/docker-compose.yml
deleted file mode 100644
index 12d09320def..00000000000
--- a/catalog/docker-compose.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-version: "3"
-
-services:
-  scheduler:
-    image: ghcr.io/wordpress/openverse-catalog:${DOCKER_IMAGE_TAG:-latest}
-    env_file:
-      - .env
-    restart: always
-    environment:
-      # Upgrade the DB on startup
-      _AIRFLOW_DB_UPGRADE: "true"
-    command: scheduler
-    expose:
-      - "8793" # Used for fetching logs
-    volumes:
-      - airflow:/var/workflow_output
-      - ./dags:/opt/airflow/catalog/dags
-    logging:
-      options:
-        max-size: "100m"
-        max-file: "10"
-
-  webserver:
-    image: ghcr.io/wordpress/openverse-catalog:${DOCKER_IMAGE_TAG:-latest}
-    depends_on:
-      - scheduler
-    env_file:
-      - .env
-    restart: always
-    command: webserver
-    ports:
-      - "${AIRFLOW_PORT}:8080"
-    volumes:
-      - ./dags:/opt/airflow/catalog/dags
-    logging:
-      options:
-        max-size: "100m"
-        max-file: "10"
-
-volumes:
-  airflow:
diff --git a/catalog/env.template b/catalog/env.template
index e4edcc90f5e..35b92826481 100644
--- a/catalog/env.template
+++ b/catalog/env.template
@@ -55,6 +55,9 @@
 AIRFLOW_CONN_POSTGRES_OPENLEDGER_API_STAGING=postgres://deploy:deploy@db:5432/op
 OPENLEDGER_CONN_ID=postgres_openledger_upstream
 TEST_CONN_ID=postgres_openledger_testing

+# Elasticsearch connection. Change the following line in prod to use the appropriate Elasticsearch cluster.
+AIRFLOW_CONN_ELASTICSEARCH_HTTP_PRODUCTION=http://es:9200
+
 # API DB connection. Change the following line in prod to use the appropriate DB
 AIRFLOW_CONN_POSTGRES_OPENLEDGER_API=postgres://deploy:deploy@db:5432/openledger
 OPENLEDGER_API_CONN_ID=postgres_openledger_api
@@ -113,6 +116,9 @@
 DATA_REFRESH_POOL=default_pool
 DEFAULT_RETRY_COUNT = 2
 # Whether to enable catchup for dated DAGs, allowing automatic backfill.
 AIRFLOW_VAR_CATCHUP_ENABLED=false
+# Number of records to expect in a healthy ES index. Used during the data refresh to
+# verify that a new index is healthy before promoting.
+ES_INDEX_READINESS_RECORD_COUNT=1000

 AIRFLOW_VAR_AIRFLOW_RDS_ARN=unset
 AIRFLOW_VAR_AIRFLOW_RDS_SNAPSHOTS_TO_RETAIN=7
diff --git a/catalog/tests/dags/common/test_ingestion_server.py b/catalog/tests/dags/common/test_ingestion_server.py
index 086a0519e09..6f78e341dde 100644
--- a/catalog/tests/dags/common/test_ingestion_server.py
+++ b/catalog/tests/dags/common/test_ingestion_server.py
@@ -1,11 +1,47 @@
-from unittest.mock import MagicMock
+from datetime import timedelta
+from unittest import mock

 import pytest
+import requests
 from airflow.exceptions import AirflowSkipException
+from airflow.models import DagRun, TaskInstance
+from airflow.models.dag import DAG
+from airflow.utils.session import create_session
+from airflow.utils.state import DagRunState, TaskInstanceState
+from airflow.utils.timezone import datetime
+from airflow.utils.types import DagRunType

 from common import ingestion_server


+TEST_START_DATE = datetime(2022, 2, 1, 0, 0, 0)
+TEST_DAG_ID = "api_healthcheck_test_dag"
+
+
+@pytest.fixture(autouse=True)
+def clean_db():
+    with create_session() as session:
+        # synchronize_session='fetch' required here to refresh models
+        # https://stackoverflow.com/a/51222378 CC BY-SA 4.0
+        session.query(DagRun).filter(DagRun.dag_id.startswith(TEST_DAG_ID)).delete(
+            synchronize_session="fetch"
+        )
+        session.query(TaskInstance).filter(
+            TaskInstance.dag_id.startswith(TEST_DAG_ID)
+        ).delete(synchronize_session="fetch")
+
+
+@pytest.fixture()
+def index_readiness_dag():
+    # Create a DAG that just has an index_readiness_check task
+    with DAG(dag_id=TEST_DAG_ID, schedule=None, start_date=TEST_START_DATE) as dag:
+        ingestion_server.index_readiness_check(
+            media_type="image", index_suffix="my_test_suffix", timeout=timedelta(days=1)
+        )
+
+    return dag
+
+
 @pytest.mark.parametrize(
     "data, expected",
     [
@@ -19,7 +55,60 @@
     ],
 )
 def test_response_filter_stat(data, expected):
-    response = MagicMock()
+    response = mock.MagicMock()
     response.json.return_value = data
     actual = ingestion_server.response_filter_stat(response)
     assert actual == expected
+
+
+@pytest.mark.parametrize(
+    "response_code, response_json, expected_status",
+    [
+        pytest.param(
+            200,
+            {"hits": {"total": {"value": 20_000, "relation": "eq"}}},
+            TaskInstanceState.SUCCESS,
+            id="healthy-index",
+        ),
+        pytest.param(
+            200,
+            {"hits": {"total": {"value": 100, "relation": "eq"}}},
+            TaskInstanceState.UP_FOR_RESCHEDULE,
+            id="not-enough-records",
+        ),
+        pytest.param(
+            200, {"foo": "bar"}, TaskInstanceState.UP_FOR_RESCHEDULE, id="missing-hits"
+        ),
+        pytest.param(
+            404,
+            {"error": {"root_cause": [{"type": "index_not_found_exception"}]}},
+            TaskInstanceState.UP_FOR_RESCHEDULE,
+            id="index-not-found-error",
+        ),
+    ],
+)
+def test_index_readiness_check(
+    index_readiness_dag, response_code, response_json, expected_status
+):
+    execution_date = TEST_START_DATE + timedelta(days=1)
+    dagrun = index_readiness_dag.create_dagrun(
+        start_date=execution_date,
+        execution_date=execution_date,
+        data_interval=(execution_date, execution_date),
+        state=DagRunState.RUNNING,
+        run_type=DagRunType.MANUAL,
+    )
+
+    with mock.patch(
+        "airflow.providers.http.hooks.http.requests.Session.send"
+    ) as mock_session_send:
+        r = requests.Response()
+        r.status_code = response_code
+        r.reason = "test"
+        r.json = mock.MagicMock(return_value=response_json)
+        mock_session_send.return_value = r
+
+        ti = dagrun.get_task_instance(task_id="index_readiness_check")
+        ti.task = index_readiness_dag.get_task(task_id="index_readiness_check")
+        ti.run()
+        assert ti.state == expected_status
diff --git a/documentation/frontend/reference/storybook_tests.md b/documentation/frontend/reference/storybook_tests.md
index 235066ceb94..9502c1e187e 100644
--- a/documentation/frontend/reference/storybook_tests.md
+++ b/documentation/frontend/reference/storybook_tests.md
@@ -4,12 +4,20 @@
 We run Playwright tests for our Nuxt application. We also run them for
 Storybook, where components are tested in isolation and (ideally) in a
 comprehensive way that is not always possible inside an actual application.

-Generally it is preferable to write regular Jest tests with
+Generally, it is preferable to write regular Jest tests with
 `@testing-library/vue`, however, some interactions require a real web browser
-context (like loading audio files, testing drag and drop interactions, etc). For
-these, Playwright is the perfect tool. We can also use Playwright to write
-visual regression tests for components in isolation so that we don't have to
-duplicate things like focus state testing into our application level tests.
+context (like loading audio files, complex popover interactions, testing drag
+and drop interactions, etc). For these, Playwright is the perfect tool.
+
+We also use Playwright to write visual regression tests for components in
+isolation so that we don't have to duplicate things like focus state testing
+into our application-level tests. Sometimes components cannot be tested in the
+full app because they shift by 1-2 pixels between test runs, causing otherwise
+identical snapshots to fail to match.
+
+You will find the functional component tests in `/test/storybook/functional`,
+and the visual regression tests, with their snapshots, in
+`/test/storybook/visual-regression`.

 Our Nuxt playwright tests are described by the
 [Playwright testing guide](./playwright_tests.md). Please see the section on
diff --git a/documentation/meta/ci_cd/jobs/deployment.md b/documentation/meta/ci_cd/jobs/deployment.md
index a7bfe07f2ef..950a3dda534 100644
--- a/documentation/meta/ci_cd/jobs/deployment.md
+++ b/documentation/meta/ci_cd/jobs/deployment.md
@@ -2,8 +2,9 @@

 ## `deploy-api`

-Triggers a separate workflow using `workflow_dispatch` that deploys the staging
-environment of the API service to AWS ECS. That workflow is given two inputs.
+Triggers two separate workflows using `workflow_dispatch` that deploy the
+staging environment of the API service and the thumbnails-specific API
+deployment to AWS ECS. Each workflow is given two inputs.

 - the tag of the image that was published by the
   [`publish-images`](/meta/ci_cd/jobs/docker.md#publish-images) job, which is
@@ -11,7 +12,7 @@
   [`get-image-tag`](/meta/ci_cd/jobs/preparation.md#get-image-tag) job
 - the actor of the CI + CD workflow, for tagging them in Slack messages

-This deployment is only triggered if all the following conditions are met.
+The deployments are only triggered if all the following conditions are met.

 - the API codebase has changed
 - the [`publish-images`](/meta/ci_cd/jobs/docker.md#publish-images) job has
diff --git a/documentation/meta/ci_cd/jobs/frontend.md b/documentation/meta/ci_cd/jobs/frontend.md
index f864c741f24..6b81a9357d6 100644
--- a/documentation/meta/ci_cd/jobs/frontend.md
+++ b/documentation/meta/ci_cd/jobs/frontend.md
@@ -40,7 +40,7 @@
 Node.js scripts.
 | ---------------- | ----------------------------------- |
 | `playwright_vr`  | `test:playwright visual-regression` |
 | `playwright_e2e` | `test:playwright e2e`               |
-| `storybook_vr`   | `test:storybook`                    |
+| `storybook`      | `test:storybook`                    |

 This job is skipped if the frontend codebase has not changed.
diff --git a/frontend/src/components/VFilters/VSearchGridFilter.vue b/frontend/src/components/VFilters/VSearchGridFilter.vue
index dd711917aad..a9835dc1175 100644
--- a/frontend/src/components/VFilters/VSearchGridFilter.vue
+++ b/frontend/src/components/VFilters/VSearchGridFilter.vue
@@ -18,7 +18,7 @@
         {{ $t("filterList.clear") }}
-