Skip to content

Commit

Permalink
Revert "Change search query approach to include only available provid…
Browse files Browse the repository at this point in the history
…ers (#4238)" (#4397)

This reverts commit 3d7fa09.
  • Loading branch information
AetherUnbound authored May 28, 2024
1 parent 28f0356 commit a504365
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 76 deletions.
8 changes: 4 additions & 4 deletions api/api/controllers/elasticsearch/related.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from api.controllers.elasticsearch.helpers import get_es_response, get_query_slice
from api.controllers.search_controller import (
_post_process_results,
get_enabled_sources_query,
get_excluded_providers_query,
)


Expand Down Expand Up @@ -36,7 +36,7 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
tags = getattr(item_hit, "tags", None)
creator = getattr(item_hit, "creator", None)

related_query = {"filter": [], "must_not": [], "should": []}
related_query = {"must_not": [], "must": [], "should": []}

if not title and not tags:
if not creator:
Expand All @@ -55,8 +55,8 @@ def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
related_query["should"].append(Q("terms", tags__name__keyword=tags))

# Exclude the dynamically disabled sources.
if enabled_sources_query := get_enabled_sources_query():
related_query["filter"].append(enabled_sources_query)
if excluded_providers_query := get_excluded_providers_query():
related_query["must_not"].append(excluded_providers_query)
# Exclude the current item and mature content.
related_query["must_not"].extend(
[Q("term", mature=True), Q("term", identifier=uuid)]
Expand Down
57 changes: 28 additions & 29 deletions api/api/controllers/search_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@
NESTING_THRESHOLD = config("POST_PROCESS_NESTING_THRESHOLD", cast=int, default=5)
SOURCE_CACHE_TIMEOUT = 60 * 60 * 4 # 4 hours
FILTER_CACHE_TIMEOUT = 30
ENABLED_SOURCES_CACHE_KEY = "enabled_sources"
ENABLED_SOURCES_CACHE_VERSION = 2
FILTERED_PROVIDERS_CACHE_KEY = "filtered_providers"
FILTERED_PROVIDERS_CACHE_VERSION = 2
THUMBNAIL = "thumbnail"
URL = "url"
PROVIDER = "provider"
QUERY_SPECIAL_CHARACTER_ERROR = "Unescaped special characters are not allowed."
DEFAULT_BOOST = 10000
DEFAULT_SEARCH_FIELDS = ["title", "description", "tags.name"]
DEFAULT_SQS_FLAGS = "AND|NOT|PHRASE|WHITESPACE"
Expand Down Expand Up @@ -158,46 +162,42 @@ def _post_process_results(
return results[:page_size]


def get_enabled_sources_query() -> Q | None:
def get_excluded_providers_query() -> Q | None:
"""
Get a query that only includes enabled sources.
To exclude a source, set ``filter_content`` to ``True`` in the
Hide data sources from the catalog dynamically.
To exclude a provider, set ``filter_content`` to ``True`` in the
``ContentProvider`` model in Django admin.
The list of ``provider_identifier``s is cached in Redis with
`:ENABLED_SOURCES_CACHE_VERSION:ENABLED_SOURCES_CACHE_KEY` key.
`:FILTERED_PROVIDERS_CACHE_VERSION:FILTERED_PROVIDERS_CACHE_KEY` key.
"""

try:
enabled_sources = cache.get(
key=ENABLED_SOURCES_CACHE_KEY, version=ENABLED_SOURCES_CACHE_VERSION
filtered_providers = cache.get(
key=FILTERED_PROVIDERS_CACHE_KEY, version=FILTERED_PROVIDERS_CACHE_VERSION
)
except ConnectionError:
logger.warning("Redis connect failed, cannot get cached enabled sources.")
enabled_sources = None

if not enabled_sources:
# `ContentProvider` currently only handles _sources_, not providers.
# TODO: This is a legacy naming convention that should be updated.
# https://github.com/WordPress/openverse/issues/4346
enabled_sources = list(
models.ContentProvider.objects.filter(filter_content=False).values_list(
logger.warning("Redis connect failed, cannot get cached filtered providers.")
filtered_providers = None

if not filtered_providers:
filtered_providers = list(
models.ContentProvider.objects.filter(filter_content=True).values_list(
"provider_identifier", flat=True
)
)

try:
cache.set(
key=ENABLED_SOURCES_CACHE_KEY,
version=ENABLED_SOURCES_CACHE_VERSION,
key=FILTERED_PROVIDERS_CACHE_KEY,
version=FILTERED_PROVIDERS_CACHE_VERSION,
timeout=FILTER_CACHE_TIMEOUT,
value=enabled_sources,
value=filtered_providers,
)
except ConnectionError:
logger.warning("Redis connect failed, cannot cache enabled sources.")
logger.warning("Redis connect failed, cannot cache filtered providers.")

if enabled_sources:
return Q("terms", source=enabled_sources)
if filtered_providers:
return Q("terms", provider=filtered_providers)
return None


Expand Down Expand Up @@ -285,8 +285,8 @@ def build_search_query(
if not search_params.validated_data["include_sensitive_results"]:
search_queries["must_not"].append(Q("term", mature=True))
# Exclude dynamically disabled sources (see Redis cache)
if enabled_sources_query := get_enabled_sources_query():
search_queries["filter"].append(enabled_sources_query)
if excluded_providers_query := get_excluded_providers_query():
search_queries["must_not"].append(excluded_providers_query)

# Search either by generic multimatch or by "advanced search" with
# individual field-level queries specified.
Expand Down Expand Up @@ -375,7 +375,6 @@ def build_collection_query(
:return: the search client with the query applied.
"""
search_query = {"filter": [], "must": [], "should": [], "must_not": []}

# Apply the term filters. Each tuple pairs a filter's parameter name in the API
# with its corresponding field in Elasticsearch. "None" means that the
# names are identical.
Expand All @@ -396,8 +395,8 @@ def build_collection_query(
if not include_sensitive_by_params:
search_query["must_not"].append({"term": {"mature": True}})

if enabled_sources_query := get_enabled_sources_query():
search_query["filter"].append(enabled_sources_query)
if excluded_providers_query := get_excluded_providers_query():
search_query["must_not"].append(excluded_providers_query)

return Q("bool", **search_query)

Expand Down
1 change: 1 addition & 0 deletions api/conf/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from pathlib import Path

from decouple import config
from split_settings.tools import include


Expand Down
24 changes: 11 additions & 13 deletions api/test/unit/controllers/elasticsearch/test_related.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from api.controllers.elasticsearch import related
from api.controllers.search_controller import (
ENABLED_SOURCES_CACHE_KEY,
ENABLED_SOURCES_CACHE_VERSION,
FILTERED_PROVIDERS_CACHE_KEY,
FILTERED_PROVIDERS_CACHE_VERSION,
)
from test.factory.es_http import (
MOCK_LIVE_RESULT_URL_PREFIX,
Expand All @@ -20,22 +20,22 @@


@pytest.fixture
def enabled_sources_cache(django_cache, monkeypatch):
def excluded_providers_cache(django_cache, monkeypatch):
cache = django_cache
monkeypatch.setattr("api.controllers.search_controller.cache", cache)

enabled_source = "enabled_source"
cache_value = [enabled_source]
excluded_provider = "excluded_provider"
cache_value = [excluded_provider]
cache.set(
key=ENABLED_SOURCES_CACHE_KEY,
version=ENABLED_SOURCES_CACHE_VERSION,
key=FILTERED_PROVIDERS_CACHE_KEY,
version=FILTERED_PROVIDERS_CACHE_VERSION,
value=cache_value,
timeout=1,
)

yield enabled_source
yield excluded_provider

cache.delete(ENABLED_SOURCES_CACHE_KEY, version=ENABLED_SOURCES_CACHE_VERSION)
cache.delete(FILTERED_PROVIDERS_CACHE_KEY, version=FILTERED_PROVIDERS_CACHE_VERSION)


@mock.patch(
Expand All @@ -47,7 +47,7 @@ def test_related_media(
wrapped_related_results,
image_media_type_config,
settings,
enabled_sources_cache,
excluded_providers_cache,
):
image = ImageFactory.create()

Expand Down Expand Up @@ -82,10 +82,8 @@ def test_related_media(
"from": 0,
"query": {
"bool": {
"filter": [
{"terms": {"source": [enabled_sources_cache]}},
],
"must_not": [
{"terms": {"provider": [excluded_providers_cache]}},
{"term": {"mature": True}},
{"term": {"identifier": image.identifier}},
],
Expand Down
32 changes: 16 additions & 16 deletions api/test/unit/controllers/test_search_controller.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import datetime
import random
import re
from collections.abc import Callable
from datetime import datetime, timezone
from enum import Enum, auto
from unittest import mock
from unittest.mock import patch
Expand All @@ -16,7 +16,7 @@

from api.controllers import search_controller
from api.controllers.elasticsearch import helpers as es_helpers
from api.controllers.search_controller import ENABLED_SOURCES_CACHE_KEY
from api.controllers.search_controller import FILTERED_PROVIDERS_CACHE_KEY
from api.utils import tallies
from api.utils.dead_link_mask import get_query_hash, save_query_mask
from api.utils.search_context import SearchContext
Expand Down Expand Up @@ -837,40 +837,40 @@ def _delete_all_results_but_first(_, __, results, ___):
@pytest.mark.django_db
@cache_availability_params
@pytest.mark.parametrize(
"enabled_count, result",
[(2, Terms(source=["source1", "source2"])), (0, None)],
"excluded_count, result",
[(2, Terms(provider=["provider1", "provider2"])), (0, None)],
)
def test_get_enabled_sources_query_returns_available_sources(
enabled_count, result, is_cache_reachable, cache_name, request
def test_get_excluded_providers_query_returns_excluded(
excluded_count, result, is_cache_reachable, cache_name, request
):
cache = request.getfixturevalue(cache_name)

if is_cache_reachable:
cache.set(
key=ENABLED_SOURCES_CACHE_KEY,
key=FILTERED_PROVIDERS_CACHE_KEY,
version=2,
timeout=30,
value=[f"source{i + 1}" for i in range(enabled_count)],
value=[f"provider{i + 1}" for i in range(excluded_count)],
)
else:
for i in range(enabled_count):
for i in range(excluded_count):
ContentProviderFactory.create(
created_on=datetime.now(tz=timezone.utc),
provider_identifier=f"source{i + 1}",
provider_name=f"Source {i + 1}",
filter_content=False,
created_on=datetime.datetime.now(),
provider_identifier=f"provider{i + 1}",
provider_name=f"Provider {i + 1}",
filter_content=True,
)

with capture_logs() as cap_logs:
assert search_controller.get_enabled_sources_query() == result
assert search_controller.get_excluded_providers_query() == result

if not is_cache_reachable:
messages = [record["event"] for record in cap_logs]
assert all(
message in messages
for message in [
"Redis connect failed, cannot get cached enabled sources.",
"Redis connect failed, cannot cache enabled sources.",
"Redis connect failed, cannot get cached filtered providers.",
"Redis connect failed, cannot cache filtered providers.",
]
)

Expand Down
26 changes: 12 additions & 14 deletions api/test/unit/controllers/test_search_controller_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,31 @@
from api.controllers import search_controller
from api.controllers.search_controller import (
DEFAULT_SQS_FLAGS,
ENABLED_SOURCES_CACHE_KEY,
ENABLED_SOURCES_CACHE_VERSION,
FILTERED_PROVIDERS_CACHE_KEY,
FILTERED_PROVIDERS_CACHE_VERSION,
)


pytestmark = pytest.mark.django_db


@pytest.fixture
def enabled_sources_cache(django_cache, monkeypatch):
def excluded_providers_cache(django_cache, monkeypatch):
cache = django_cache
monkeypatch.setattr("api.controllers.search_controller.cache", cache)

enabled_source = "enabled_source"
cache_value = [enabled_source]
excluded_provider = "excluded_provider"
cache_value = [excluded_provider]
cache.set(
key=ENABLED_SOURCES_CACHE_KEY,
version=ENABLED_SOURCES_CACHE_VERSION,
key=FILTERED_PROVIDERS_CACHE_KEY,
version=FILTERED_PROVIDERS_CACHE_VERSION,
value=cache_value,
timeout=1,
)

yield enabled_source
yield excluded_provider

cache.delete(ENABLED_SOURCES_CACHE_KEY, version=ENABLED_SOURCES_CACHE_VERSION)
cache.delete(FILTERED_PROVIDERS_CACHE_KEY, version=FILTERED_PROVIDERS_CACHE_VERSION)


def test_create_search_query_empty(media_type_config):
Expand Down Expand Up @@ -268,9 +268,9 @@ def test_create_search_query_q_search_license_license_type_creates_2_terms_filte
}


def test_create_search_query_empty_with_dynamically_enabled_sources(
def test_create_search_query_empty_with_dynamically_excluded_providers(
image_media_type_config,
enabled_sources_cache,
excluded_providers_cache,
):
serializer = image_media_type_config.search_request_serializer(
data={}, context={"media_type": image_media_type_config.media_type}
Expand All @@ -283,11 +283,9 @@ def test_create_search_query_empty_with_dynamically_enabled_sources(
assert actual_query_clauses == {
"must_not": [
{"term": {"mature": True}},
{"terms": {"provider": [excluded_providers_cache]}},
],
"must": [{"match_all": {}}],
"filter": [
{"terms": {"source": [enabled_sources_cache]}},
],
"should": [
{"rank_feature": {"boost": 10000, "field": "standardized_popularity"}}
],
Expand Down

0 comments on commit a504365

Please sign in to comment.