Soft delete collections #35496

Merged · 16 commits · Sep 25, 2024
Changes from 8 commits
125 changes: 105 additions & 20 deletions openedx/core/djangoapps/content/search/api.py
@@ -15,7 +15,7 @@
from django.core.cache import cache
from django.core.paginator import Paginator
from meilisearch import Client as MeilisearchClient
-from meilisearch.errors import MeilisearchError
+from meilisearch.errors import MeilisearchApiError, MeilisearchError
from meilisearch.models.task import TaskInfo
from opaque_keys.edx.keys import UsageKey
from opaque_keys.edx.locator import LibraryLocatorV2, LibraryCollectionLocator
@@ -34,6 +34,7 @@
    searchable_doc_for_course_block,
    searchable_doc_for_collection,
    searchable_doc_for_library_block,
+   searchable_doc_for_usage_key,
    searchable_doc_collections,
    searchable_doc_tags,
    searchable_doc_tags_for_collection,
@@ -400,8 +401,8 @@ def index_collection_batch(batch, num_done, library_key) -> int:
        docs = []
        for collection in batch:
            try:
-               doc = searchable_doc_for_collection(collection)
-               doc.update(searchable_doc_tags_for_collection(library_key, collection))
+               doc = searchable_doc_for_collection(library_key, collection.key, collection=collection)
+               doc.update(searchable_doc_tags_for_collection(library_key, collection.key))
                docs.append(doc)
            except Exception as err:  # pylint: disable=broad-except
                status_cb(f"Error indexing collection {collection}: {err}")
@@ -510,15 +511,28 @@ def delete_index_doc(usage_key: UsageKey) -> None:
    Args:
        usage_key (UsageKey): The usage key of the XBlock to be removed from the index
    """
-   current_rebuild_index_name = _get_running_rebuild_index_name()
+   doc = searchable_doc_for_usage_key(usage_key)
+   _delete_index_doc(doc[Fields.id])
+
+
+def _delete_index_doc(doc_id) -> None:
+   """
+   Helper function that deletes the document with the given ID from the search index
+
+   If there is a rebuild in progress, the document will also be removed from the new index.
+   """
+   if not doc_id:
+       return
+
    client = _get_meilisearch_client()
+   current_rebuild_index_name = _get_running_rebuild_index_name()

    tasks = []
    if current_rebuild_index_name:
-       # If there is a rebuild in progress, the document will also be deleted from the new index.
-       tasks.append(client.index(current_rebuild_index_name).delete_document(meili_id_from_opaque_key(usage_key)))
-   tasks.append(client.index(STUDIO_INDEX_NAME).delete_document(meili_id_from_opaque_key(usage_key)))
+       # If there is a rebuild in progress, the document will also be removed from the new index.
+       tasks.append(client.index(current_rebuild_index_name).delete_document(doc_id))
+
+   tasks.append(client.index(STUDIO_INDEX_NAME).delete_document(doc_id))

    _wait_for_meili_tasks(tasks)

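With this refactor, `delete_index_doc` only resolves the usage key to a Meilisearch document ID and hands off to `_delete_index_doc`, which the collection code below reuses. A minimal usage sketch (assumed, not from the PR: the usage-key string and ID value are illustrative, and `Fields.id` is assumed to serialize as `"id"`):

```python
from opaque_keys.edx.keys import UsageKey

# Illustrative usage key for a v2 library block; the ID value is invented.
usage_key = UsageKey.from_string("lb:OpenedX:demo-lib:problem:quiz1")
doc = searchable_doc_for_usage_key(usage_key)  # -> {"id": "8f0e3c..."}

# Deletes from the main index, and from the rebuild index if one is in progress.
_delete_index_doc(doc[Fields.id])
```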
Expand Down Expand Up @@ -561,20 +575,93 @@ def upsert_library_block_index_doc(usage_key: UsageKey) -> None:
_update_index_docs(docs)


def _get_document_from_index(document_id: str) -> dict:
"""
Returns the Document identified by the given ID, from the given index.

Returns None if the document or index do not exist.
"""
client = _get_meilisearch_client()
document = None
try:
index = client.get_index(STUDIO_INDEX_NAME)
return index.get_document(id)
except (MeilisearchError, MeilisearchApiError) as err:
# The index or documennt doesn't exist
log.exception(err)

return None
Review comment from navinkarkera (Contributor):

    document is not used. I think you meant to save index.get_document(id) in document and return it at the end.

    Suggested change:
    -   document = None
    -   try:
    -       index = client.get_index(STUDIO_INDEX_NAME)
    -       return index.get_document(id)
    -   except (MeilisearchError, MeilisearchApiError) as err:
    -       # The index or document doesn't exist
    -       log.exception(err)
    -   return None
    +   document = None
    +   try:
    +       index = client.get_index(STUDIO_INDEX_NAME)
    +       document = index.get_document(id)
    +   except (MeilisearchError, MeilisearchApiError) as err:
    +       # The index or document doesn't exist
    +       log.exception(err)
    +   return document

Reply from the PR author (Contributor):

    Oh wow, thank you for spotting those typos @navinkarkera! Fixed with e92f405.



def upsert_library_collection_index_doc(library_key: LibraryLocatorV2, collection_key: str) -> None:
    """
-   Creates or updates the document for the given Library Collection in the search index
+   Creates, updates, or deletes the document for the given Library Collection in the search index.
+
+   If the Collection is not found or disabled (i.e. soft-deleted), then delete it from the search index.
    """
-   content_library = lib_api.ContentLibrary.objects.get_by_key(library_key)
-   collection = authoring_api.get_collection(
-       learning_package_id=content_library.learning_package_id,
-       collection_key=collection_key,
-   )
-   docs = [
-       searchable_doc_for_collection(collection)
-   ]
-
-   _update_index_docs(docs)
+   doc = searchable_doc_for_collection(library_key, collection_key)
+   update_components = False
+
+   # Soft-deleted/disabled collections are removed from the index
+   # and their components updated.
+   if doc.get('_disabled'):
+       _delete_index_doc(doc[Fields.id])
+       update_components = True
+
+   # Hard-deleted collections are also deleted from the index, but their components
+   # are automatically updated as part of the deletion process, so we don't have to.
+   elif not doc.get(Fields.type):
+       _delete_index_doc(doc[Fields.id])
+
+   # Otherwise, upsert the collection.
+   # Newly-added/restored collections get their components updated too.
+   else:
+       already_indexed = _get_document_from_index(doc[Fields.id])
+       if not already_indexed:
+           update_components = True
+
+       _update_index_docs([doc])
+
+   # Asynchronously update the "collections" field on the collection's components
+   if update_components:
+       from .tasks import update_library_components_collections as update_task
+
+       update_task.delay(str(library_key), collection_key)

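To make the branching concrete, here is a hedged sketch of the three document shapes the function distinguishes (key names follow the code above, assuming `Fields.id`/`Fields.type` serialize as `"id"`/`"type"`; the values are invented):

```python
# Live collection: upserted; components refreshed only if it wasn't indexed before.
live_doc = {"id": "abc123", "type": "collection", "display_name": "My Collection"}

# Soft-deleted (disabled): removed from the index, and components refreshed.
soft_deleted_doc = {"id": "abc123", "type": "collection", "_disabled": True}

# Hard-deleted or never existed: no "type" field; removed from the index only.
hard_deleted_doc = {"id": "abc123"}
```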

+def update_library_components_collections(
+   library_key: LibraryLocatorV2,
+   collection_key: str,
+   batch_size: int = 1000,
+) -> None:
+   """
+   Updates the "collections" field for all components associated with a given Library Collection.
+
+   Because there may be a lot of components, we send these updates to Meilisearch in batches.
+   """
+   library = lib_api.get_library(library_key)
+   components = authoring_api.get_collection_components(library.learning_package.id, collection_key)
+
+   paginator = Paginator(components, batch_size)
+   for page in paginator.page_range:
+       docs = []
+       for component in paginator.page(page).object_list:
+           usage_key = lib_api.library_component_usage_key(
+               library_key,
+               component,
+           )
+           doc = searchable_doc_collections(usage_key)
+           docs.append(doc)
+
+       log.info(
+           f"Updating document.collections for library {library_key} components"
+           f" page {page} / {paginator.num_pages}"
+       )
+       _update_index_docs(docs)

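The `update_task.delay(...)` call above implies a Celery task wrapper in `tasks.py` that this diff doesn't show. A hedged sketch of its likely shape (the decorator details and module import path are assumptions):

```python
from celery import shared_task
from opaque_keys.edx.locator import LibraryLocatorV2

from openedx.core.djangoapps.content.search import api


@shared_task
def update_library_components_collections(library_key_str: str, collection_key: str) -> None:
    """Deserialize the library key and run the synchronous batch update above."""
    library_key = LibraryLocatorV2.from_string(library_key_str)
    api.update_library_components_collections(library_key, collection_key)
```

Passing the key as a string keeps the task arguments JSON-serializable for the broker.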

def upsert_content_library_index_docs(library_key: LibraryLocatorV2) -> None:

@@ -612,10 +699,8 @@ def upsert_collection_tags_index_docs(collection_usage_key: LibraryCollectionLocator) -> None:
    """
    Updates the tags data in documents for the given library collection
    """
-   collection = lib_api.get_library_collection_from_usage_key(collection_usage_key)
-
-   doc = {Fields.id: collection.id}
-   doc.update(searchable_doc_tags_for_collection(collection_usage_key.library_key, collection))
+   doc = searchable_doc_tags_for_collection(collection_usage_key.library_key, collection_usage_key.collection_id)
    _update_index_docs([doc])


109 changes: 63 additions & 46 deletions openedx/core/djangoapps/content/search/documents.py
@@ -16,7 +16,7 @@
from openedx.core.djangoapps.content_libraries import api as lib_api
from openedx.core.djangoapps.content_tagging import api as tagging_api
from openedx.core.djangoapps.xblock import api as xblock_api
-from openedx_learning.api.authoring_models import LearningPackage
+from openedx_learning.api.authoring_models import Collection

log = logging.getLogger(__name__)
@@ -112,6 +112,15 @@ def _meili_access_id_from_context_key(context_key: LearningContextKey) -> int:
    return access.id


+def searchable_doc_for_usage_key(usage_key: UsageKey) -> dict:
+   """
+   Generates a base document identified by its usage key.
+   """
+   return {
+       Fields.id: meili_id_from_opaque_key(usage_key),
+   }

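For illustration, the base document carries nothing but the Meilisearch-safe ID (a sketch: the hash value is invented, and `meili_id_from_opaque_key` is assumed to derive a stable ID from the serialized key):

```python
from opaque_keys.edx.keys import UsageKey

doc = searchable_doc_for_usage_key(UsageKey.from_string("lb:OpenedX:demo-lib:html:intro"))
print(doc)  # {'id': 'c5e9a3f1...'}  -- value illustrative
```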

def _fields_from_block(block) -> dict:
    """
    Given an XBlock instance, call its index_dictionary() method to load any
@@ -297,14 +306,14 @@ def searchable_doc_for_library_block(xblock_metadata: lib_api.LibraryXBlockMetadata) -> dict:
    library_name = lib_api.get_library(xblock_metadata.usage_key.context_key).title
    block = xblock_api.load_block(xblock_metadata.usage_key, user=None)

-   doc = {
-       Fields.id: meili_id_from_opaque_key(xblock_metadata.usage_key),
+   doc = searchable_doc_for_usage_key(xblock_metadata.usage_key)
+   doc.update({
        Fields.type: DocType.library_block,
        Fields.breadcrumbs: [],
        Fields.created: xblock_metadata.created.timestamp(),
        Fields.modified: xblock_metadata.modified.timestamp(),
        Fields.last_published: xblock_metadata.last_published.timestamp() if xblock_metadata.last_published else None,
-   }
+   })

    doc.update(_fields_from_block(block))
@@ -319,9 +328,7 @@ def searchable_doc_tags(usage_key: UsageKey) -> dict:
    Generate a dictionary document suitable for ingestion into a search engine
    like Meilisearch or Elasticsearch, with the tags data for the given content object.
    """
-   doc = {
-       Fields.id: meili_id_from_opaque_key(usage_key),
-   }
+   doc = searchable_doc_for_usage_key(usage_key)
    doc.update(_tags_for_content_object(usage_key))

    return doc
@@ -332,31 +339,25 @@ def searchable_doc_collections(usage_key: UsageKey) -> dict:
    Generate a dictionary document suitable for ingestion into a search engine
    like Meilisearch or Elasticsearch, with the collections data for the given content object.
    """
-   doc = {
-       Fields.id: meili_id_from_opaque_key(usage_key),
-   }
+   doc = searchable_doc_for_usage_key(usage_key)
    doc.update(_collections_for_content_object(usage_key))

    return doc


def searchable_doc_tags_for_collection(
    library_key: LibraryLocatorV2,
-   collection,
+   collection_key: str,
) -> dict:
    """
    Generate a dictionary document suitable for ingestion into a search engine
    like Meilisearch or Elasticsearch, with the tags data for the given library collection.
    """
-   doc = {
-       Fields.id: collection.id,
-   }
-
    collection_usage_key = lib_api.get_library_collection_usage_key(
        library_key,
-       collection.key,
+       collection_key,
    )

+   doc = searchable_doc_for_usage_key(collection_usage_key)
    doc.update(_tags_for_content_object(collection_usage_key))

    return doc
@@ -368,49 +369,65 @@ def searchable_doc_for_course_block(block) -> dict:
    like Meilisearch or Elasticsearch, so that the given course block can be
    found using faceted search.
    """
-   doc = {
-       Fields.id: meili_id_from_opaque_key(block.usage_key),
+   doc = searchable_doc_for_usage_key(block.usage_key)
+   doc.update({
        Fields.type: DocType.course_block,
-   }
+   })

    doc.update(_fields_from_block(block))

    return doc


-def searchable_doc_for_collection(collection) -> dict:
+def searchable_doc_for_collection(
+   library_key: LibraryLocatorV2,
+   collection_key: str,
+   *,
+   # Optionally provide the collection if we've already fetched one
+   collection: Collection | None = None,
+) -> dict:
    """
    Generate a dictionary document suitable for ingestion into a search engine
    like Meilisearch or Elasticsearch, so that the given collection can be
    found using faceted search.
+
+   If no collection is found for the given library_key + collection_key, the returned document
+   will contain only basic information derived from the collection usage key, and no Fields.type
+   value will be included in the returned dict.
    """
-   doc = {
-       Fields.id: collection.id,
-       Fields.block_id: collection.key,
-       Fields.type: DocType.collection,
-       Fields.display_name: collection.title,
-       Fields.description: collection.description,
-       Fields.created: collection.created.timestamp(),
-       Fields.modified: collection.modified.timestamp(),
-       # Add related learning_package.key as context_key by default.
-       # If related contentlibrary is found, it will override this value below.
-       # Mostly contentlibrary.library_key == learning_package.key
-       Fields.context_key: collection.learning_package.key,
-       Fields.num_children: collection.entities.count(),
-   }
-   # Just in case learning_package is not related to a library
+   collection_usage_key = lib_api.get_library_collection_usage_key(
+       library_key,
+       collection_key,
+   )
+
+   doc = searchable_doc_for_usage_key(collection_usage_key)
+
    try:
-       context_key = collection.learning_package.contentlibrary.library_key
-       org = str(context_key.org)
+       collection = collection or lib_api.get_library_collection_from_usage_key(collection_usage_key)
+   except lib_api.ContentLibraryCollectionNotFound:
+       # Collection not found, so we can only return the base doc
+       pass
+
+   if collection:
+       assert collection.key == collection_key
+
        doc.update({
-           Fields.context_key: str(context_key),
-           Fields.org: org,
-           Fields.usage_key: str(lib_api.get_library_collection_usage_key(context_key, collection.key)),
+           Fields.context_key: str(library_key),
+           Fields.org: str(library_key.org),
+           Fields.usage_key: str(collection_usage_key),
+           Fields.block_id: collection.key,
+           Fields.type: DocType.collection,
+           Fields.display_name: collection.title,
+           Fields.description: collection.description,
+           Fields.created: collection.created.timestamp(),
+           Fields.modified: collection.modified.timestamp(),
+           Fields.num_children: collection.entities.count(),
+           Fields.access_id: _meili_access_id_from_context_key(library_key),
+           Fields.breadcrumbs: [{"display_name": collection.learning_package.title}],
        })
-   except LearningPackage.contentlibrary.RelatedObjectDoesNotExist:
-       log.warning(f"Related library not found for {collection}")
-   doc[Fields.access_id] = _meili_access_id_from_context_key(doc[Fields.context_key])
-   # Add the breadcrumbs.
-   doc[Fields.breadcrumbs] = [{"display_name": collection.learning_package.title}]
+
+       # Disabled collections should be removed from the search index,
+       # so we mark them as _disabled
+       if not collection.enabled:
+           doc['_disabled'] = True

    return doc
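Putting it together, a live collection indexes roughly like the following (hedged example: the field names come from the code above, `Fields.*` members are assumed to serialize to these keys, and all values are invented):

```python
{
    "id": "d41d8cd9...",  # derived from the collection usage key
    "context_key": "lib:OpenedX:demo-lib",
    "org": "OpenedX",
    "usage_key": "lib-collection:OpenedX:demo-lib:my-first-collection",
    "block_id": "my-first-collection",
    "type": "collection",
    "display_name": "My First Collection",
    "description": "Three problems and a video",
    "created": 1727222400.0,
    "modified": 1727222400.0,
    "num_children": 4,
    "access_id": 42,
    "breadcrumbs": [{"display_name": "Demo Library"}],
}
```

A soft-deleted collection would additionally carry `"_disabled": True`, which the upsert path in api.py uses to remove it from the index.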
2 changes: 2 additions & 0 deletions openedx/core/djangoapps/content/search/handlers.py
@@ -23,6 +23,7 @@
    LIBRARY_BLOCK_DELETED,
    LIBRARY_BLOCK_UPDATED,
    LIBRARY_COLLECTION_CREATED,
+   LIBRARY_COLLECTION_DELETED,
    LIBRARY_COLLECTION_UPDATED,
    XBLOCK_CREATED,
    XBLOCK_DELETED,

@@ -166,6 +167,7 @@ def content_library_updated_handler(**kwargs) -> None:


@receiver(LIBRARY_COLLECTION_CREATED)
+@receiver(LIBRARY_COLLECTION_DELETED)
@receiver(LIBRARY_COLLECTION_UPDATED)
@only_if_meilisearch_enabled
def library_collection_updated_handler(**kwargs) -> None:
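The handler body is truncated in this view. Based on the decorators and the new upsert logic in api.py, a plausible sketch of what it does (the event payload key and attribute names are assumptions, not confirmed by this diff):

```python
def library_collection_updated_handler(**kwargs) -> None:
    """
    Create, update, or delete the collection's search document when a
    LIBRARY_COLLECTION_* event fires.
    """
    library_collection = kwargs.get("library_collection", None)  # assumed payload key
    if not library_collection:
        log.error("Received null or incorrect data for event")
        return

    # Delegates to the API function shown above, which now also handles
    # soft-deleted (disabled) and hard-deleted collections.
    upsert_library_collection_index_doc(
        library_collection.library_key,
        library_collection.collection_key,  # assumed attribute name; may be `.key`
    )
```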