Skip to content

Commit 1001aee

Browse files
authored
Make embedding generation task use correct run (#2074)
* switch to using next_run
* adding test
* adding fallback for missing next runs
* adding test
* checking published
* fixing test flakiness
1 parent 01fc6b6 commit 1001aee

File tree

2 files changed

+81
-2
lines changed

2 files changed

+81
-2
lines changed

Diff for: vector_search/tasks.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,9 @@ def start_embed_resources(self, indexes, skip_content_files, overwrite):
105105
.order_by("id")
106106
):
107107
run = (
108-
course.runs.filter(published=True)
108+
course.next_run
109+
if course.next_run
110+
else course.runs.filter(published=True)
109111
.order_by("-start_date")
110112
.first()
111113
)
@@ -193,7 +195,9 @@ def embed_learning_resources_by_id(self, ids, skip_content_files, overwrite):
193195
etl_source__in=RESOURCE_FILE_ETL_SOURCES
194196
).order_by("id"):
195197
run = (
196-
course.runs.filter(published=True)
198+
course.next_run
199+
if course.next_run
200+
else course.runs.filter(published=True)
197201
.order_by("-start_date")
198202
.first()
199203
)

Diff for: vector_search/tasks_test.py

+75
Original file line numberDiff line numberDiff line change
@@ -221,3 +221,78 @@ def test_embed_learning_resources_by_id(mocker, mocked_celery):
221221
assert mock_call.args[1] == "content_file"
222222
embedded_resource_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
223223
assert sorted(resource_ids) == sorted(embedded_resource_ids)
224+
225+
226+
def test_embedded_content_from_next_run(mocker, mocked_celery):
    """
    The embedding task should pick up the content files attached to the
    course's next run, not those attached to another run
    """
    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])

    course = CourseFactory.create(etl_source=ETLSource.ocw.value)

    # Two additional runs on the same resource: one created two days ago
    # (kept for later use) and one created right now.
    older_created_on = datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(
        days=2
    )
    other_run = LearningResourceRunFactory.create(
        learning_resource=course.learning_resource,
        created_on=older_created_on,
    )
    LearningResourceRunFactory.create(
        learning_resource=course.learning_resource,
        created_on=datetime.datetime.now(tz=datetime.UTC),
    )

    # Content files attached to whatever the resource reports as its next run;
    # these are the ids the task is expected to embed.
    upcoming_run = course.learning_resource.next_run
    upcoming_files = ContentFileFactory.create_batch(3, run=upcoming_run)
    next_run_contentfiles = [content_file.id for content_file in upcoming_files]

    # Content files on the other run, which must not show up in the call.
    ContentFileFactory.create_batch(3, run=other_run)

    generate_embeddings_mock = mocker.patch(
        "vector_search.tasks.generate_embeddings", autospec=True
    )

    with pytest.raises(mocked_celery.replace_exception_class):
        start_embed_resources.delay(
            ["course"], skip_content_files=False, overwrite=True
        )

    expected_args = (
        next_run_contentfiles,
        "content_file",
        True,
    )
    generate_embeddings_mock.si.assert_called_with(*expected_args)
267+
268+
269+
def test_embedded_content_from_latest_run_if_next_missing(mocker, mocked_celery):
    """
    When the course has no next run, the embedding task should fall back to
    the content files of the latest run
    """
    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])

    course = CourseFactory.create(etl_source=ETLSource.ocw.value)

    # Drop every run the factory attached so the run created below is the
    # only one left for this course.
    course.runs.all().delete()

    an_hour_ago = datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(hours=1)
    latest_run = LearningResourceRunFactory.create(
        learning_resource=course.learning_resource,
        created_on=an_hour_ago,
    )

    latest_files = ContentFileFactory.create_batch(3, run=latest_run)
    latest_run_contentfiles = [content_file.id for content_file in latest_files]

    generate_embeddings_mock = mocker.patch(
        "vector_search.tasks.generate_embeddings", autospec=True
    )

    with pytest.raises(mocked_celery.replace_exception_class):
        start_embed_resources.delay(
            ["course"], skip_content_files=False, overwrite=True
        )

    expected_args = (
        latest_run_contentfiles,
        "content_file",
        True,
    )
    generate_embeddings_mock.si.assert_called_with(*expected_args)

0 commit comments

Comments
 (0)