Skip to content

Commit f50389c

Browse files
authored
Scrape marketing sites with dynamic content (#2180)
* stashing some changes * adding method to fetch extra data * adding webdriver path setting * adding tasks to fetch marketing content. moving util methods to vector_search app * removed explicit wait * removing need for chrome webdriver path setting * updating tests, adding some webdriver flags * adding tests * adding cron entry * adding cron job * fix flaky test * move to main celery schedule * switch to daily task * fixing tests * fixing tests * fix comment * fix task name * fix task name
1 parent 0ddd8b1 commit f50389c

File tree

13 files changed

+1361
-1177
lines changed

13 files changed

+1361
-1177
lines changed

Dockerfile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,23 @@ ENV PATH="$VIRTUAL_ENV/bin:$POETRY_HOME/bin:$PATH"
4141
# Install poetry
4242
RUN pip install --no-cache-dir "poetry==$POETRY_VERSION"
4343

44+
45+
46+
# Install Chromium (the commented-out lines illustrate the syntax for pinning specific Chromium versions)
47+
RUN echo "deb http://deb.debian.org/debian/ sid main" >> /etc/apt/sources.list \
48+
&& apt-get update -qqy \
49+
# && apt-get -qqy install chromium=89.0.4389.82-1 \
50+
# && apt-get -qqy install chromium=90.0.4430.212-1 \
51+
# && apt-get -qqy install chromium=93.0.4577.82-1 \
52+
# && apt-get -qqy install chromium=97.0.4692.71-0.1 \
53+
# && apt-get -qqy install chromium=98.0.4758.102-1+b1 \
54+
&& apt-get -qqy install chromium \
55+
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/*
56+
57+
# install chromedriver, which will be located at /usr/bin/chromedriver
58+
RUN apt-get update -qqy \
59+
&& apt-get -qqy install chromium-driver \
60+
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/*
4461
COPY pyproject.toml /src
4562
COPY poetry.lock /src
4663
RUN chown -R mitodl:mitodl /src && \

learning_resources/conftest.py

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -84,42 +84,10 @@ def add_file_to_bucket_recursive(bucket, file_base, s3_base, file_object):
8484
add_file_to_bucket_recursive(bucket, local_path, file_key, child.name)
8585

8686

87-
def setup_s3_ocw(settings):
88-
"""
89-
Set up the fake s3 data for OCW
90-
"""
91-
# Fake the settings
92-
settings.AWS_ACCESS_KEY_ID = "abc"
93-
settings.AWS_SECRET_ACCESS_KEY = "abc" # noqa: S105
94-
settings.OCW_LIVE_BUCKET = "test_bucket"
95-
# Create our fake bucket
96-
conn = boto3.resource(
97-
"s3",
98-
aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
99-
aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
100-
)
101-
conn.create_bucket(Bucket=settings.OCW_LIVE_BUCKET)
102-
103-
# Add data to the fake ocw next bucket
104-
ocw_next_bucket = conn.Bucket(name=settings.OCW_LIVE_BUCKET)
105-
106-
base_folder = OCW_TEST_JSON_PATH.replace("./test_json/", "")
107-
108-
for file in Path(OCW_TEST_JSON_PATH).iterdir():
109-
add_file_to_bucket_recursive(
110-
ocw_next_bucket, OCW_TEST_JSON_PATH, base_folder, file.name
111-
)
112-
LearningResourcePlatformFactory.create(code=PlatformType.ocw.name)
113-
LearningResourceOfferorFactory.create(is_ocw=True)
114-
LearningResourceDepartmentFactory.create(
115-
department_id="16", name="Aeronautics and Astronautics"
116-
)
117-
118-
11987
@pytest.fixture(autouse=True)
12088
def marketing_metadata_mocks(mocker):
12189
mocker.patch(
122-
"learning_resources.etl.loaders._fetch_page",
90+
"learning_resources.utils.fetch_page",
12391
return_value="""
12492
<html>
12593
<body>
@@ -150,6 +118,38 @@ def marketing_metadata_mocks(mocker):
150118
)
151119

152120

121+
def setup_s3_ocw(settings):
122+
"""
123+
Set up the fake s3 data for OCW
124+
"""
125+
# Fake the settings
126+
settings.AWS_ACCESS_KEY_ID = "abc"
127+
settings.AWS_SECRET_ACCESS_KEY = "abc" # noqa: S105
128+
settings.OCW_LIVE_BUCKET = "test_bucket"
129+
# Create our fake bucket
130+
conn = boto3.resource(
131+
"s3",
132+
aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
133+
aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
134+
)
135+
conn.create_bucket(Bucket=settings.OCW_LIVE_BUCKET)
136+
137+
# Add data to the fake ocw next bucket
138+
ocw_next_bucket = conn.Bucket(name=settings.OCW_LIVE_BUCKET)
139+
140+
base_folder = OCW_TEST_JSON_PATH.replace("./test_json/", "")
141+
142+
for file in Path(OCW_TEST_JSON_PATH).iterdir():
143+
add_file_to_bucket_recursive(
144+
ocw_next_bucket, OCW_TEST_JSON_PATH, base_folder, file.name
145+
)
146+
LearningResourcePlatformFactory.create(code=PlatformType.ocw.name)
147+
LearningResourceOfferorFactory.create(is_ocw=True)
148+
LearningResourceDepartmentFactory.create(
149+
department_id="16", name="Aeronautics and Astronautics"
150+
)
151+
152+
153153
@pytest.fixture
154154
def summarizer_configuration():
155155
"""Create a summarizer configuration"""

learning_resources/etl/loaders.py

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import logging
44

5-
import requests
65
from django.contrib.auth import get_user_model
76
from django.db import transaction
87

@@ -15,15 +14,14 @@
1514
)
1615
from learning_resources.etl.constants import (
1716
CONTENT_TAG_CATEGORIES,
18-
MARKETING_PAGE_FILE_TYPE,
1917
READABLE_ID_FIELD,
2018
ContentTagCategory,
2119
CourseLoaderConfig,
2220
ProgramLoaderConfig,
2321
ResourceNextRunConfig,
2422
)
2523
from learning_resources.etl.exceptions import ExtractException
26-
from learning_resources.etl.utils import html_to_markdown, most_common_topics
24+
from learning_resources.etl.utils import most_common_topics
2725
from learning_resources.models import (
2826
ContentFile,
2927
Course,
@@ -512,7 +510,7 @@ def load_course(
512510
load_image(learning_resource, image_data)
513511
load_departments(learning_resource, department_data)
514512
load_content_tags(learning_resource, content_tags_data)
515-
load_marketing_page(learning_resource)
513+
516514
update_index(learning_resource, created)
517515
return learning_resource
518516

@@ -604,7 +602,6 @@ def load_program(
604602
load_image(learning_resource, image_data)
605603
load_offered_by(learning_resource, offered_by_data)
606604
load_departments(learning_resource, departments_data)
607-
load_marketing_page(learning_resource)
608605

609606
program, _ = Program.objects.get_or_create(learning_resource=learning_resource)
610607

@@ -757,33 +754,6 @@ def calculate_completeness(
757754
return new_score
758755

759756

760-
def _fetch_page(url):
761-
if url:
762-
try:
763-
response = requests.get(url, timeout=10)
764-
if response.ok:
765-
return response.text
766-
except requests.exceptions.RequestException:
767-
logging.exception("Error fetching page from %s", url)
768-
return None
769-
770-
771-
def load_marketing_page(learning_resource: LearningResource):
772-
marketing_page_url = learning_resource.url
773-
page_content = _fetch_page(marketing_page_url)
774-
if page_content:
775-
content_file, _ = ContentFile.objects.update_or_create(
776-
learning_resource=learning_resource,
777-
file_type=MARKETING_PAGE_FILE_TYPE,
778-
defaults={
779-
"file_extension": ".md",
780-
},
781-
)
782-
content_file.key = marketing_page_url
783-
content_file.content = html_to_markdown(page_content)
784-
content_file.save()
785-
786-
787757
def load_content_files(
788758
course_run: LearningResourceRun,
789759
content_files_data: list[dict],

learning_resources/etl/loaders_test.py

Lines changed: 0 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Tests for ETL loaders"""
22

3-
import copy
43
from datetime import timedelta
54
from decimal import Decimal
65

@@ -22,7 +21,6 @@
2221
RunStatus,
2322
)
2423
from learning_resources.etl.constants import (
25-
MARKETING_PAGE_FILE_TYPE,
2624
CourseLoaderConfig,
2725
ETLSource,
2826
ProgramLoaderConfig,
@@ -1612,125 +1610,3 @@ def test_calculate_completeness(mocker, is_scholar_course, tag_counts, expected_
16121610
== expected_score
16131611
)
16141612
assert mock_index.call_count == (1 if resource.completeness != 1.0 else 0)
1615-
1616-
1617-
def test_load_course_fetches_marketing_page_info(mocker):
1618-
"""Test that loading a course produces a marketing page document"""
1619-
platform = LearningResourcePlatformFactory.create()
1620-
1621-
props = {
1622-
"readable_id": "abc123",
1623-
"platform": platform.code,
1624-
"etl_source": ETLSource.ocw.name,
1625-
"title": "course title",
1626-
"image": {"url": "https://www.test.edu/image.jpg"},
1627-
"description": "description",
1628-
"url": "https://test.edu",
1629-
"published": True,
1630-
"runs": [
1631-
{
1632-
"run_id": "test_run_id",
1633-
"enrollment_start": now_in_utc(),
1634-
"start_date": now_in_utc(),
1635-
"end_date": now_in_utc(),
1636-
}
1637-
],
1638-
}
1639-
result = load_course(props, [], [], config=CourseLoaderConfig(prune=True))
1640-
assert ContentFile.objects.filter(key=result.url).exists()
1641-
1642-
1643-
def test_load_program_fetches_marketing_page_info(
1644-
mock_upsert_tasks,
1645-
):
1646-
"""Test that load_program produces a marketing page document"""
1647-
platform = LearningResourcePlatformFactory.create()
1648-
1649-
program = ProgramFactory.create(courses=[], platform=platform.code)
1650-
1651-
LearningResourcePlatformFactory.create(code=platform.code)
1652-
1653-
learning_resource = program.learning_resource
1654-
learning_resource.is_published = True
1655-
learning_resource.platform = platform
1656-
learning_resource.runs.set([])
1657-
learning_resource.save()
1658-
courses = CourseFactory.create_batch(2, platform=platform.code)
1659-
run_data = {
1660-
"run_id": program.learning_resource.readable_id,
1661-
"enrollment_start": "2017-01-01T00:00:00Z",
1662-
"start_date": "2017-01-20T00:00:00Z",
1663-
"end_date": "2017-06-20T00:00:00Z",
1664-
}
1665-
delivery_data = {}
1666-
result = load_program(
1667-
{
1668-
"platform": platform.code,
1669-
"readable_id": program.learning_resource.readable_id,
1670-
"professional": False,
1671-
"title": program.learning_resource.title,
1672-
"url": program.learning_resource.url,
1673-
"image": {"url": program.learning_resource.image.url},
1674-
"published": True,
1675-
"runs": [run_data],
1676-
"availability": program.learning_resource.availability,
1677-
"courses": [
1678-
{
1679-
"readable_id": course.learning_resource.readable_id,
1680-
"platform": platform.code,
1681-
"availability": course.learning_resource.availability,
1682-
}
1683-
for course in courses
1684-
],
1685-
**delivery_data,
1686-
},
1687-
[],
1688-
[],
1689-
)
1690-
assert ContentFile.objects.filter(key=result.url).exists()
1691-
1692-
1693-
def test_only_one_marketing_page_instance_exists(
1694-
mocker,
1695-
):
1696-
"""Test that we end up with only one marketing page instance per learning resource"""
1697-
platform = LearningResourcePlatformFactory.create()
1698-
original_url = "https://test.edu"
1699-
new_url = "https://newurl.com"
1700-
readable_id = "abc123"
1701-
props = {
1702-
"readable_id": readable_id,
1703-
"platform": platform.code,
1704-
"etl_source": ETLSource.ocw.name,
1705-
"title": "course title",
1706-
"image": {"url": "https://www.test.edu/image.jpg"},
1707-
"description": "description",
1708-
"url": original_url,
1709-
"published": True,
1710-
"runs": [
1711-
{
1712-
"run_id": "test_run_id",
1713-
"enrollment_start": now_in_utc(),
1714-
"start_date": now_in_utc(),
1715-
"end_date": now_in_utc(),
1716-
}
1717-
],
1718-
}
1719-
initial_props = copy.deepcopy(props)
1720-
result = load_course(initial_props, [], [], config=CourseLoaderConfig(prune=True))
1721-
assert ContentFile.objects.filter(
1722-
key=result.url, file_type=MARKETING_PAGE_FILE_TYPE
1723-
).exists()
1724-
props["url"] = new_url
1725-
result = load_course(props, [], [], config=CourseLoaderConfig(prune=True))
1726-
1727-
assert ContentFile.objects.filter(
1728-
key=new_url, file_type=MARKETING_PAGE_FILE_TYPE
1729-
).exists()
1730-
assert (
1731-
ContentFile.objects.filter(
1732-
learning_resource__readable_id=readable_id,
1733-
file_type=MARKETING_PAGE_FILE_TYPE,
1734-
).count()
1735-
== 1
1736-
)

learning_resources/etl/utils.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from tempfile import TemporaryDirectory
2020

2121
import boto3
22-
import html2text
2322
import rapidjson
2423
import requests
2524
from django.conf import settings
@@ -901,9 +900,3 @@ def parse_resource_commitment(commitment_str: str) -> CommitmentConfig:
901900
else:
902901
log.warning("Invalid commitment: %s", commitment_str)
903902
return CommitmentConfig(commitment=commitment_str or "")
904-
905-
906-
def html_to_markdown(html):
907-
htmlformatter = html2text.HTML2Text()
908-
htmlformatter.body_width = 0
909-
return htmlformatter.handle(html)

0 commit comments

Comments
 (0)