multi page marketing site scraping #2196


Merged · 8 commits · Apr 14, 2025

Changes from all commits
2 changes: 1 addition & 1 deletion learning_resources/conftest.py
@@ -87,7 +87,7 @@ def add_file_to_bucket_recursive(bucket, file_base, s3_base, file_object):
 @pytest.fixture(autouse=True)
 def marketing_metadata_mocks(mocker):
     mocker.patch(
-        "learning_resources.utils.fetch_page",
+        "learning_resources.site_scrapers.base_scraper.BaseScraper.fetch_page",
         return_value="""
         <html>
         <body>
Empty file.
45 changes: 45 additions & 0 deletions learning_resources/site_scrapers/base_scraper.py
@@ -0,0 +1,45 @@
import logging

import requests
from django.conf import settings
from selenium.webdriver.support.ui import WebDriverWait

from learning_resources.utils import get_web_driver

logger = logging.getLogger(__name__)


class BaseScraper:
    use_webdriver = settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER
    driver = None

    def __init__(self, start_url):
        self.start_url = start_url
        if self.use_webdriver:
            self.driver = get_web_driver()

    def fetch_page(self, url):
        if url:
            if self.driver:
                self.driver.get(url)
                WebDriverWait(self.driver, 10).until(
                    lambda d: d.execute_script("return document.readyState")
                    == "complete"
                )
                return self.driver.execute_script("return document.body.innerHTML")
            else:
                try:
                    response = requests.get(url, timeout=10)
                    if response.ok:
                        return response.text
                except requests.exceptions.RequestException:
                    logger.exception("Error fetching page from %s", url)
        return None

    def scrape(self):
        page_content = self.fetch_page(self.start_url)
        if page_content:
            return page_content
        else:
            logger.error("Failed to fetch page content from %s", self.start_url)
            return None
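
For orientation, a minimal sketch of exercising BaseScraper directly (assuming Django settings are configured with EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER set to False, so fetch_page takes the plain requests path; the URL is a placeholder):

from learning_resources.site_scrapers.base_scraper import BaseScraper

# Placeholder URL; with use_webdriver False, no Chrome driver is created
scraper = BaseScraper("https://example.com")
content = scraper.scrape()  # page HTML as a string, or None if the fetch failed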
11 changes: 11 additions & 0 deletions learning_resources/site_scrapers/constants.py
@@ -0,0 +1,11 @@
from learning_resources.site_scrapers.mitx_program_page_scraper import (
    MITXProgramPageScraper,
)
from learning_resources.site_scrapers.sloan_course_page_scraper import (
    SloanCoursePageScraper,
)

SITE_SCRAPER_MAP = {
    r"^https://executive.mit.edu/course/(.*?)": SloanCoursePageScraper,
    r"https://micromasters.mit.edu/(.*?)/$": MITXProgramPageScraper,
}
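
Since scraper_for_site (in site_scrapers/utils.py below) matches these patterns with re.search, the Sloan pattern's trailing lazy group can match the empty string, so it effectively acts as a prefix check, while the MITx pattern's $ anchor requires a trailing slash. A quick sketch using URLs from the tests below:

import re

sloan = r"^https://executive.mit.edu/course/(.*?)"
mitx = r"https://micromasters.mit.edu/(.*?)/$"

assert re.search(sloan, "https://executive.mit.edu/course/innovation-executive-academy/a05U1000005l8nFIAQ.html")
assert re.search(mitx, "https://micromasters.mit.edu/ds/")
assert not re.search(mitx, "https://micromasters.mit.edu/ds")  # no trailing slash, no match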
19 changes: 19 additions & 0 deletions learning_resources/site_scrapers/mitx_program_page_scraper.py
@@ -0,0 +1,19 @@
from selenium.webdriver.common.by import By

from learning_resources.site_scrapers.base_scraper import BaseScraper


class MITXProgramPageScraper(BaseScraper):
    def scrape(self, *args, **kwargs):
        content = super().scrape(*args, **kwargs)
        extra_links = []
        if self.driver:
            for link in self.driver.find_elements(By.CLASS_NAME, "tab-link"):
                link_url = link.get_attribute("href")
                if link_url != self.start_url:
                    extra_links.append(link_url)
            for link_url in extra_links:
                page_content = self.fetch_page(link_url)
                if page_content:
                    content += page_content
        return content
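
A hedged usage sketch (only meaningful with a webdriver configured, since tab links are collected only when self.driver is set; the URL comes from the tests below):

from learning_resources.site_scrapers.mitx_program_page_scraper import (
    MITXProgramPageScraper,
)

scraper = MITXProgramPageScraper("https://micromasters.mit.edu/ds/")
content = scraper.scrape()  # start page HTML, then each distinct "tab-link" page appended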
40 changes: 40 additions & 0 deletions learning_resources/site_scrapers/sloan_course_page_scraper.py
@@ -0,0 +1,40 @@
from selenium.common.exceptions import (
    ElementNotInteractableException,
    JavascriptException,
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait

from learning_resources.site_scrapers.base_scraper import BaseScraper


class SloanCoursePageScraper(BaseScraper):
    def webdriver_fetch_extra_elements(self):
        """
        Attempt to fetch any extra JS-loaded elements that
        require interaction to display
        """
        errors = [
            NoSuchElementException,
            JavascriptException,
            ElementNotInteractableException,
            TimeoutException,
        ]
        wait = WebDriverWait(
            self.driver, timeout=0.1, poll_frequency=0.01, ignored_exceptions=errors
        )
        for tab_id in ["faculty-tab", "reviews-tab", "participants-tab"]:
            wait.until(
                expected_conditions.visibility_of_element_located((By.ID, tab_id))
            )
            self.driver.execute_script(f"document.getElementById('{tab_id}').click()")
        return self.driver.execute_script("return document.body.innerHTML")

    def scrape(self, *args, **kwargs):
        content = super().scrape(*args, **kwargs)
        if self.driver:
            content = self.webdriver_fetch_extra_elements()
        return content
11 changes: 11 additions & 0 deletions learning_resources/site_scrapers/utils.py
@@ -0,0 +1,11 @@
import re

from learning_resources.site_scrapers.base_scraper import BaseScraper
from learning_resources.site_scrapers.constants import SITE_SCRAPER_MAP


def scraper_for_site(url):
    for pattern in SITE_SCRAPER_MAP:
        if re.search(pattern, url):
            return SITE_SCRAPER_MAP[pattern](url)
    return BaseScraper(url)
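
Dispatch then mirrors what tasks.py does below; a minimal sketch (URL from the tests; any URL that matches no pattern falls back to BaseScraper):

from learning_resources.site_scrapers.utils import scraper_for_site

scraper = scraper_for_site("https://micromasters.mit.edu/ds/")  # MITXProgramPageScraper
page_content = scraper.scrape()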
31 changes: 31 additions & 0 deletions learning_resources/site_scrapers/utils_test.py
@@ -0,0 +1,31 @@
import pytest

from learning_resources.site_scrapers.base_scraper import BaseScraper
from learning_resources.site_scrapers.mitx_program_page_scraper import (
    MITXProgramPageScraper,
)
from learning_resources.site_scrapers.sloan_course_page_scraper import (
    SloanCoursePageScraper,
)
from learning_resources.site_scrapers.utils import scraper_for_site


@pytest.mark.parametrize(
    ("url", "expected_scraper_class"),
    [
        ("https://example.com", BaseScraper),
        ("https://micromasters.mit.edu/ds/", MITXProgramPageScraper),
        ("https://unknownsite.com", BaseScraper),
        (
            "https://executive.mit.edu/course/innovation-executive-academy/a05U1000005l8nFIAQ.html",
            SloanCoursePageScraper,
        ),
    ],
)
def test_scraper_for_site(mocker, url, expected_scraper_class):
    """
    Test that scraper_for_site returns the correct scraper class based on the URL
    """

    scraper = scraper_for_site(url)
    assert isinstance(scraper, expected_scraper_class)
6 changes: 4 additions & 2 deletions learning_resources/tasks.py
@@ -22,7 +22,8 @@
 from learning_resources.etl.pipelines import ocw_courses_etl
 from learning_resources.etl.utils import get_learning_course_bucket_name
 from learning_resources.models import ContentFile, LearningResource
-from learning_resources.utils import fetch_page, html_to_markdown, load_course_blocklist
+from learning_resources.site_scrapers.utils import scraper_for_site
+from learning_resources.utils import html_to_markdown, load_course_blocklist
 from learning_resources_search.exceptions import RetryError
 from main.celery import app
 from main.constants import ISOFORMAT
@@ -474,7 +475,8 @@ def scrape_marketing_pages(self):
 def marketing_page_for_resources(resource_ids):
     for learning_resource in LearningResource.objects.filter(id__in=resource_ids):
         marketing_page_url = learning_resource.url
-        page_content = fetch_page(marketing_page_url)
+        scraper = scraper_for_site(marketing_page_url)
+        page_content = scraper.scrape()
         if page_content:
             content_file, _ = ContentFile.objects.update_or_create(
                 learning_resource=learning_resource,
31 changes: 2 additions & 29 deletions learning_resources/tasks_test.py
@@ -25,9 +25,6 @@
     scrape_marketing_pages,
     update_next_start_date_and_prices,
 )
-from learning_resources.utils import (
-    fetch_page,
-)
 
 pytestmark = pytest.mark.django_db
 # pylint:disable=redefined-outer-name,unused-argument,too-many-arguments
@@ -428,31 +425,6 @@ def test_summarize_unprocessed_content(
     assert get_unprocessed_content_file_ids_mock.call_count == 0 if ids else 1
 
 
-@pytest.mark.parametrize("use_webdriver", [True], ids=["with_webdriver"])
-def test_fetch_page_with_webdriver(mocker, use_webdriver, settings):
-    """Test that fetch_page uses WebDriver when settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER is True"""
-
-    settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER = use_webdriver
-
-    mock_driver = mocker.MagicMock()
-    mock_driver.execute_script.return_value = "<html><body>Page content</body></html>"
-    mock_get_web_driver = mocker.patch(
-        "learning_resources.utils._get_web_driver", return_value=mock_driver
-    )
-    mock_webdriver_fetch_extra = mocker.patch(
-        "learning_resources.utils._webdriver_fetch_extra_elements"
-    )
-
-    url = "https://example.com/course"
-    result = fetch_page(url, use_webdriver=use_webdriver)
-
-    assert result == "<html><body>Page content</body></html>"
-    mock_get_web_driver.assert_called_once()
-    mock_driver.get.assert_called_once_with(url)
-    mock_webdriver_fetch_extra.assert_called_once_with(mock_driver)
-    mock_driver.execute_script.assert_called_once_with("return document.body.innerHTML")
-
-
 @pytest.mark.django_db
 def test_marketing_page_for_resources_with_webdriver(mocker, settings):
     """Test that marketing_page_for_resources uses WebDriver to fetch content"""
@@ -468,7 +440,8 @@ def test_marketing_page_for_resources_with_webdriver(mocker, settings):

     html_content = "<html><body><h1>Test Course</h1><p>Course content</p></body></html>"
     mock_fetch_page = mocker.patch(
-        "learning_resources.tasks.fetch_page", return_value=html_content
+        "learning_resources.site_scrapers.base_scraper.BaseScraper.fetch_page",
+        return_value=html_content,
     )
 
     markdown_content = "# Test Course\n\nCourse content"
50 changes: 1 addition & 49 deletions learning_resources/utils.py
@@ -17,16 +17,7 @@
 from django.db.models import Q
 from retry import retry
 from selenium import webdriver
-from selenium.common.exceptions import (
-    ElementNotInteractableException,
-    JavascriptException,
-    NoSuchElementException,
-    TimeoutException,
-)
 from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions
-from selenium.webdriver.support.ui import WebDriverWait
 
 from learning_resources.constants import (
     GROUP_STAFF_LISTS_EDITORS,
@@ -619,7 +610,7 @@ def html_to_markdown(html):


 @cache
-def _get_web_driver():
+def get_web_driver():
     service = webdriver.ChromeService(executable_path=which("chromedriver"))
     chrome_options = Options()
     chrome_options.add_argument("--headless=new")
@@ -633,45 +624,6 @@ def _get_web_driver():
     return webdriver.Chrome(service=service, options=chrome_options)
 
 
-def _webdriver_fetch_extra_elements(driver):
-    """
-    Attempt to Fetch any extra possible js loaded elements that
-    require interaction to display
-    """
-    errors = [
-        NoSuchElementException,
-        JavascriptException,
-        ElementNotInteractableException,
-        TimeoutException,
-    ]
-    wait = WebDriverWait(
-        driver, timeout=0.1, poll_frequency=0.01, ignored_exceptions=errors
-    )
-    for tab_id in ["faculty-tab", "reviews-tab", "participants-tab"]:
-        wait.until(expected_conditions.visibility_of_element_located((By.ID, tab_id)))
-        driver.execute_script(f"document.getElementById('{tab_id}').click()")
-
-
-def fetch_page(url, use_webdriver=settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER):
-    if url:
-        if use_webdriver:
-            driver = _get_web_driver()
-            driver.get(url)
-            try:
-                _webdriver_fetch_extra_elements(driver)
-            except TimeoutException:
-                log.warning("Error custom elements page from %s", url)
-            return driver.execute_script("return document.body.innerHTML")
-        else:
-            try:
-                response = requests.get(url, timeout=10)
-                if response.ok:
-                    return response.text
-            except requests.exceptions.RequestException:
-                log.exception("Error fetching page from %s", url)
-    return None
 
 
 def json_to_markdown(obj, indent=0):
     """
     Recursively converts a JSON object into a readable