diff --git a/learning_resources/conftest.py b/learning_resources/conftest.py
index bc749dd264..45f035a319 100644
--- a/learning_resources/conftest.py
+++ b/learning_resources/conftest.py
@@ -87,7 +87,7 @@ def add_file_to_bucket_recursive(bucket, file_base, s3_base, file_object):
 @pytest.fixture(autouse=True)
 def marketing_metadata_mocks(mocker):
     mocker.patch(
-        "learning_resources.utils.fetch_page",
+        "learning_resources.site_scrapers.base_scraper.BaseScraper.fetch_page",
         return_value="""
diff --git a/learning_resources/site_scrapers/__init__.py b/learning_resources/site_scrapers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/learning_resources/site_scrapers/base_scraper.py b/learning_resources/site_scrapers/base_scraper.py
new file mode 100644
index 0000000000..6276ee58a8
--- /dev/null
+++ b/learning_resources/site_scrapers/base_scraper.py
@@ -0,0 +1,45 @@
+import logging
+
+import requests
+from django.conf import settings
+from selenium.webdriver.support.ui import WebDriverWait
+
+from learning_resources.utils import get_web_driver
+
+logger = logging.getLogger(__name__)
+
+
+class BaseScraper:
+    use_webdriver = settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER
+    driver = None
+
+    def __init__(self, start_url):
+        self.start_url = start_url
+        if self.use_webdriver:
+            self.driver = get_web_driver()
+
+    def fetch_page(self, url):
+        if url:
+            if self.driver:
+                self.driver.get(url)
+                WebDriverWait(self.driver, 10).until(
+                    lambda d: d.execute_script("return document.readyState")
+                    == "complete"
+                )
+                return self.driver.execute_script("return document.body.innerHTML")
+            else:
+                try:
+                    response = requests.get(url, timeout=10)
+                    if response.ok:
+                        return response.text
+                except requests.exceptions.RequestException:
+                    logger.exception("Error fetching page from %s", url)
+        return None
+
+    def scrape(self):
+        page_content = self.fetch_page(self.start_url)
+        if page_content:
+            return page_content
+        else:
+            logger.error("Failed to fetch page content from %s", self.start_url)
+            return None
diff --git a/learning_resources/site_scrapers/constants.py b/learning_resources/site_scrapers/constants.py
new file mode 100644
index 0000000000..f1fa669e9d
--- /dev/null
+++ b/learning_resources/site_scrapers/constants.py
@@ -0,0 +1,11 @@
+from learning_resources.site_scrapers.mitx_program_page_scraper import (
+    MITXProgramPageScraper,
+)
+from learning_resources.site_scrapers.sloan_course_page_scraper import (
+    SloanCoursePageScraper,
+)
+
+SITE_SCRAPER_MAP = {
+    r"^https://executive.mit.edu/course/(.*?)": SloanCoursePageScraper,
+    r"https://micromasters.mit.edu/(.*?)/$": MITXProgramPageScraper,
+}
diff --git a/learning_resources/site_scrapers/mitx_program_page_scraper.py b/learning_resources/site_scrapers/mitx_program_page_scraper.py
new file mode 100644
index 0000000000..3c65210714
--- /dev/null
+++ b/learning_resources/site_scrapers/mitx_program_page_scraper.py
@@ -0,0 +1,19 @@
+from selenium.webdriver.common.by import By
+
+from learning_resources.site_scrapers.base_scraper import BaseScraper
+
+
+class MITXProgramPageScraper(BaseScraper):
+    def scrape(self, *args, **kwargs):
+        content = super().scrape(*args, **kwargs)
+        extra_links = []
+        if self.driver:
+            for link in self.driver.find_elements(By.CLASS_NAME, "tab-link"):
+                link_url = link.get_attribute("href")
+                if link_url != self.start_url:
+                    extra_links.append(link_url)
+            for link_url in extra_links:
+                page_content = self.fetch_page(link_url)
+                if page_content:
+                    content += page_content
+        return content
diff --git a/learning_resources/site_scrapers/sloan_course_page_scraper.py b/learning_resources/site_scrapers/sloan_course_page_scraper.py
new file mode 100644
index 0000000000..c7dc5f672e
--- /dev/null
+++ b/learning_resources/site_scrapers/sloan_course_page_scraper.py
@@ -0,0 +1,40 @@
+from selenium.common.exceptions import (
+    ElementNotInteractableException,
+    JavascriptException,
+    NoSuchElementException,
+    TimeoutException,
+)
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions
+from selenium.webdriver.support.ui import WebDriverWait
+
+from learning_resources.site_scrapers.base_scraper import BaseScraper
+
+
+class SloanCoursePageScraper(BaseScraper):
+    def webdriver_fetch_extra_elements(self):
+        """
+        Attempt to fetch any extra js-loaded elements that
+        require interaction to display
+        """
+        errors = [
+            NoSuchElementException,
+            JavascriptException,
+            ElementNotInteractableException,
+            TimeoutException,
+        ]
+        wait = WebDriverWait(
+            self.driver, timeout=0.1, poll_frequency=0.01, ignored_exceptions=errors
+        )
+        for tab_id in ["faculty-tab", "reviews-tab", "participants-tab"]:
+            wait.until(
+                expected_conditions.visibility_of_element_located((By.ID, tab_id))
+            )
+            self.driver.execute_script(f"document.getElementById('{tab_id}').click()")
+        return self.driver.execute_script("return document.body.innerHTML")
+
+    def scrape(self, *args, **kwargs):
+        content = super().scrape(*args, **kwargs)
+        if self.driver:
+            content = self.webdriver_fetch_extra_elements()
+        return content
diff --git a/learning_resources/site_scrapers/utils.py b/learning_resources/site_scrapers/utils.py
new file mode 100644
index 0000000000..102ba88dae
--- /dev/null
+++ b/learning_resources/site_scrapers/utils.py
@@ -0,0 +1,11 @@
+import re
+
+from learning_resources.site_scrapers.base_scraper import BaseScraper
+from learning_resources.site_scrapers.constants import SITE_SCRAPER_MAP
+
+
+def scraper_for_site(url):
+    for pattern in SITE_SCRAPER_MAP:
+        if re.search(pattern, url):
+            return SITE_SCRAPER_MAP[pattern](url)
+    return BaseScraper(url)
diff --git a/learning_resources/site_scrapers/utils_test.py b/learning_resources/site_scrapers/utils_test.py
new file mode 100644
index 0000000000..181f963d0a
--- /dev/null
+++ b/learning_resources/site_scrapers/utils_test.py
@@ -0,0 +1,31 @@
+import pytest
+
+from learning_resources.site_scrapers.base_scraper import BaseScraper
+from learning_resources.site_scrapers.mitx_program_page_scraper import (
+    MITXProgramPageScraper,
+)
+from learning_resources.site_scrapers.sloan_course_page_scraper import (
+    SloanCoursePageScraper,
+)
+from learning_resources.site_scrapers.utils import scraper_for_site
+
+
+@pytest.mark.parametrize(
+    ("url", "expected_scraper_class"),
+    [
+        ("https://example.com", BaseScraper),
+        ("https://micromasters.mit.edu/ds/", MITXProgramPageScraper),
+        ("https://unknownsite.com", BaseScraper),
+        (
+            "https://executive.mit.edu/course/innovation-executive-academy/a05U1000005l8nFIAQ.html",
+            SloanCoursePageScraper,
+        ),
+    ],
+)
+def test_scraper_for_site(mocker, url, expected_scraper_class):
+    """
+    Test that scraper_for_site returns the correct scraper class based on the URL
+    """
+
+    scraper = scraper_for_site(url)
+    assert isinstance(scraper, expected_scraper_class)
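The dispatch above is regex-keyed: scraper_for_site() walks SITE_SCRAPER_MAP in insertion order and instantiates the first scraper whose pattern matches the URL, falling back to BaseScraper. A minimal sketch of how a future site-specific scraper would plug in; ExampleSiteScraper and its URL pattern are hypothetical and not part of this diff (Django settings are assumed to be configured, since BaseScraper reads them at import time):

    # Hypothetical extension, for illustration only -- not in this changeset.
    from learning_resources.site_scrapers.base_scraper import BaseScraper
    from learning_resources.site_scrapers.constants import SITE_SCRAPER_MAP
    from learning_resources.site_scrapers.utils import scraper_for_site


    class ExampleSiteScraper(BaseScraper):
        def scrape(self, *args, **kwargs):
            # Reuse BaseScraper's requests/webdriver fetch, then post-process
            # the returned HTML as needed for this site.
            content = super().scrape(*args, **kwargs)
            return content


    # Register the scraper under a URL regex, mirroring the entries in
    # constants.py above (hypothetical pattern).
    SITE_SCRAPER_MAP[r"^https://example.mit.edu/course/(.*?)"] = ExampleSiteScraper

    # tasks.marketing_page_for_resources (changed below) resolves scrapers
    # the same way:
    scraper = scraper_for_site("https://example.mit.edu/course/some-course")
    page_content = scraper.scrape()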
diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py
index 60c065941a..28755c66ee 100644
--- a/learning_resources/tasks.py
+++ b/learning_resources/tasks.py
@@ -22,7 +22,8 @@
 from learning_resources.etl.pipelines import ocw_courses_etl
 from learning_resources.etl.utils import get_learning_course_bucket_name
 from learning_resources.models import ContentFile, LearningResource
-from learning_resources.utils import fetch_page, html_to_markdown, load_course_blocklist
+from learning_resources.site_scrapers.utils import scraper_for_site
+from learning_resources.utils import html_to_markdown, load_course_blocklist
 from learning_resources_search.exceptions import RetryError
 from main.celery import app
 from main.constants import ISOFORMAT
@@ -474,7 +475,8 @@ def scrape_marketing_pages(self):
 def marketing_page_for_resources(resource_ids):
     for learning_resource in LearningResource.objects.filter(id__in=resource_ids):
         marketing_page_url = learning_resource.url
-        page_content = fetch_page(marketing_page_url)
+        scraper = scraper_for_site(marketing_page_url)
+        page_content = scraper.scrape()
         if page_content:
             content_file, _ = ContentFile.objects.update_or_create(
                 learning_resource=learning_resource,
diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py
index 44a7743500..29ffc725ed 100644
--- a/learning_resources/tasks_test.py
+++ b/learning_resources/tasks_test.py
@@ -25,9 +25,6 @@
     scrape_marketing_pages,
     update_next_start_date_and_prices,
 )
-from learning_resources.utils import (
-    fetch_page,
-)
 
 pytestmark = pytest.mark.django_db
 # pylint:disable=redefined-outer-name,unused-argument,too-many-arguments
@@ -428,31 +425,6 @@ def test_summarize_unprocessed_content(
     assert get_unprocessed_content_file_ids_mock.call_count == 0 if ids else 1
 
 
-@pytest.mark.parametrize("use_webdriver", [True], ids=["with_webdriver"])
-def test_fetch_page_with_webdriver(mocker, use_webdriver, settings):
-    """Test that fetch_page uses WebDriver when settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER is True"""
-
-    settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER = use_webdriver
-
-    mock_driver = mocker.MagicMock()
-    mock_driver.execute_script.return_value = "Page content"
-    mock_get_web_driver = mocker.patch(
-        "learning_resources.utils._get_web_driver", return_value=mock_driver
-    )
-    mock_webdriver_fetch_extra = mocker.patch(
-        "learning_resources.utils._webdriver_fetch_extra_elements"
-    )
-
-    url = "https://example.com/course"
-    result = fetch_page(url, use_webdriver=use_webdriver)
-
-    assert result == "Page content"
-    mock_get_web_driver.assert_called_once()
-    mock_driver.get.assert_called_once_with(url)
-    mock_webdriver_fetch_extra.assert_called_once_with(mock_driver)
-    mock_driver.execute_script.assert_called_once_with("return document.body.innerHTML")
-
-
 @pytest.mark.django_db
 def test_marketing_page_for_resources_with_webdriver(mocker, settings):
     """Test that marketing_page_for_resources uses WebDriver to fetch content"""
@@ -468,7 +440,8 @@ def test_marketing_page_for_resources_with_webdriver(mocker, settings):
     html_content = (
         "<html><body><h1>Test Course</h1><p>Course content</p></body></html>"
     )
     mock_fetch_page = mocker.patch(
-        "learning_resources.tasks.fetch_page", return_value=html_content
+        "learning_resources.site_scrapers.base_scraper.BaseScraper.fetch_page",
+        return_value=html_content,
     )
     markdown_content = "# Test Course\n\nCourse content"
diff --git a/learning_resources/utils.py b/learning_resources/utils.py
index 85802bd8d9..24b848ed19 100644
--- a/learning_resources/utils.py
+++ b/learning_resources/utils.py
@@ -17,16 +17,7 @@
 from django.db.models import Q
 from retry import retry
 from selenium import webdriver
-from selenium.common.exceptions import (
-    ElementNotInteractableException,
-    JavascriptException,
-    NoSuchElementException,
-    TimeoutException,
-)
 from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions
-from selenium.webdriver.support.ui import WebDriverWait
 
 from learning_resources.constants import (
     GROUP_STAFF_LISTS_EDITORS,
@@ -619,7 +610,7 @@ def html_to_markdown(html):
 
 
 @cache
-def _get_web_driver():
+def get_web_driver():
     service = webdriver.ChromeService(executable_path=which("chromedriver"))
     chrome_options = Options()
     chrome_options.add_argument("--headless=new")
@@ -633,45 +624,6 @@
     return webdriver.Chrome(service=service, options=chrome_options)
 
 
-def _webdriver_fetch_extra_elements(driver):
-    """
-    Attempt to Fetch any extra possible js loaded elements that
-    require interaction to display
-    """
-    errors = [
-        NoSuchElementException,
-        JavascriptException,
-        ElementNotInteractableException,
-        TimeoutException,
-    ]
-    wait = WebDriverWait(
-        driver, timeout=0.1, poll_frequency=0.01, ignored_exceptions=errors
-    )
-    for tab_id in ["faculty-tab", "reviews-tab", "participants-tab"]:
-        wait.until(expected_conditions.visibility_of_element_located((By.ID, tab_id)))
-        driver.execute_script(f"document.getElementById('{tab_id}').click()")
-
-
-def fetch_page(url, use_webdriver=settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER):
-    if url:
-        if use_webdriver:
-            driver = _get_web_driver()
-            driver.get(url)
-            try:
-                _webdriver_fetch_extra_elements(driver)
-            except TimeoutException:
-                log.warning("Error custom elements page from %s", url)
-            return driver.execute_script("return document.body.innerHTML")
-        else:
-            try:
-                response = requests.get(url, timeout=10)
-                if response.ok:
-                    return response.text
-            except requests.exceptions.RequestException:
-                log.exception("Error fetching page from %s", url)
-    return None
-
-
 def json_to_markdown(obj, indent=0):
     """
     Recursively converts a JSON object into a readable