From 3cfd3cbc415309b855633238cfd21568823714bd Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:38:05 +0100
Subject: [PATCH 01/11] Add type hints to functions in scrape_movie_data

---
 backend/utils/scrape_movie_data.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/backend/utils/scrape_movie_data.py b/backend/utils/scrape_movie_data.py
index bc82fd5..7ce43b0 100644
--- a/backend/utils/scrape_movie_data.py
+++ b/backend/utils/scrape_movie_data.py
@@ -1,8 +1,9 @@
 import asyncio
 from aiohttp import ClientSession
 from lxml import html
+from typing import List, Dict, Any
 
-async def get_movie_data(url, session, movie, username):
+async def get_movie_data(url, session, movie, username) -> Dict[str, Any]:
     """Gets a movie's TMDB ID from a Letterboxd URL."""
     async with session.get(url) as r:
         response = await r.text()
@@ -65,7 +66,7 @@ async def get_movie_data(url, session, movie, username):
             'user_ratings': []
         }
 
-        if movie != None:
+        if movie != None and username != None:
             movie_data['user_ratings'].append({
                 'username': username,
                 'liked': movie['liked'],
@@ -77,7 +78,7 @@ async def get_movie_data(url, session, movie, username):
         print(f"Error processing data for {url}!")
         return None
 
-async def scrape_movies(movie_list: list, username: str):
+async def scrape_movies(movie_list: list, username: str) -> List[Dict[str, Any]]:
     url = "https://letterboxd.com/film/{}/"
 
     async with ClientSession() as session:

From a08fc7a9437fc0534295839828dcbefeba66de00 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:38:32 +0100
Subject: [PATCH 02/11] Add playwright to requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 3bc82cf..6e25e47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 aiohttp==3.10.5
 flask==3.0.3
 lxml==4.9.4
+playwright==1.46.0
 pymongo==4.8.0
 redis==5.0.8
 tmdbv3api==1.9.0

From ab7960389176cb335783770bccd6ea3e9131aa1b Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:39:39 +0100
Subject: [PATCH 03/11] Create new function to scrape movies by popularity.

This is extremely slow and could be unreliable.
---
 backend/utils/__init__.py                     |  3 +-
 ...ser_ratings.py => scrape_movie_gallery.py} | 30 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 rename backend/utils/{scrape_user_ratings.py => scrape_movie_gallery.py} (70%)

diff --git a/backend/utils/__init__.py b/backend/utils/__init__.py
index 1d29d39..0ce633b 100644
--- a/backend/utils/__init__.py
+++ b/backend/utils/__init__.py
@@ -1,5 +1,5 @@
 from .fetch_tmdb_data import get_tmdb_data, init_tmdb
-from .scrape_user_ratings import scrape_user_ratings
+from .scrape_movie_gallery import scrape_user_ratings, scrape_popular_pages
 from .scrape_movie_data import scrape_movies
 from .user_movie_preprocessing import save_user_data_to_db, write_to_csv, get_user_movie_data
 
@@ -7,6 +7,7 @@
     'get_tmdb_data',
     'init_tmdb',
     'scrape_user_ratings',
+    'scrape_popular_pages',
     'scrape_movies',
     'save_user_data_to_db',
     'write_to_csv',
diff --git a/backend/utils/scrape_user_ratings.py b/backend/utils/scrape_movie_gallery.py
similarity index 70%
rename from backend/utils/scrape_user_ratings.py
rename to backend/utils/scrape_movie_gallery.py
index 1a07976..e29bfe4 100644
--- a/backend/utils/scrape_user_ratings.py
+++ b/backend/utils/scrape_movie_gallery.py
@@ -2,6 +2,8 @@
 import aiohttp
 from lxml import html
 from typing import List, Dict, Any
+from playwright.async_api import async_playwright
+import re
 
 PAGES_PER_BATCH = 30
 BATCH_DELAY = 0.5
@@ -59,4 +61,32 @@ async def _fetch_page(session, url):
                 "rating": rating
             })
 
+    return film_data
+
+async def scrape_popular_pages(num_pages: int) -> List[Dict[str, Any]]:
+    async def _fetch_page(page, url):
+        await page.goto(url)
+        await page.wait_for_selector('.poster-container')
+        return await page.content()
+
+    base_url = "https://letterboxd.com/films/popular/"
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        page = await browser.new_page()
+
+        urls = [base_url] + [f"{base_url}page/{page_num}/"
+                             for page_num in range(2, num_pages + 1)]
+
+        film_data = []
+        for url in urls:
+            content = await _fetch_page(page, url)
+            film_slugs = re.findall(r'data-film-slug="([^"]+)"', content)
+            film_data.extend([{"film_slug": slug} for slug in film_slugs])
+
+
+            if urls.index(url) < len(urls) - 1:
+                await asyncio.sleep(BATCH_DELAY)
+
+        await browser.close()
     return film_data
\ No newline at end of file

From b257b4861a4c83a586dbb39895f7ed0438feece4 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:43:46 +0100
Subject: [PATCH 04/11] Add warning to scrape_popular_pages as it is unreliable

---
 backend/utils/scrape_movie_gallery.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backend/utils/scrape_movie_gallery.py b/backend/utils/scrape_movie_gallery.py
index e29bfe4..935fbda 100644
--- a/backend/utils/scrape_movie_gallery.py
+++ b/backend/utils/scrape_movie_gallery.py
@@ -64,6 +64,10 @@ async def _fetch_page(session, url):
     return film_data
 
 async def scrape_popular_pages(num_pages: int) -> List[Dict[str, Any]]:
+    """Scrapes Letterboxd by most popular movies.
+
+    WARNING: Very Slow. We may need to consider another method.
+    """
     async def _fetch_page(page, url):
         await page.goto(url)
         await page.wait_for_selector('.poster-container')

From 50091ff5938278f58c52019596c0a1841cedcc8b Mon Sep 17 00:00:00 2001
From: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Date: Wed, 28 Aug 2024 01:02:39 +0100
Subject: [PATCH 05/11] Create pylint.yml

---
 .github/workflows/pylint.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..c73e032
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,23 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint
+    - name: Analysing the code with pylint
+      run: |
+        pylint $(git ls-files '*.py')

From debdad80e581ae906e2c69e0268242c93980fcc6 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:05:48 +0100
Subject: [PATCH 06/11] Rewrote database test script using pytest.

---
 backend/tests/test_database.py | 31 +++++++++++++++----------------
 requirements.txt               |  2 ++
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/backend/tests/test_database.py b/backend/tests/test_database.py
index 32bf39e..c3a2b15 100644
--- a/backend/tests/test_database.py
+++ b/backend/tests/test_database.py
@@ -1,27 +1,26 @@
+import pytest
 import sys
 import os
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from database import *
-from utils import *
 
-def test_database_connection():
+@pytest.fixture(scope="module")
+def db_connection():
     client, db = connect_to_mongodb(config_path="../config.json")
-    assert db is not None
-    return client, db
+    yield db
+    client.close()
 
-def test_insert_and_retrieve_movie(db):
-    test_movie = {"tmdb_id": 12345, "title": "Test Movie"}
-    insert_movie(db, test_movie)
-    movies = get_all_movies(db)
-    assert any(movie['tmdb_id'] == 12345 for movie in movies)
-    delete_movie_by_id(db, 12345)
+def test_database_connection(db_connection):
+    assert db_connection is not None
 
-if __name__ == "__main__":
-    client, db = test_database_connection()
-    print("Connected to database!")
+def test_insert_and_retrieve_movie(db_connection):
+    test_movie = {"tmdb_id": 12345, "title": "Test Movie"}
+    insert_movie(db_connection, test_movie)
 
-    test_insert_and_retrieve_movie(db)
-    print("Inserted and retrieved movie!")
+    movies = get_all_movies(db_connection)
+    assert any(movie['tmdb_id'] == 12345 for movie in movies)
 
-    print("All tests passed!")
\ No newline at end of file
+    delete_movie_by_id(db_connection, 12345)
+    movies_after_deletion = get_all_movies(db_connection)
+    assert not any(movie['tmdb_id'] == 12345 for movie in movies_after_deletion)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 6e25e47..04feebc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,8 @@ flask==3.0.3
 lxml==4.9.4
 playwright==1.46.0
 pymongo==4.8.0
+pytest==8.3.2
+pytest-asyncio==0.24.0
 redis==5.0.8
 tmdbv3api==1.9.0
 urllib3==2.2.1
\ No newline at end of file

From 670767f884372fffe9d96b51f7a658906def9757 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:17:03 +0100
Subject: [PATCH 07/11] Rewrote TMDB API testing using pytest; proper asserts
 on all dictionaries performed

---
 backend/tests/test_tmdb_api.py | 58 ++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/backend/tests/test_tmdb_api.py b/backend/tests/test_tmdb_api.py
index 485af51..dd675fd 100644
--- a/backend/tests/test_tmdb_api.py
+++ b/backend/tests/test_tmdb_api.py
@@ -1,11 +1,63 @@
+import pytest
 import sys
 import os
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from utils import *
 
-if __name__ == "__main__":
-    init_tmdb()
+@pytest.fixture(scope="module")
+def tmdb_setup():
+    init_tmdb("../config.json")
+
+def remove_key_from_list_of_dicts(lst, key):
+    """Remove specified key from all dictionaries in a list."""
+    for item in lst:
+        if key in item:
+            del item[key]
+    return lst
+
+def test_get_tmdb_data(tmdb_setup):
     movies = [1730, 104871, 31414, 76819]
+    expected_info = [
+        {
+            'tmdb_id': 1730,
+            'title': 'Inland Empire',
+            'directors': [{'id': 5602, 'name': 'David Lynch'}],
+            'genres': ['Thriller', 'Mystery', 'Fantasy', 'Horror'],
+            'release_year': 2006,
+            'runtime': 180,
+            'user_rating': []
+        },
+        {
+            'tmdb_id': 104871,
+            'title': 'Sicily!',
+            'directors': [{'id': 935136, 'name': 'Jean-Marie Straub'},
+                          {'id': 935137, 'name': 'Danièle Huillet'}],
+            'genres': ['Drama'],
+            'release_year': 1999,
+            'runtime': 66,
+            'user_rating': []
+        },
+        {
+            'tmdb_id': 31414,
+            'title': 'Satantango',
+            'directors': [{'id': 85637, 'name': 'Béla Tarr'}],
+            'genres': ['Drama'],
+            'release_year': 1994,
+            'runtime': 432,
+            'user_rating': []
+        },
+        {
+            'tmdb_id': 76819,
+            'title': 'Teenage Hooker Became A Killing Machine In Daehakro',
+            'directors': [{'id': 1371835, 'name': 'Nam Gee-woong'}],
+            'genres': ['Science Fiction', 'Horror'],
+            'release_year': 2000,
+            'runtime': 60,
+            'user_rating': []
+        }
+    ]
+
     info = [get_tmdb_data(movie) for movie in movies]
-    print(info)
\ No newline at end of file
+    info_filtered = remove_key_from_list_of_dicts(info, 'popularity')
+    assert info_filtered == expected_info
\ No newline at end of file

From 72dac81491019366ef8563986fe73f21772a0103 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:17:54 +0100
Subject: [PATCH 08/11] Test API is redundant

---
 backend/tests/test_api.py | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100644 backend/tests/test_api.py

diff --git a/backend/tests/test_api.py b/backend/tests/test_api.py
deleted file mode 100644
index 6e7b105..0000000
--- a/backend/tests/test_api.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import requests
-
-if __name__ == "__main__":
-    r = requests.get("http://localhost:5000/movies")
-    data = r.json()
-    print(data)
\ No newline at end of file

From c79ae58da8cb3c7ec0199c8c011bf85b5c46616e Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:28:52 +0100
Subject: [PATCH 09/11] Update test data integration to fetch mscorsese's
 fields

---
 backend/tests/test_data_integration.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/backend/tests/test_data_integration.py b/backend/tests/test_data_integration.py
index cf54ebf..2265968 100644
--- a/backend/tests/test_data_integration.py
+++ b/backend/tests/test_data_integration.py
@@ -1,13 +1,24 @@
+import pytest
 import sys
 import os
-import asyncio
 
 PARENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(PARENT_DIR)
 
 from utils import write_to_csv, save_user_data_to_db, get_user_movie_data
 
-if __name__ == "__main__":
+@pytest.mark.asyncio
+async def test_get_user_movie_data():
     username = "mscorsese"
-    user_movie_data = asyncio.run(get_user_movie_data(username))
-    write_to_csv(username, user_movie_data)
\ No newline at end of file
+    user_movie_data = await get_user_movie_data(username)
+
+    assert isinstance(user_movie_data, list), "Expected a list of movie data"
+    assert len(user_movie_data) > 0, "Expected at least one movie in the list"
+
+    fields = ['tmdb_id', 'title', 'directors', 'genres',
+              'release_year', 'num_ratings', 'avg_rating',
+              'runtime', 'user_ratings']
+
+    for movie in user_movie_data:
+        for field in fields:
+            assert field in movie
\ No newline at end of file

From 10309f517d4d1188d51695261a2becaa7e694d38 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:33:29 +0100
Subject: [PATCH 10/11] Lbxd scraper converted into a pytest

---
 backend/tests/test_lbxd_scraper.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/backend/tests/test_lbxd_scraper.py b/backend/tests/test_lbxd_scraper.py
index 8f94f82..1c23c41 100644
--- a/backend/tests/test_lbxd_scraper.py
+++ b/backend/tests/test_lbxd_scraper.py
@@ -1,3 +1,4 @@
+import pytest
 import sys
 import os
 import asyncio
@@ -5,8 +6,14 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from utils import *
 
-if __name__ == "__main__":
-    movies = [{'film_slug':'inland-empire'}, {'film_slug':'satantango'},
-              {'film_slug':'sicily'}]
-    info = asyncio.run(scrape_movies(movies))
-    print(info)
\ No newline at end of file
+@pytest.mark.asyncio
+async def test_scrape_movies():
+    username = "mscorsese"
+    expected_info = [
+        {
+            'film_slug': 'yeelen',
+            'title': 'yeelen',
+        },
+    ]
+    user_ratings = await scrape_user_ratings(username)
+    info = await scrape_movies(user_ratings, username=username)
\ No newline at end of file

From 0382273dae2702252f18b35c411788cab82c623e Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:38:23 +0100
Subject: [PATCH 11/11] Update pylint to include needed python packages

---
 .github/workflows/pylint.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index c73e032..3e0d16c 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -18,6 +18,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install pylint
+        pip install aiohttp==3.10.5 flask==3.0.3 lxml==4.9.4 playwright==1.46.0 pymongo==4.8.0 pytest==8.3.2 pytest-asyncio==0.24.0 redis==5.0.8 tmdbv3api==1.9.0 urllib3==2.2.1
     - name: Analysing the code with pylint
       run: |
         pylint $(git ls-files '*.py')