From 3cfd3cbc415309b855633238cfd21568823714bd Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:38:05 +0100
Subject: [PATCH 01/11] Add type hints to functions in scrape_movie_data

---
 backend/utils/scrape_movie_data.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/backend/utils/scrape_movie_data.py b/backend/utils/scrape_movie_data.py
index bc82fd5..7ce43b0 100644
--- a/backend/utils/scrape_movie_data.py
+++ b/backend/utils/scrape_movie_data.py
@@ -1,8 +1,9 @@
 import asyncio
 from aiohttp import ClientSession
 from lxml import html
+from typing import List, Dict, Any
 
-async def get_movie_data(url, session, movie, username):
+async def get_movie_data(url, session, movie, username) -> Dict[str, Any]:
     """Gets a movie's TMDB ID from a Letterboxd URL."""
     async with session.get(url) as r:
         response = await r.text()
@@ -65,7 +66,7 @@ async def get_movie_data(url, session, movie, username):
             'user_ratings': []
         }
 
-        if movie != None:
+        if movie != None and username != None:
             movie_data['user_ratings'].append({
                 'username': username,
                 'liked': movie['liked'],
@@ -77,7 +78,7 @@ async def get_movie_data(url, session, movie, username):
         print(f"Error processing data for {url}!")
         return None
 
-async def scrape_movies(movie_list: list, username: str):
+async def scrape_movies(movie_list: list, username: str) -> List[Dict[str, Any]]:
     url = "https://letterboxd.com/film/{}/"
 
     async with ClientSession() as session:

From a08fc7a9437fc0534295839828dcbefeba66de00 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:38:32 +0100
Subject: [PATCH 02/11] Add playwright to requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 3bc82cf..6e25e47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 aiohttp==3.10.5
 flask==3.0.3
 lxml==4.9.4
+playwright==1.46.0
 pymongo==4.8.0
 redis==5.0.8
 tmdbv3api==1.9.0

From ab7960389176cb335783770bccd6ea3e9131aa1b Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:39:39 +0100
Subject: [PATCH 03/11] Create new function to scrape movies by popularity.

This is extremely slow and could be unreliable.
---
 backend/utils/__init__.py                     |  3 +-
 ...ser_ratings.py => scrape_movie_gallery.py} | 30 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 rename backend/utils/{scrape_user_ratings.py => scrape_movie_gallery.py} (70%)

diff --git a/backend/utils/__init__.py b/backend/utils/__init__.py
index 1d29d39..0ce633b 100644
--- a/backend/utils/__init__.py
+++ b/backend/utils/__init__.py
@@ -1,5 +1,5 @@
 from .fetch_tmdb_data import get_tmdb_data, init_tmdb
-from .scrape_user_ratings import scrape_user_ratings
+from .scrape_movie_gallery import scrape_user_ratings, scrape_popular_pages
 from .scrape_movie_data import scrape_movies
 from .user_movie_preprocessing import save_user_data_to_db, write_to_csv, get_user_movie_data
 
@@ -7,6 +7,7 @@
     'get_tmdb_data',
     'init_tmdb',
     'scrape_user_ratings',
+    'scrape_popular_pages',
     'scrape_movies',
     'save_user_data_to_db',
     'write_to_csv',
diff --git a/backend/utils/scrape_user_ratings.py b/backend/utils/scrape_movie_gallery.py
similarity index 70%
rename from backend/utils/scrape_user_ratings.py
rename to backend/utils/scrape_movie_gallery.py
index 1a07976..e29bfe4 100644
--- a/backend/utils/scrape_user_ratings.py
+++ b/backend/utils/scrape_movie_gallery.py
@@ -2,6 +2,8 @@
 import aiohttp
 from lxml import html
 from typing import List, Dict, Any
+from playwright.async_api import async_playwright
+import re
 
 PAGES_PER_BATCH = 30
 BATCH_DELAY = 0.5
@@ -59,4 +61,32 @@ async def _fetch_page(session, url):
                 "rating": rating
             })
 
+    return film_data
+
+async def scrape_popular_pages(num_pages: int) -> List[Dict[str, Any]]:
+    async def _fetch_page(page, url):
+        await page.goto(url)
+        await page.wait_for_selector('.poster-container')
+        return await page.content()
+
+    base_url = "https://letterboxd.com/films/popular/"
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        page = await browser.new_page()
+
+        urls = [base_url] + [f"{base_url}page/{page_num}/"
+                             for page_num in range(2, num_pages + 1)]
+
+        film_data = []
+        for url in urls:
+            content = await _fetch_page(page, url)
+            film_slugs = re.findall(r'data-film-slug="([^"]+)"', content)
+            film_data.extend([{"film_slug": slug} for slug in film_slugs])
+
+
+            if urls.index(url) < len(urls) - 1:
+                await asyncio.sleep(BATCH_DELAY)
+
+        await browser.close()
     return film_data
\ No newline at end of file

From b257b4861a4c83a586dbb39895f7ed0438feece4 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Tue, 27 Aug 2024 23:43:46 +0100
Subject: [PATCH 04/11] Add warning to scrape_popular_pages as it is unreliable

---
 backend/utils/scrape_movie_gallery.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backend/utils/scrape_movie_gallery.py b/backend/utils/scrape_movie_gallery.py
index e29bfe4..935fbda 100644
--- a/backend/utils/scrape_movie_gallery.py
+++ b/backend/utils/scrape_movie_gallery.py
@@ -64,6 +64,10 @@ async def _fetch_page(session, url):
     return film_data
 
 async def scrape_popular_pages(num_pages: int) -> List[Dict[str, Any]]:
+    """Scrapes Letterboxd by most popular movies.
+
+    WARNING: Very Slow. We may need to consider another method.
+    """
     async def _fetch_page(page, url):
         await page.goto(url)
         await page.wait_for_selector('.poster-container')

From 50091ff5938278f58c52019596c0a1841cedcc8b Mon Sep 17 00:00:00 2001
From: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Date: Wed, 28 Aug 2024 01:02:39 +0100
Subject: [PATCH 05/11] Create pylint.yml

---
 .github/workflows/pylint.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..c73e032
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,23 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint
+    - name: Analysing the code with pylint
+      run: |
+        pylint $(git ls-files '*.py')

From debdad80e581ae906e2c69e0268242c93980fcc6 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:05:48 +0100
Subject: [PATCH 06/11] Rewrote database test script using pytest.

---
 backend/tests/test_database.py | 31 +++++++++++++++----------------
 requirements.txt               |  2 ++
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/backend/tests/test_database.py b/backend/tests/test_database.py
index 32bf39e..c3a2b15 100644
--- a/backend/tests/test_database.py
+++ b/backend/tests/test_database.py
@@ -1,27 +1,26 @@
+import pytest
 import sys
 import os
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from database import *
-from utils import *
 
-def test_database_connection():
+@pytest.fixture(scope="module")
+def db_connection():
     client, db = connect_to_mongodb(config_path="../config.json")
-    assert db is not None
-    return client, db
+    yield db
+    client.close()
 
-def test_insert_and_retrieve_movie(db):
-    test_movie = {"tmdb_id": 12345, "title": "Test Movie"}
-    insert_movie(db, test_movie)
-    movies = get_all_movies(db)
-    assert any(movie['tmdb_id'] == 12345 for movie in movies)
-    delete_movie_by_id(db, 12345)
+def test_database_connection(db_connection):
+    assert db_connection is not None
 
-if __name__ == "__main__":
-    client, db = test_database_connection()
-    print("Connected to database!")
+def test_insert_and_retrieve_movie(db_connection):
+    test_movie = {"tmdb_id": 12345, "title": "Test Movie"}
+    insert_movie(db_connection, test_movie)
 
-    test_insert_and_retrieve_movie(db)
-    print("Inserted and retrieved movie!")
+    movies = get_all_movies(db_connection)
+    assert any(movie['tmdb_id'] == 12345 for movie in movies)
 
-    print("All tests passed!")
\ No newline at end of file
+    delete_movie_by_id(db_connection, 12345)
+    movies_after_deletion = get_all_movies(db_connection)
+    assert not any(movie['tmdb_id'] == 12345 for movie in movies_after_deletion)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 6e25e47..04feebc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,8 @@ flask==3.0.3
 lxml==4.9.4
 playwright==1.46.0
 pymongo==4.8.0
+pytest==8.3.2
+pytest-asyncio==0.24.0
 redis==5.0.8
 tmdbv3api==1.9.0
 urllib3==2.2.1
\ No newline at end of file

From 670767f884372fffe9d96b51f7a658906def9757 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:17:03 +0100
Subject: [PATCH 07/11] Rewrote TMDB API testing using pytest; proper asserts
 on all dictionaries performed

---
 backend/tests/test_tmdb_api.py | 58 ++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/backend/tests/test_tmdb_api.py b/backend/tests/test_tmdb_api.py
index 485af51..dd675fd 100644
--- a/backend/tests/test_tmdb_api.py
+++ b/backend/tests/test_tmdb_api.py
@@ -1,11 +1,63 @@
+import pytest
 import sys
 import os
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from utils import *
 
-if __name__ == "__main__":
-    init_tmdb()
+@pytest.fixture(scope="module")
+def tmdb_setup():
+    init_tmdb("../config.json")
+
+def remove_key_from_list_of_dicts(lst, key):
+    """Remove specified key from all dictionaries in a list."""
+    for item in lst:
+        if key in item:
+            del item[key]
+    return lst
+
+def test_get_tmdb_data(tmdb_setup):
     movies = [1730, 104871, 31414, 76819]
+    expected_info = [
+        {
+            'tmdb_id': 1730,
+            'title': 'Inland Empire',
+            'directors': [{'id': 5602, 'name': 'David Lynch'}],
+            'genres': ['Thriller', 'Mystery', 'Fantasy', 'Horror'],
+            'release_year': 2006,
+            'runtime': 180,
+            'user_rating': []
+        },
+        {
+            'tmdb_id': 104871,
+            'title': 'Sicily!',
+            'directors': [{'id': 935136, 'name': 'Jean-Marie Straub'},
+                          {'id': 935137, 'name': 'Danièle Huillet'}],
+            'genres': ['Drama'],
+            'release_year': 1999,
+            'runtime': 66,
+            'user_rating': []
+        },
+        {
+            'tmdb_id': 31414,
+            'title': 'Satantango',
+            'directors': [{'id': 85637, 'name': 'Béla Tarr'}],
+            'genres': ['Drama'],
+            'release_year': 1994,
+            'runtime': 432,
+            'user_rating': []
+        },
+        {
+            'tmdb_id': 76819,
+            'title': 'Teenage Hooker Became A Killing Machine In Daehakro',
+            'directors': [{'id': 1371835, 'name': 'Nam Gee-woong'}],
+            'genres': ['Science Fiction', 'Horror'],
+            'release_year': 2000,
+            'runtime': 60,
+            'user_rating': []
+        }
+    ]
+
     info = [get_tmdb_data(movie) for movie in movies]
-    print(info)
\ No newline at end of file
+    info_filtered = remove_key_from_list_of_dicts(info, 'popularity')
+    assert info_filtered == expected_info
\ No newline at end of file

From 72dac81491019366ef8563986fe73f21772a0103 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:17:54 +0100
Subject: [PATCH 08/11] Test API is redundant

---
 backend/tests/test_api.py | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100644 backend/tests/test_api.py

diff --git a/backend/tests/test_api.py b/backend/tests/test_api.py
deleted file mode 100644
index 6e7b105..0000000
--- a/backend/tests/test_api.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import requests
-
-if __name__ == "__main__":
-    r = requests.get("http://localhost:5000/movies")
-    data = r.json()
-    print(data)
\ No newline at end of file

From c79ae58da8cb3c7ec0199c8c011bf85b5c46616e Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:28:52 +0100
Subject: [PATCH 09/11] Update test data integration to fetch mscorsese's
 fields

---
 backend/tests/test_data_integration.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/backend/tests/test_data_integration.py b/backend/tests/test_data_integration.py
index cf54ebf..2265968 100644
--- a/backend/tests/test_data_integration.py
+++ b/backend/tests/test_data_integration.py
@@ -1,13 +1,24 @@
+import pytest
 import sys
 import os
-import asyncio
 
 PARENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(PARENT_DIR)
 
 from utils import write_to_csv, save_user_data_to_db, get_user_movie_data
 
-if __name__ == "__main__":
+@pytest.mark.asyncio
+async def test_get_user_movie_data():
     username = "mscorsese"
-    user_movie_data = asyncio.run(get_user_movie_data(username))
-    write_to_csv(username, user_movie_data)
\ No newline at end of file
+    user_movie_data = await get_user_movie_data(username)
+
+    assert isinstance(user_movie_data, list), "Expected a list of movie data"
+    assert len(user_movie_data) > 0, "Expected at least one movie in the list"
+
+    fields = ['tmdb_id', 'title', 'directors', 'genres',
+              'release_year', 'num_ratings', 'avg_rating',
+              'runtime', 'user_ratings']
+
+    for movie in user_movie_data:
+        for field in fields:
+            assert field in movie
\ No newline at end of file

From 10309f517d4d1188d51695261a2becaa7e694d38 Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:33:29 +0100
Subject: [PATCH 10/11] Lbxd scraper converted into a pytest

---
 backend/tests/test_lbxd_scraper.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/backend/tests/test_lbxd_scraper.py b/backend/tests/test_lbxd_scraper.py
index 8f94f82..1c23c41 100644
--- a/backend/tests/test_lbxd_scraper.py
+++ b/backend/tests/test_lbxd_scraper.py
@@ -1,3 +1,4 @@
+import pytest
 import sys
 import os
 import asyncio
@@ -5,8 +6,14 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from utils import *
 
-if __name__ == "__main__":
-    movies = [{'film_slug':'inland-empire'}, {'film_slug':'satantango'},
-              {'film_slug':'sicily'}]
-    info = asyncio.run(scrape_movies(movies))
-    print(info)
\ No newline at end of file
+@pytest.mark.asyncio
+async def test_scrape_movies():
+    username = "mscorsese"
+    expected_info = [
+        {
+            'film_slug': 'yeelen',
+            'title': 'yeelen',
+        },
+    ]
+    user_ratings = await scrape_user_ratings(username)
+    info = await scrape_movies(user_ratings, username=username)
\ No newline at end of file

From 0382273dae2702252f18b35c411788cab82c623e Mon Sep 17 00:00:00 2001
From: Samuel Wu
Date: Wed, 28 Aug 2024 18:38:23 +0100
Subject: [PATCH 11/11] Update pylint to include needed python packages

---
 .github/workflows/pylint.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index c73e032..3e0d16c 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -18,6 +18,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install pylint
+        pip install aiohttp==3.10.5 flask==3.0.3 lxml==4.9.4 playwright==1.46.0 pymongo==4.8.0 pytest==8.3.2 pytest-asyncio==0.24.0 redis==5.0.8 tmdbv3api==1.9.0 urllib3==2.2.1
     - name: Analysing the code with pylint
       run: |
         pylint $(git ls-files '*.py')