Skip to content

Commit

Permalink
Merge pull request #36 from cern1710/35-scrape-letterboxd-by-popular-…
Browse files Browse the repository at this point in the history
…movies

Scrape letterboxd by popular movies
  • Loading branch information
cern1710 authored Aug 28, 2024
2 parents c137217 + 8c498d1 commit 1161a34
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 4 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install pylint
pip install aiohttp==3.10.5 flask==3.0.3 lxml==4.9.4 playwright==1.46.0 pymongo==4.8.0 pytest==8.3.2 pytest-asyncio==0.24.0 redis==5.0.8 tmdbv3api==1.9.0 urllib3==2.2.1
- name: Analysing the code with pylint
run: |
pylint $(git ls-files '*.py')
3 changes: 2 additions & 1 deletion backend/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from .fetch_tmdb_data import get_tmdb_data, init_tmdb
from .scrape_user_ratings import scrape_user_ratings
from .scrape_movie_gallery import scrape_user_ratings, scrape_popular_pages
from .scrape_movie_data import scrape_movies
from .user_movie_preprocessing import save_user_data_to_db, write_to_csv, get_user_movie_data

__all__ = [
'get_tmdb_data',
'init_tmdb',
'scrape_user_ratings',
'scrape_popular_pages',
'scrape_movies',
'save_user_data_to_db',
'write_to_csv',
Expand Down
7 changes: 4 additions & 3 deletions backend/utils/scrape_movie_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import asyncio
from aiohttp import ClientSession
from lxml import html
from typing import List, Dict, Any

async def get_movie_data(url, session, movie, username):
async def get_movie_data(url, session, movie, username) -> Dict[str, Any]:
"""Gets a movie's TMDB ID from a Letterboxd URL."""
async with session.get(url) as r:
response = await r.text()
Expand Down Expand Up @@ -65,7 +66,7 @@ async def get_movie_data(url, session, movie, username):
'user_ratings': []
}

if movie != None:
if movie != None and username != None:
movie_data['user_ratings'].append({
'username': username,
'liked': movie['liked'],
Expand All @@ -77,7 +78,7 @@ async def get_movie_data(url, session, movie, username):
print(f"Error processing data for {url}!")
return None

async def scrape_movies(movie_list: list, username: str):
async def scrape_movies(movie_list: list, username: str) -> List[Dict[str, Any]]:
url = "https://letterboxd.com/film/{}/"

async with ClientSession() as session:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import aiohttp
from lxml import html
from typing import List, Dict, Any
from playwright.async_api import async_playwright
import re

PAGES_PER_BATCH = 30
BATCH_DELAY = 0.5
Expand Down Expand Up @@ -59,4 +61,36 @@ async def _fetch_page(session, url):
"rating": rating
})

return film_data

async def scrape_popular_pages(num_pages: int) -> List[Dict[str, Any]]:
    """Scrapes Letterboxd by most popular movies.

    WARNING: Very slow. Each page is loaded in a Playwright-driven Chromium
    browser, one page after another, with a ``BATCH_DELAY`` pause between
    consecutive pages. We may need to consider another method.

    Args:
        num_pages: Number of popular-film pages to scrape, starting from
            the first page. Values below 1 scrape nothing.

    Returns:
        One ``{"film_slug": slug}`` dict for every ``data-film-slug``
        attribute found in the loaded pages, in page order.
    """
    # Nothing was requested: avoid launching a browser at all.
    if num_pages < 1:
        return []

    async def _fetch_page(page, url):
        """Loads ``url`` and returns its HTML once a poster container exists."""
        await page.goto(url)
        await page.wait_for_selector('.poster-container')
        return await page.content()

    base_url = "https://letterboxd.com/films/popular/"
    # The first page lives at the base URL; later pages use /page/<n>/.
    urls = [base_url] + [f"{base_url}page/{page_num}/"
                         for page_num in range(2, num_pages + 1)]
    slug_pattern = re.compile(r'data-film-slug="([^"]+)"')
    last_index = len(urls) - 1

    film_data = []
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            for index, url in enumerate(urls):
                content = await _fetch_page(page, url)
                film_data.extend(
                    {"film_slug": slug}
                    for slug in slug_pattern.findall(content)
                )

                # Pause between pages, but not after the final one.
                if index < last_index:
                    await asyncio.sleep(BATCH_DELAY)
        finally:
            # Close the browser even if navigation or the selector wait fails.
            await browser.close()

    return film_data
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
aiohttp==3.10.5
flask==3.0.3
lxml==4.9.4
playwright==1.46.0
pymongo==4.8.0
pytest==8.3.2
pytest-asyncio==0.24.0
Expand Down

0 comments on commit 1161a34

Please sign in to comment.