Commit

Added function to get best-selling books from NYT
EmmaTellblom committed Dec 9, 2023
1 parent 2dbf9f0 commit 0c5c3e9
Showing 5 changed files with 100 additions and 6 deletions.
3 changes: 2 additions & 1 deletion config_sample.py
@@ -1 +1,2 @@
-USER_ID=19514847
+USER_ID=19514847
+NYT_KEY='123123'
20 changes: 17 additions & 3 deletions format_data.py
@@ -13,7 +13,6 @@ def format_ratings(book_data):

    choices = [5, 4, 3, 2, 1]
    book_data['My_Rating'] = np.select(conditions, choices, default=None)

    return book_data

# Put the to-read and read shelves into one dataframe to match the Goodreads export
@@ -23,10 +22,21 @@ def create_combined_list(books_have_read, books_to_read):
    books_all_shelves = pd.concat([books_have_read, books_to_read], ignore_index=True)
    return books_all_shelves

def combine_bestseller_total(booklist, bestseller):
    # Drop best sellers that already appear in the booklist (duplicate Book_Id)
    booklist = pd.DataFrame(booklist)
    bestseller = pd.DataFrame(bestseller)
    # Find Book_Ids present in both dataframes
    duplicates = pd.merge(booklist[['Book_Id']], bestseller[['Book_Id']], how='inner', on='Book_Id')
    bestseller = bestseller[~bestseller['Book_Id'].isin(duplicates['Book_Id'])]
    # Combine the lists
    books_all_shelves = pd.concat([booklist, bestseller], ignore_index=True)
    return books_all_shelves
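A minimal sketch of the dedup behaviour, for reference (the tiny frames and Book_Id values below are made up for illustration):

    from format_data import combine_bestseller_total
    import pandas as pd

    booklist = pd.DataFrame({'Book_Id': [1, 2], 'Book_Title': ['A', 'B']})
    bestseller = pd.DataFrame({'Book_Id': [2, 3], 'Book_Title': ['B', 'C']})
    combined = combine_bestseller_total(booklist, bestseller)
    # The bestseller copy of Book_Id 2 is dropped; the booklist copy is kept
    print(combined['Book_Id'].tolist())  # [1, 2, 3]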

def save_to_csv(book_data):
    # Add any missing columns so the output matches the Goodreads export
-    columns_to_ensure = ['Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'Publisher', 'Binding', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies']
-    for col in columns_to_ensure:
+    columns_to_include = ['Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'Publisher', 'Binding', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies']
+    for col in columns_to_include:
        if col not in book_data.columns:
            book_data[col] = np.nan

@@ -42,3 +52,7 @@ def save_to_csv(book_data):

    # Save to CSV
    ordered_book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=',', index=False)

def save_to_csv_as_is(booklist):
    booklist = pd.DataFrame(booklist)
    booklist.to_csv('validate_data.csv', encoding='utf-8', sep=',', index=False)
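save_to_csv_as_is appears intended as a debugging aid; a minimal usage sketch (the row below is a made-up [isbn13, title, author] triple of the shape get_books() returns):

    from format_data import save_to_csv_as_is

    rows = [['9780000000000', 'Example Title', 'Example Author']]
    save_to_csv_as_is(rows)  # writes validate_data.csv for manual inspection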
24 changes: 24 additions & 0 deletions get_bestseller.py
@@ -0,0 +1,24 @@
from config import NYT_KEY
import requests
import pandas as pd
from get_generic_book import fetch_book_data_from_isbn13
from format_data import save_to_csv, save_to_csv_as_is

# Build the request URL for the New York Times Best Sellers overview
def set_parameters():
    base_nyt_url = 'https://api.nytimes.com/svc/books/v3/lists/full-overview.json'
    nyt_url = f'{base_nyt_url}?api-key={NYT_KEY}'
    return nyt_url

# Get the books from the API
def get_books(nyt_url):
    best_sellers = []
    response = requests.get(nyt_url)
    if response.status_code == 200:  # Proceed only if the request succeeded
        data = response.json()
        for document in data['results']['lists']:  # Walk the nested best-seller lists
            book_info = document.get('books', [])  # Get the book data
            for b in book_info:
                # Keep the ISBN13, title, and author from the NYT API
                best_sellers.append([b.get('primary_isbn13'), (b.get('title') or '').title(), b.get('author')])
    return best_sellers[:100]  # Return only the first 100 books
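A minimal usage sketch (assumes a valid NYT_KEY in config.py and network access; the printed count is illustrative):

    from get_bestseller import set_parameters, get_books

    nyt_url = set_parameters()
    books = get_books(nyt_url)  # list of [isbn13, title, author]; empty on a non-200 response
    print(f'Fetched {len(books)} best sellers')  # at most 100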
44 changes: 43 additions & 1 deletion get_generic_book.py
@@ -2,6 +2,8 @@
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, parse_qs
import re

#####################################################
## This file is to get book-data from the ##
@@ -48,4 +50,44 @@ def get_generic_book_data(booklist):  # Initiate the fetching of book-data
    additional_info = pd.DataFrame(results)
    booklist = pd.concat([booklist, additional_info], axis=1)

-    return booklist
+    return booklist

# Get Goodreads book data from the ISBN13s returned by the NYT API
def fetch_book_data_from_isbn13(book_list):
    book_df = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author', 'Year_Published', 'Exclusive_Shelf', 'Number_of_Pages', 'Genres'])
    base_url = 'https://www.goodreads.com/search?q='
    for b in book_list:
        isbn13_url = f'{base_url}{b[0]}'  # ISBN13 from the booklist
        page = requests.get(isbn13_url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Get the book_id
        final_url = page.url  # Redirected URL
        parsed_url = urlparse(final_url)
        path_segments = parsed_url.path.split('/')
        book_id = path_segments[-1]  # The book ID is the last path segment
        book_id = book_id.split('-')[0]  # Strip the title slug after the numeric ID

        # Extract genres
        genre_links = soup.select('a[href*="/genres/"]')
        book_genres = [link.text.strip() for link in genre_links]

        # Extract published year and number of pages
        publication_info_element = soup.find('p', {'data-testid': 'publicationInfo'})
        year_published = None
        if publication_info_element:
            publication_info_text = publication_info_element.text.strip()
            try:
                year_published = int(publication_info_text.split()[-1])
            except (ValueError, IndexError):
                year_published = None
        pages_element = soup.find('p', {'data-testid': 'pagesFormat'})
        pages_text = pages_element.text.strip() if pages_element else ''
        try:
            number_of_pages = int(pages_text.split()[0])
        except (ValueError, IndexError):
            number_of_pages = None
        # Join genres into a comma-separated string
        genres_string = ', '.join(book_genres)
        book_df.loc[len(book_df)] = {'Book_Id': book_id, 'Book_Title': b[1], 'Author': b[2], 'Year_Published': year_published,
                                     'Exclusive_Shelf': 'best-seller', 'Number_of_Pages': number_of_pages,
                                     'Genres': genres_string}

    return book_df
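A minimal usage sketch (the ISBN13 below is for illustration; each row triggers a live Goodreads request, so expect this to be slow for long lists):

    from get_generic_book import fetch_book_data_from_isbn13

    sample = [['9780316769488', 'The Catcher In The Rye', 'J.D. Salinger']]
    df = fetch_book_data_from_isbn13(sample)
    print(df[['Book_Id', 'Book_Title', 'Year_Published']])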
15 changes: 14 additions & 1 deletion main.py
@@ -1,8 +1,13 @@
from get_user_books import set_url
-from format_data import create_combined_list, format_ratings, save_to_csv
+from format_data import create_combined_list, format_ratings, save_to_csv, combine_bestseller_total
from get_generic_book import get_generic_book_data
from get_bestseller import set_parameters, get_books, fetch_book_data_from_isbn13
from config import USER_ID

# Set this to True to collect the top 100 NYT best sellers,
# used later for recommending books from the best-seller list
getNYbooks = True

books_read, books_to_read = set_url(USER_ID)

# Create a combined dataframe with the books
@@ -14,5 +19,13 @@
# Add the generic book-data to the dataframe
all_books = get_generic_book_data(all_books)

# Optionally fetch the NYT best sellers
if getNYbooks:
    print('Getting NYT best sellers')
    nyt_url = set_parameters()
    nyt_books = get_books(nyt_url)
    nyt_books = fetch_book_data_from_isbn13(nyt_books)
    all_books = combine_bestseller_total(all_books, nyt_books)

# Save to CSV
save_to_csv(all_books)
