Commit

Added function to get best-selling books from NYT
EmmaTellblom committed Dec 9, 2023
1 parent 2dbf9f0 commit 0c5c3e9
Showing 5 changed files with 100 additions and 6 deletions.
3 changes: 2 additions & 1 deletion config_sample.py
@@ -1 +1,2 @@
-USER_ID=19514847
+USER_ID=19514847
+NYT_KEY='123123'
20 changes: 17 additions & 3 deletions format_data.py
@@ -13,7 +13,6 @@ def format_ratings(book_data):

    choices = [5, 4, 3, 2, 1]
    book_data['My_Rating'] = np.select(conditions, choices, default=None)

    return book_data

# Put the to-read and read shelves into one dataframe to match the Goodreads export
@@ -23,10 +22,21 @@ def create_combined_list(books_have_read, books_to_read):
    books_all_shelves = pd.concat([books_have_read, books_to_read], ignore_index=True)
    return books_all_shelves

def combine_bestseller_total(booklist, bestseller):
    # Drop best sellers that already appear in the booklist (duplicate Book_Id)
    booklist = pd.DataFrame(booklist)
    bestseller = pd.DataFrame(bestseller)
    # Find Book_Ids present in both dataframes
    duplicates = pd.merge(booklist[['Book_Id']], bestseller[['Book_Id']], how='inner', on='Book_Id')
    bestseller = bestseller[~bestseller['Book_Id'].isin(duplicates['Book_Id'])]
    # Combine the lists
    books_all_shelves = pd.concat([booklist, bestseller], ignore_index=True)
    return books_all_shelves
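A minimal sketch of the dedup behaviour, for reference (the tiny frames and Book_Id values below are made up for illustration):

    from format_data import combine_bestseller_total
    import pandas as pd

    booklist = pd.DataFrame({'Book_Id': [1, 2], 'Book_Title': ['A', 'B']})
    bestseller = pd.DataFrame({'Book_Id': [2, 3], 'Book_Title': ['B', 'C']})
    combined = combine_bestseller_total(booklist, bestseller)
    # The bestseller copy of Book_Id 2 is dropped; the booklist copy is kept
    print(combined['Book_Id'].tolist())  # [1, 2, 3]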

def save_to_csv(book_data):
    # Add any missing columns so the output matches the Goodreads export
-    columns_to_ensure = ['Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'Publisher', 'Binding', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies']
-    for col in columns_to_ensure:
+    columns_to_include = ['Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'Publisher', 'Binding', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies']
+    for col in columns_to_include:
        if col not in book_data.columns:
            book_data[col] = np.nan

@@ -42,3 +52,7 @@ def save_to_csv(book_data):

    # Save to CSV
    ordered_book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=',', index=False)

def save_to_csv_as_is(booklist):
    booklist = pd.DataFrame(booklist)
    booklist.to_csv('validate_data.csv', encoding='utf-8', sep=',', index=False)
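save_to_csv_as_is appears intended as a debugging aid; a minimal usage sketch (the row below is a made-up [isbn13, title, author] triple of the shape get_books() returns):

    from format_data import save_to_csv_as_is

    rows = [['9780000000000', 'Example Title', 'Example Author']]
    save_to_csv_as_is(rows)  # writes validate_data.csv for manual inspection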
24 changes: 24 additions & 0 deletions get_bestseller.py
@@ -0,0 +1,24 @@
from config import NYT_KEY
import requests
import pandas as pd
from get_generic_book import fetch_book_data_from_isbn13
from format_data import save_to_csv, save_to_csv_as_is

# Build the request URL for the New York Times Best Sellers overview
def set_parameters():
    base_nyt_url = 'https://api.nytimes.com/svc/books/v3/lists/full-overview.json'
    nyt_url = f'{base_nyt_url}?api-key={NYT_KEY}'
    return nyt_url

# Get the books from the API
def get_books(nyt_url):
    best_sellers = []
    response = requests.get(nyt_url)
    if response.status_code == 200:  # Proceed only if the request succeeded
        data = response.json()
        for document in data['results']['lists']:  # Walk the nested best-seller lists
            book_info = document.get('books', [])  # Get the book data
            for b in book_info:
                # Keep the ISBN13, title, and author from the NYT API
                best_sellers.append([b.get('primary_isbn13'), (b.get('title') or '').title(), b.get('author')])
    return best_sellers[:100]  # Return only the first 100 books
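A minimal usage sketch (assumes a valid NYT_KEY in config.py and network access; the printed count is illustrative):

    from get_bestseller import set_parameters, get_books

    nyt_url = set_parameters()
    books = get_books(nyt_url)  # list of [isbn13, title, author]; empty on a non-200 response
    print(f'Fetched {len(books)} best sellers')  # at most 100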
44 changes: 43 additions & 1 deletion get_generic_book.py
@@ -2,6 +2,8 @@
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, parse_qs
import re

#####################################################
## This file is to get book-data from the ##
@@ -48,4 +50,44 @@ def get_generic_book_data(booklist):  # Initiate the fetching of book-data
    additional_info = pd.DataFrame(results)
    booklist = pd.concat([booklist, additional_info], axis=1)

-    return booklist
+    return booklist

# Get Goodreads book data from the ISBN13s returned by the NYT API
def fetch_book_data_from_isbn13(book_list):
    book_df = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author', 'Year_Published', 'Exclusive_Shelf', 'Number_of_Pages', 'Genres'])
    base_url = 'https://www.goodreads.com/search?q='
    for b in book_list:
        isbn13_url = f'{base_url}{b[0]}'  # ISBN13 from the booklist
        page = requests.get(isbn13_url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Get the book_id
        final_url = page.url  # Redirected URL
        parsed_url = urlparse(final_url)
        path_segments = parsed_url.path.split('/')
        book_id = path_segments[-1]  # The book ID is the last path segment
        book_id = book_id.split('-')[0]  # Strip the title slug after the numeric ID

        # Extract genres
        genre_links = soup.select('a[href*="/genres/"]')
        book_genres = [link.text.strip() for link in genre_links]

        # Extract published year and number of pages
        publication_info_element = soup.find('p', {'data-testid': 'publicationInfo'})
        year_published = None
        if publication_info_element:
            publication_info_text = publication_info_element.text.strip()
            try:
                year_published = int(publication_info_text.split()[-1])
            except (ValueError, IndexError):
                year_published = None
        pages_element = soup.find('p', {'data-testid': 'pagesFormat'})
        pages_text = pages_element.text.strip() if pages_element else ''
        try:
            number_of_pages = int(pages_text.split()[0])
        except (ValueError, IndexError):
            number_of_pages = None
        # Join genres into a comma-separated string
        genres_string = ', '.join(book_genres)
        book_df.loc[len(book_df)] = {'Book_Id': book_id, 'Book_Title': b[1], 'Author': b[2], 'Year_Published': year_published,
                                     'Exclusive_Shelf': 'best-seller', 'Number_of_Pages': number_of_pages,
                                     'Genres': genres_string}

    return book_df
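A minimal usage sketch (the ISBN13 below is for illustration; each row triggers a live Goodreads request, so expect this to be slow for long lists):

    from get_generic_book import fetch_book_data_from_isbn13

    sample = [['9780316769488', 'The Catcher In The Rye', 'J.D. Salinger']]
    df = fetch_book_data_from_isbn13(sample)
    print(df[['Book_Id', 'Book_Title', 'Year_Published']])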
15 changes: 14 additions & 1 deletion main.py
@@ -1,8 +1,13 @@
from get_user_books import set_url
-from format_data import create_combined_list, format_ratings, save_to_csv
+from format_data import create_combined_list, format_ratings, save_to_csv, combine_bestseller_total
from get_generic_book import get_generic_book_data
from get_bestseller import set_parameters, get_books, fetch_book_data_from_isbn13
from config import USER_ID

# Set this to True to collect the top 100 NYT best sellers,
# used later for recommending books from the best-seller list
getNYbooks = True

books_read, books_to_read = set_url(USER_ID)

# Create a combined dataframe with the books
@@ -14,5 +19,13 @@
# Add the generic book-data to the dataframe
all_books = get_generic_book_data(all_books)

# Optionally fetch the NYT best sellers
if getNYbooks:
    print('Getting NYT best sellers')
    nyt_url = set_parameters()
    nyt_books = get_books(nyt_url)
    nyt_books = fetch_book_data_from_isbn13(nyt_books)
    all_books = combine_bestseller_total(all_books, nyt_books)

# Save to CSV
save_to_csv(all_books)
