
Commit

Fixed some bugs
EmmaTellblom committed Nov 20, 2023
1 parent bf5ada6 commit 2dbf9f0
Showing 3 changed files with 40 additions and 11 deletions.
19 changes: 18 additions & 1 deletion format_data.py
@@ -24,4 +24,21 @@ def create_combined_list(books_have_read, books_to_read):
    return books_all_shelves

def save_to_csv(book_data):
    book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=';', index=False)
    # Add columns to make sure they match goodreads export
    columns_to_ensure = ['Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'Publisher', 'Binding', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies']
    for col in columns_to_ensure:
        if col not in book_data.columns:
            book_data[col] = np.nan

    # Arrange columns in the desired order
    desired_order = ['Book_Id', 'Book_Title', 'Author', 'Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13',
                     'My_Rating', 'Average_Rating', 'Publisher', 'Binding', 'Number_of_Pages', 'Year_Published',
                     'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves',
                     'Bookshelves_with_positions', 'Exclusive_Shelf', 'My_Review', 'Spoiler', 'Private_Notes',
                     'Read_Count', 'Owned_Copies', 'Genres']

    # Reorder columns
    ordered_book_data = book_data[desired_order]

    # Save to CSV
    ordered_book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=',', index=False)
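
A minimal sketch (not part of the commit) of how the updated save_to_csv could be exercised. It assumes format_data.py imports pandas as pd and numpy as np (the imports sit outside the shown hunk), and the sample row is hypothetical. Note that every column in desired_order that is not listed in columns_to_ensure must already be present on the frame, otherwise the reorder raises a KeyError.

import pandas as pd
from format_data import save_to_csv  # assumes the module is importable from the working directory

# Hypothetical frame holding only the columns the scraper itself fills;
# save_to_csv adds the remaining Goodreads columns as NaN, reorders them,
# and writes goodreads_python_export.csv with ',' as the separator.
sample = pd.DataFrame([{
    'Book_Id': '12345',
    'Book_Title': 'Example Title',
    'Author': 'Example Author',
    'My_Rating': '4',
    'Average_Rating': '4.02',
    'Number_of_Pages': 320,
    'Exclusive_Shelf': 'read',
    'Genres': 'Fiction, Classics',
}])
save_to_csv(sample)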
25 changes: 18 additions & 7 deletions get_generic_book.py
@@ -9,25 +9,36 @@
## is collected in file get_user_books.py ##
#####################################################


def fetch_book_data(book_id):
    base_url = 'https://www.goodreads.com/book/show/'
    book_url = f'{base_url}{book_id}'
    page = requests.get(book_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract genres
    book_genres = [a.text for a in soup.select('a[href*="/genres/"]')]
    genre_links = soup.select('a[href*="/genres/"]')
    book_genres = [link.text.strip() for link in genre_links]

    # Extract published year and number of pages
    publication_info_element = soup.find('p', {'data-testid': 'publicationInfo'})
    publication_info_text = publication_info_element.text.strip()
    published_year = int(publication_info_text.split()[-1]) if publication_info_text else None
    year_published = None
    if publication_info_element:
        publication_info_text = publication_info_element.text.strip()
        year_published = int(publication_info_text.split()[-1]) if publication_info_text else None

    pages_element = soup.find('p', {'data-testid': 'pagesFormat'})
    pages_text = pages_element.text.strip()
    number_of_pages = int(pages_text.split()[0]) if pages_text else None
    pages_text = pages_element.text.strip() if pages_element else ''

    try:
        number_of_pages = int(pages_text.split()[0])
    except (ValueError, IndexError):
        number_of_pages = None

    # Join genres into a comma-separated string
    genres_string = ', '.join(book_genres)

    return {'Number_of_Pages': number_of_pages, 'Published_Year': published_year, 'Genres': [book_genres]}
    return {'Number_of_Pages': number_of_pages, 'Year_Published': year_published, 'Genres': genres_string}

def get_generic_book_data(booklist): # Initiate the fetching of book-data
    book_ids = booklist['Book_Id'].tolist()
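The rest of get_generic_book_data is collapsed above, so the following is a hedged sketch (an assumption, not the commit's actual code) of how fetch_book_data is presumably consumed: one call per Book_Id, with the resulting columns merged back onto the user's booklist. The function name with the _sketch suffix is hypothetical.

import pandas as pd
from get_generic_book import fetch_book_data  # assumes the module layout shown above

def get_generic_book_data_sketch(booklist):
    # One dict per book: Number_of_Pages, Year_Published, Genres
    details = [fetch_book_data(book_id) for book_id in booklist['Book_Id'].tolist()]
    details_df = pd.DataFrame(details)
    # Attach the fetched columns to the user's booklist row by row
    return pd.concat([booklist.reset_index(drop=True), details_df], axis=1)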
7 changes: 4 additions & 3 deletions get_user_books.py
@@ -36,7 +36,7 @@ def get_no_of_pages(book_url): # Get number of pages needed to paginate through
    return number_of_pages # No of pages to paginate

def get_user_book_info(book_url, number_of_pages):
    books = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author', 'Average_Rating', 'My_Rating'])
    books = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author','Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'My_Rating', 'Average_Rating', 'Publisher', 'Binding', 'Number_of_Pages', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'Exclusive_Shelf', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies', 'Genres'])
    dataframes = []
    for i in range(1,number_of_pages+1):
        book_url_pages = book_url + f'&page={i}' # Create pagination URL
@@ -48,7 +48,8 @@ def get_user_book_info(book_url, number_of_pages):
            info = {
                'Book_Id': link['href'].split('/')[-1].split('-')[0].split('.')[0],
                'Book_Title': link.get('title'),
                'Author': link.find_next('td', class_='field author').find('a').text.strip(),
                'Author': link.find_next('td', class_='field author').find('a').text.strip().replace('"', ''),
                #'Author': link.find_next('td', class_='field author').find('a').text.strip(),
                'Average_Rating': link.find_next('td', class_='field avg_rating').find('div', class_='value').text.strip(),
            }
            # Check if 'user_rating' element is present
@@ -65,7 +66,7 @@ def get_user_book_info(book_url, number_of_pages):

    # Concatenate all DataFrames in the list
    books = pd.concat(dataframes, ignore_index=True)

    print('I have collected user books')
    # Drop rows with None values in the 'Book_Title' column because of duplicates
    books = books.dropna(subset=['Book_Title'])
    books = books.reset_index(drop=True)
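Taken together, the three changed files suggest a pipeline along these lines. The wiring and the shelf URLs are assumptions (the entry point is not part of this commit); only the imported function names come from the repository.

from get_user_books import get_no_of_pages, get_user_book_info
from get_generic_book import get_generic_book_data
from format_data import create_combined_list, save_to_csv

# Placeholder shelf URLs; the real script presumably builds these from a user id.
read_url = 'https://www.goodreads.com/review/list/USER_ID?shelf=read'
to_read_url = 'https://www.goodreads.com/review/list/USER_ID?shelf=to-read'

books_read = get_user_book_info(read_url, get_no_of_pages(read_url))
books_to_read = get_user_book_info(to_read_url, get_no_of_pages(to_read_url))

all_books = create_combined_list(books_read, books_to_read)
all_books = get_generic_book_data(all_books)  # assumed to return the booklist with the fetched columns attached
save_to_csv(all_books)                        # writes goodreads_python_export.csv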
