diff --git a/format_data.py b/format_data.py
index 7e01bb5..e08264f 100644
--- a/format_data.py
+++ b/format_data.py
@@ -24,4 +24,21 @@ def create_combined_list(books_have_read, books_to_read):
     return books_all_shelves
 
 def save_to_csv(book_data):
-    book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=';', index=False)
\ No newline at end of file
+    # Add columns to make sure they match the Goodreads export
+    columns_to_ensure = ['Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'Publisher', 'Binding', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies']
+    for col in columns_to_ensure:
+        if col not in book_data.columns:
+            book_data[col] = np.nan
+
+    # Arrange columns in the desired order
+    desired_order = ['Book_Id', 'Book_Title', 'Author', 'Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13',
+                     'My_Rating', 'Average_Rating', 'Publisher', 'Binding', 'Number_of_Pages', 'Year_Published',
+                     'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves',
+                     'Bookshelves_with_positions', 'Exclusive_Shelf', 'My_Review', 'Spoiler', 'Private_Notes',
+                     'Read_Count', 'Owned_Copies', 'Genres']
+
+    # Reorder columns
+    ordered_book_data = book_data[desired_order]
+
+    # Save to CSV
+    ordered_book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=',', index=False)
diff --git a/get_generic_book.py b/get_generic_book.py
index a03f48c..4bc3a41 100644
--- a/get_generic_book.py
+++ b/get_generic_book.py
@@ -9,25 +9,36 @@
 ## is collected in file get_user_books.py        ##
 #####################################################
 
+
 def fetch_book_data(book_id):
     base_url = 'https://www.goodreads.com/book/show/'
     book_url = f'{base_url}{book_id}'
     page = requests.get(book_url)
     soup = BeautifulSoup(page.content, 'html.parser')
-    
+
     # Extract genres
-    book_genres = [a.text for a in soup.select('a[href*="/genres/"]')]
+    genre_links = soup.select('a[href*="/genres/"]')
+    book_genres = [link.text.strip() for link in genre_links]
 
     # Extract published year and number of pages
     publication_info_element = soup.find('p', {'data-testid': 'publicationInfo'})
-    publication_info_text = publication_info_element.text.strip()
-    published_year = int(publication_info_text.split()[-1]) if publication_info_text else None
+    year_published = None
+    if publication_info_element:
+        publication_info_text = publication_info_element.text.strip()
+        year_published = int(publication_info_text.split()[-1]) if publication_info_text else None
 
     pages_element = soup.find('p', {'data-testid': 'pagesFormat'})
-    pages_text = pages_element.text.strip()
-    number_of_pages = int(pages_text.split()[0]) if pages_text else None
+    pages_text = pages_element.text.strip() if pages_element else ''
+
+    try:
+        number_of_pages = int(pages_text.split()[0])
+    except (ValueError, IndexError):
+        number_of_pages = None
+
+    # Join genres into a comma-separated string
+    genres_string = ', '.join(book_genres)
 
-    return {'Number_of_Pages': number_of_pages, 'Published_Year': published_year, 'Genres': [book_genres]}
+    return {'Number_of_Pages': number_of_pages, 'Year_Published': year_published, 'Genres': genres_string}
 def get_generic_book_data(booklist):
     # Initiate the fetching of book-data
     book_ids = booklist['Book_Id'].tolist()
diff --git a/get_user_books.py b/get_user_books.py
index 315a7b3..10a18af 100644
--- a/get_user_books.py
+++ b/get_user_books.py
@@ -36,7 +36,7 @@ def get_no_of_pages(book_url): # Get number of pages needed to paginate through
     return number_of_pages # No of pages to paginate
 
 def get_user_book_info(book_url, number_of_pages):
-    books = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author', 'Average_Rating', 'My_Rating'])
+    books = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author', 'Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'My_Rating', 'Average_Rating', 'Publisher', 'Binding', 'Number_of_Pages', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'Exclusive_Shelf', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies', 'Genres'])
     dataframes = []
     for i in range(1,number_of_pages+1):
         book_url_pages = book_url + f'&page={i}' # Create pagination URL
@@ -48,7 +48,8 @@ def get_user_book_info(book_url, number_of_pages):
             info = {
                 'Book_Id': link['href'].split('/')[-1].split('-')[0].split('.')[0],
                 'Book_Title': link.get('title'),
-                'Author': link.find_next('td', class_='field author').find('a').text.strip(),
+                'Author': link.find_next('td', class_='field author').find('a').text.strip().replace('"', ''),
+                #'Author': link.find_next('td', class_='field author').find('a').text.strip(),
                 'Average_Rating': link.find_next('td', class_='field avg_rating').find('div', class_='value').text.strip(),
             }
             # Check if 'user_rating' element is present
@@ -65,7 +66,7 @@ def get_user_book_info(book_url, number_of_pages):
     # Concatenate all DataFrames in the list
     books = pd.concat(dataframes, ignore_index=True)
-    
+    print('I have collected user books')
     # Drop rows with None values in the 'title' column because of duplicates
     books = books.dropna(subset=['Book_Title'])
     books = books.reset_index(drop=True)
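
Note: one quick way to exercise the changed paths end to end is a small driver script. This is a sketch, not part of the diff: it assumes format_data.py already imports pandas as pd and numpy as np (the new save_to_csv relies on np.nan), that get_generic_book.py imports requests and BeautifulSoup above the hunk shown, and the book id and title below are arbitrary examples.

    import pandas as pd

    from get_generic_book import fetch_book_data
    from format_data import save_to_csv

    # Scrape one book page; the id is just an example.
    book = fetch_book_data('11870085')

    # The refactored fetch_book_data should degrade gracefully: Genres is now a
    # comma-separated string, and the numeric fields fall back to None when the
    # page elements are missing or unparsable.
    assert isinstance(book['Genres'], str)
    assert book['Number_of_Pages'] is None or isinstance(book['Number_of_Pages'], int)
    assert book['Year_Published'] is None or isinstance(book['Year_Published'], int)

    # Build a one-row frame with the columns get_user_book_info would normally
    # supply; save_to_csv back-fills the remaining Goodreads-export columns with
    # NaN, reorders them, and writes a comma-separated goodreads_python_export.csv.
    row = {'Book_Id': '11870085', 'Book_Title': 'Example Title', 'Author': 'Example Author',
           'My_Rating': 5, 'Average_Rating': '4.15', 'Exclusive_Shelf': 'read'}
    row.update(book)  # adds Number_of_Pages, Year_Published, Genres
    save_to_csv(pd.DataFrame([row]))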