
Commit

Fixed some bugs
EmmaTellblom committed Nov 20, 2023
1 parent bf5ada6 commit 2dbf9f0
Showing 3 changed files with 40 additions and 11 deletions.
19 changes: 18 additions & 1 deletion format_data.py
@@ -24,4 +24,21 @@ def create_combined_list(books_have_read, books_to_read):
    return books_all_shelves

def save_to_csv(book_data):
    book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=';', index=False)
    # Add columns to make sure they match goodreads export
    columns_to_ensure = ['Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'Publisher', 'Binding', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies']
    for col in columns_to_ensure:
        if col not in book_data.columns:
            book_data[col] = np.nan

    # Arrange columns in the desired order
    desired_order = ['Book_Id', 'Book_Title', 'Author', 'Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13',
                     'My_Rating', 'Average_Rating', 'Publisher', 'Binding', 'Number_of_Pages', 'Year_Published',
                     'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves',
                     'Bookshelves_with_positions', 'Exclusive_Shelf', 'My_Review', 'Spoiler', 'Private_Notes',
                     'Read_Count', 'Owned_Copies', 'Genres']

    # Reorder columns
    ordered_book_data = book_data[desired_order]

    # Save to CSV
    ordered_book_data.to_csv('goodreads_python_export.csv', encoding='utf-8', sep=',', index=False)
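
A minimal sketch (not part of the commit) of how the updated save_to_csv could be exercised. It assumes format_data.py imports pandas as pd and numpy as np (the imports sit outside the shown hunk), and the sample row is hypothetical. Note that every column in desired_order that is not listed in columns_to_ensure must already be present on the frame, otherwise the reorder raises a KeyError.

import pandas as pd
from format_data import save_to_csv  # assumes the module is importable from the working directory

# Hypothetical frame holding only the columns the scraper itself fills;
# save_to_csv adds the remaining Goodreads columns as NaN, reorders them,
# and writes goodreads_python_export.csv with ',' as the separator.
sample = pd.DataFrame([{
    'Book_Id': '12345',
    'Book_Title': 'Example Title',
    'Author': 'Example Author',
    'My_Rating': '4',
    'Average_Rating': '4.02',
    'Number_of_Pages': 320,
    'Exclusive_Shelf': 'read',
    'Genres': 'Fiction, Classics',
}])
save_to_csv(sample)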
25 changes: 18 additions & 7 deletions get_generic_book.py
@@ -9,25 +9,36 @@
## is collected in file get_user_books.py ##
#####################################################


def fetch_book_data(book_id):
    base_url = 'https://www.goodreads.com/book/show/'
    book_url = f'{base_url}{book_id}'
    page = requests.get(book_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract genres
    book_genres = [a.text for a in soup.select('a[href*="/genres/"]')]
    genre_links = soup.select('a[href*="/genres/"]')
    book_genres = [link.text.strip() for link in genre_links]

    # Extract published year and number of pages
    publication_info_element = soup.find('p', {'data-testid': 'publicationInfo'})
    publication_info_text = publication_info_element.text.strip()
    published_year = int(publication_info_text.split()[-1]) if publication_info_text else None
    year_published = None
    if publication_info_element:
        publication_info_text = publication_info_element.text.strip()
        year_published = int(publication_info_text.split()[-1]) if publication_info_text else None

    pages_element = soup.find('p', {'data-testid': 'pagesFormat'})
    pages_text = pages_element.text.strip()
    number_of_pages = int(pages_text.split()[0]) if pages_text else None
    pages_text = pages_element.text.strip() if pages_element else ''

    try:
        number_of_pages = int(pages_text.split()[0])
    except (ValueError, IndexError):
        number_of_pages = None

    # Join genres into a comma-separated string
    genres_string = ', '.join(book_genres)

    return {'Number_of_Pages': number_of_pages, 'Published_Year': published_year, 'Genres': [book_genres]}
    return {'Number_of_Pages': number_of_pages, 'Year_Published': year_published, 'Genres': genres_string}

def get_generic_book_data(booklist): # Initiate the fetching of book-data
    book_ids = booklist['Book_Id'].tolist()
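The rest of get_generic_book_data is collapsed above, so the following is a hedged sketch (an assumption, not the commit's actual code) of how fetch_book_data is presumably consumed: one call per Book_Id, with the resulting columns merged back onto the user's booklist. The function name with the _sketch suffix is hypothetical.

import pandas as pd
from get_generic_book import fetch_book_data  # assumes the module layout shown above

def get_generic_book_data_sketch(booklist):
    # One dict per book: Number_of_Pages, Year_Published, Genres
    details = [fetch_book_data(book_id) for book_id in booklist['Book_Id'].tolist()]
    details_df = pd.DataFrame(details)
    # Attach the fetched columns to the user's booklist row by row
    return pd.concat([booklist.reset_index(drop=True), details_df], axis=1)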
7 changes: 4 additions & 3 deletions get_user_books.py
@@ -36,7 +36,7 @@ def get_no_of_pages(book_url): # Get number of pages needed to paginate through
    return number_of_pages # No of pages to paginate

def get_user_book_info(book_url, number_of_pages):
    books = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author', 'Average_Rating', 'My_Rating'])
    books = pd.DataFrame(columns=['Book_Id', 'Book_Title', 'Author','Author_l-f', 'Additional_Authors', 'ISBN', 'ISBN13', 'My_Rating', 'Average_Rating', 'Publisher', 'Binding', 'Number_of_Pages', 'Year_Published', 'Original_Publication_Year', 'Date_Read', 'Date_Added', 'Bookshelves', 'Bookshelves_with_positions', 'Exclusive_Shelf', 'My_Review', 'Spoiler', 'Private_Notes', 'Read_Count', 'Owned_Copies', 'Genres'])
    dataframes = []
    for i in range(1,number_of_pages+1):
        book_url_pages = book_url + f'&page={i}' # Create pagination URL
@@ -48,7 +48,8 @@ def get_user_book_info(book_url, number_of_pages):
            info = {
                'Book_Id': link['href'].split('/')[-1].split('-')[0].split('.')[0],
                'Book_Title': link.get('title'),
                'Author': link.find_next('td', class_='field author').find('a').text.strip(),
                'Author': link.find_next('td', class_='field author').find('a').text.strip().replace('"', ''),
                #'Author': link.find_next('td', class_='field author').find('a').text.strip(),
                'Average_Rating': link.find_next('td', class_='field avg_rating').find('div', class_='value').text.strip(),
            }
            # Check if 'user_rating' element is present
@@ -65,7 +66,7 @@ def get_user_book_info(book_url, number_of_pages):

    # Concatenate all DataFrames in the list
    books = pd.concat(dataframes, ignore_index=True)

    print('I have collected user books')
    # Drop rows with None values in the 'Book_Title' column because of duplicates
    books = books.dropna(subset=['Book_Title'])
    books = books.reset_index(drop=True)
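Taken together, the three changed files suggest a pipeline along these lines. The wiring and the shelf URLs are assumptions (the entry point is not part of this commit); only the imported function names come from the repository.

from get_user_books import get_no_of_pages, get_user_book_info
from get_generic_book import get_generic_book_data
from format_data import create_combined_list, save_to_csv

# Placeholder shelf URLs; the real script presumably builds these from a user id.
read_url = 'https://www.goodreads.com/review/list/USER_ID?shelf=read'
to_read_url = 'https://www.goodreads.com/review/list/USER_ID?shelf=to-read'

books_read = get_user_book_info(read_url, get_no_of_pages(read_url))
books_to_read = get_user_book_info(to_read_url, get_no_of_pages(to_read_url))

all_books = create_combined_list(books_read, books_to_read)
all_books = get_generic_book_data(all_books)  # assumed to return the booklist with the fetched columns attached
save_to_csv(all_books)                        # writes goodreads_python_export.csv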
