Added pagination in web scraping code
tanvi0909 committed Nov 7, 2024
1 parent 9483c5f commit ff84fdb
Showing 1 changed file with 43 additions and 33 deletions.
76 changes: 43 additions & 33 deletions web scraping in python/quotes.toscrape.com/scrap.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -18,47 +18,57 @@
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"\n",
"def scrape_quotes():\n",
" # URL of the site to scrape\n",
" url = 'http://quotes.toscrape.com/'\n",
"def scrape_quotes(max_pages=10): # max_pages parameter to limit the number of pages\n",
" # Base URL of the site to scrape\n",
" base_url = 'http://quotes.toscrape.com/page/'\n",
" \n",
" # Send a GET request to fetch the webpage content\n",
" response = requests.get(url)\n",
" # Lists to hold quotes and authors\n",
" quotes_list = []\n",
" authors_list = []\n",
" \n",
" page = 1 # Start with the first page\n",
" while page <= max_pages: # Limit the loop to max_pages\n",
" # Send a GET request to fetch the webpage content of the current page\n",
" url = f'{base_url}{page}/'\n",
" response = requests.get(url)\n",
"\n",
" # Check if the request was successful\n",
" if response.status_code == 200:\n",
" # Parse the webpage content\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
" # Check if the request was successful\n",
" if response.status_code == 200:\n",
" # Parse the webpage content\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # Find all quote elements\n",
" quotes = soup.find_all('div', class_='quote')\n",
" # Find all quote elements on the current page\n",
" quotes = soup.find_all('div', class_='quote')\n",
"\n",
" # Lists to hold quotes and authors\n",
" quotes_list = []\n",
" authors_list = []\n",
" # If no quotes are found, break the loop (end of pages)\n",
" if not quotes:\n",
" break\n",
"\n",
" # Loop through the quotes and store the text and author\n",
" for quote in quotes:\n",
" text = quote.find('span', class_='text').text\n",
" author = quote.find('small', class_='author').text\n",
" quotes_list.append(text)\n",
" authors_list.append(author)\n",
" # Loop through the quotes and store the text and author\n",
" for quote in quotes:\n",
" text = quote.find('span', class_='text').text\n",
" author = quote.find('small', class_='author').text\n",
" quotes_list.append(text)\n",
" authors_list.append(author)\n",
"\n",
" # Create a DataFrame\n",
" quotes_df = pd.DataFrame({\n",
" 'Quote': quotes_list,\n",
" 'Author': authors_list\n",
" })\n",
" # Move to the next page\n",
" page += 1\n",
" else:\n",
" print(f'Failed to retrieve webpage. Status code: {response.status_code}')\n",
" break\n",
"\n",
" # Write the DataFrame to an Excel file\n",
" quotes_df.to_excel('quotes.xlsx', index=False)\n",
" print(\"Quotes have been written to quotes.xlsx\")\n",
" # Create a DataFrame\n",
" quotes_df = pd.DataFrame({\n",
" 'Quote': quotes_list,\n",
" 'Author': authors_list\n",
" })\n",
"\n",
" else:\n",
" print(f'Failed to retrieve webpage. Status code: {response.status_code}')\n",
" # Write the DataFrame to an Excel file\n",
" quotes_df.to_excel('quotes.xlsx', index=False)\n",
" print(\"Quotes have been written to quotes.xlsx\")\n",
"\n",
"# Run the scraper\n",
"scrape_quotes()"
"# Run the scraper with page limit\n",
"scrape_quotes(max_pages=10)"
]
}
],
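For readability, here is the updated cell assembled from the `+` lines of the diff as plain Python. The only assumption beyond the diff is the `import requests` line, which presumably sits in the collapsed context above the second hunk since the code calls `requests.get`. Note that `DataFrame.to_excel` needs an Excel engine such as openpyxl installed, and that quotes.toscrape.com returns status 200 with an empty quote list past its last page, which is what the `if not quotes` check relies on.

# Post-commit cell, assembled from the diff's added lines.
# Assumption: `import requests` comes from the collapsed context above the hunk.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_quotes(max_pages=10):  # max_pages parameter to limit the number of pages
    # Base URL of the site to scrape
    base_url = 'http://quotes.toscrape.com/page/'

    # Lists to hold quotes and authors
    quotes_list = []
    authors_list = []

    page = 1  # Start with the first page
    while page <= max_pages:  # Limit the loop to max_pages
        # Send a GET request to fetch the webpage content of the current page
        url = f'{base_url}{page}/'
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the webpage content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all quote elements on the current page
            quotes = soup.find_all('div', class_='quote')

            # If no quotes are found, break the loop (end of pages)
            if not quotes:
                break

            # Loop through the quotes and store the text and author
            for quote in quotes:
                text = quote.find('span', class_='text').text
                author = quote.find('small', class_='author').text
                quotes_list.append(text)
                authors_list.append(author)

            # Move to the next page
            page += 1
        else:
            print(f'Failed to retrieve webpage. Status code: {response.status_code}')
            break

    # Create a DataFrame and write it to an Excel file
    # (to_excel requires an engine such as openpyxl)
    quotes_df = pd.DataFrame({
        'Quote': quotes_list,
        'Author': authors_list
    })
    quotes_df.to_excel('quotes.xlsx', index=False)
    print("Quotes have been written to quotes.xlsx")

# Run the scraper with page limit
scrape_quotes(max_pages=10)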
