Added pagination in web scraping code
tanvi0909 committed Nov 7, 2024
1 parent 9483c5f commit ff84fdb
Showing 1 changed file with 43 additions and 33 deletions.
76 changes: 43 additions & 33 deletions web scraping in python/quotes.toscrape.com/scrap.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -18,47 +18,57 @@
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"\n",
"def scrape_quotes():\n",
" # URL of the site to scrape\n",
" url = 'http://quotes.toscrape.com/'\n",
"def scrape_quotes(max_pages=10): # max_pages parameter to limit the number of pages\n",
" # Base URL of the site to scrape\n",
" base_url = 'http://quotes.toscrape.com/page/'\n",
" \n",
" # Send a GET request to fetch the webpage content\n",
" response = requests.get(url)\n",
" # Lists to hold quotes and authors\n",
" quotes_list = []\n",
" authors_list = []\n",
" \n",
" page = 1 # Start with the first page\n",
" while page <= max_pages: # Limit the loop to max_pages\n",
" # Send a GET request to fetch the webpage content of the current page\n",
" url = f'{base_url}{page}/'\n",
" response = requests.get(url)\n",
"\n",
" # Check if the request was successful\n",
" if response.status_code == 200:\n",
" # Parse the webpage content\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
" # Check if the request was successful\n",
" if response.status_code == 200:\n",
" # Parse the webpage content\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # Find all quote elements\n",
" quotes = soup.find_all('div', class_='quote')\n",
" # Find all quote elements on the current page\n",
" quotes = soup.find_all('div', class_='quote')\n",
"\n",
" # Lists to hold quotes and authors\n",
" quotes_list = []\n",
" authors_list = []\n",
" # If no quotes are found, break the loop (end of pages)\n",
" if not quotes:\n",
" break\n",
"\n",
" # Loop through the quotes and store the text and author\n",
" for quote in quotes:\n",
" text = quote.find('span', class_='text').text\n",
" author = quote.find('small', class_='author').text\n",
" quotes_list.append(text)\n",
" authors_list.append(author)\n",
" # Loop through the quotes and store the text and author\n",
" for quote in quotes:\n",
" text = quote.find('span', class_='text').text\n",
" author = quote.find('small', class_='author').text\n",
" quotes_list.append(text)\n",
" authors_list.append(author)\n",
"\n",
" # Create a DataFrame\n",
" quotes_df = pd.DataFrame({\n",
" 'Quote': quotes_list,\n",
" 'Author': authors_list\n",
" })\n",
" # Move to the next page\n",
" page += 1\n",
" else:\n",
" print(f'Failed to retrieve webpage. Status code: {response.status_code}')\n",
" break\n",
"\n",
" # Write the DataFrame to an Excel file\n",
" quotes_df.to_excel('quotes.xlsx', index=False)\n",
" print(\"Quotes have been written to quotes.xlsx\")\n",
" # Create a DataFrame\n",
" quotes_df = pd.DataFrame({\n",
" 'Quote': quotes_list,\n",
" 'Author': authors_list\n",
" })\n",
"\n",
" else:\n",
" print(f'Failed to retrieve webpage. Status code: {response.status_code}')\n",
" # Write the DataFrame to an Excel file\n",
" quotes_df.to_excel('quotes.xlsx', index=False)\n",
" print(\"Quotes have been written to quotes.xlsx\")\n",
"\n",
"# Run the scraper\n",
"scrape_quotes()"
"# Run the scraper with page limit\n",
"scrape_quotes(max_pages=10)"
]
}
],
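For readability, here is the updated cell assembled from the `+` lines of the diff as plain Python. The only assumption beyond the diff is the `import requests` line, which presumably sits in the collapsed context above the second hunk since the code calls `requests.get`. Note that `DataFrame.to_excel` needs an Excel engine such as openpyxl installed, and that quotes.toscrape.com returns status 200 with an empty quote list past its last page, which is what the `if not quotes` check relies on.

# Post-commit cell, assembled from the diff's added lines.
# Assumption: `import requests` comes from the collapsed context above the hunk.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_quotes(max_pages=10):  # max_pages parameter to limit the number of pages
    # Base URL of the site to scrape
    base_url = 'http://quotes.toscrape.com/page/'

    # Lists to hold quotes and authors
    quotes_list = []
    authors_list = []

    page = 1  # Start with the first page
    while page <= max_pages:  # Limit the loop to max_pages
        # Send a GET request to fetch the webpage content of the current page
        url = f'{base_url}{page}/'
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the webpage content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all quote elements on the current page
            quotes = soup.find_all('div', class_='quote')

            # If no quotes are found, break the loop (end of pages)
            if not quotes:
                break

            # Loop through the quotes and store the text and author
            for quote in quotes:
                text = quote.find('span', class_='text').text
                author = quote.find('small', class_='author').text
                quotes_list.append(text)
                authors_list.append(author)

            # Move to the next page
            page += 1
        else:
            print(f'Failed to retrieve webpage. Status code: {response.status_code}')
            break

    # Create a DataFrame and write it to an Excel file
    # (to_excel requires an engine such as openpyxl)
    quotes_df = pd.DataFrame({
        'Quote': quotes_list,
        'Author': authors_list
    })
    quotes_df.to_excel('quotes.xlsx', index=False)
    print("Quotes have been written to quotes.xlsx")

# Run the scraper with page limit
scrape_quotes(max_pages=10)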
