data-bootcamp-v4 · gcastellano3 · Nov 1, 2025
diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
@@ -111,13 +111,109 @@
     {
       "cell_type": "code",
       "execution_count": null,
-      "id": "40359eee-9cd7-4884-bfa4-83344c222305",
-      "metadata": {
-        "id": "40359eee-9cd7-4884-bfa4-83344c222305"
-      },
+      "id": "39ebd249",
+      "metadata": {},
       "outputs": [],
       "source": [
-        "# Your solution goes here"
+        "# Importing the libraries\n",
+        "import requests\n",
+        "from bs4 import BeautifulSoup\n",
+        "import pandas as pd\n",
+        "\n",
+        "# Function to scrape book data\n",
+        "def scrape_books(min_rating=1, max_price=float('inf')):\n",
+        "    \"\"\"Scrape books.toscrape.com and return the books that pass the filters.\n",
+        "\n",
+        "    Walks the catalogue listing pages, follows every book link found there,\n",
+        "    and builds one row per book from its detail page.\n",
+        "\n",
+        "    Args:\n",
+        "        min_rating: Lowest star rating to keep (inclusive); ratings are 1-5.\n",
+        "        max_price: Highest price to keep (inclusive), in pounds.\n",
+        "\n",
+        "    Returns:\n",
+        "        A pandas DataFrame with the columns UPC, Title, Price (£), Rating,\n",
+        "        Genre, Availability and Description, holding only the books whose\n",
+        "        rating is >= min_rating and whose price is <= max_price.\n",
+        "\n",
+        "    Raises:\n",
+        "        requests.HTTPError: If a page answers with an HTTP error status.\n",
+        "    \"\"\"\n",
+        "    base_url = 'https://books.toscrape.com/catalogue/'\n",
+        "    timeout = 30  # seconds; requests never times out unless told to\n",
+        "    # Maps the rating word used in the page markup to its numeric value\n",
+        "    rating_words = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
+        "    # Column order of the resulting DataFrame\n",
+        "    columns = ['UPC', 'Title', 'Price (£)', 'Rating', 'Genre', 'Availability', 'Description']\n",
+        "\n",
+        "    # A single session reuses the connection across all requests\n",
+        "    with requests.Session() as session:\n",
+        "\n",
+        "        def fetch_soup(url):\n",
+        "            # Fail loudly on an HTTP error instead of parsing an error page\n",
+        "            response = session.get(url, timeout=timeout)\n",
+        "            response.raise_for_status()\n",
+        "            return BeautifulSoup(response.content, \"html.parser\")\n",
+        "\n",
+        "        # Collecting the book URLs; listing pages are numbered from 1, not 0\n",
+        "        books_urls = []\n",
+        "        for pag in range(1, 51):\n",
+        "            soup = fetch_soup(f'{base_url}page-{pag}.html')\n",
+        "            for container in soup.find_all(class_='image_container'):\n",
+        "                # The hrefs are relative to the catalogue directory\n",
+        "                books_urls.extend(base_url + a['href'] for a in container.find_all('a'))\n",
+        "\n",
+        "        # Scraping data from each book URL, one record per book\n",
+        "        records = []\n",
+        "        for url in books_urls:\n",
+        "            soup = fetch_soup(url)\n",
+        "\n",
+        "            # Extracting the book description (some books have none)\n",
+        "            desc_find = soup.find('div', id='product_description')\n",
+        "            if desc_find:\n",
+        "                description = desc_find.find_next_sibling('p').get_text(strip=True)\n",
+        "            else:\n",
+        "                description = \"No description available.\"\n",
+        "\n",
+        "            # Extracting data from the product information table\n",
+        "            # Cells used below: 0 = UPC, 2 = price, 5 = availability\n",
+        "            table = soup.find('table', class_='table table-striped')\n",
+        "            tds = table.find_all('td')\n",
+        "\n",
+        "            # Extracting the title\n",
+        "            title = soup.find('div', class_='product_main').find('h1').get_text(strip=True)\n",
+        "\n",
+        "            # Extracting the rating word: second CSS class of the star-rating tag\n",
+        "            rating = soup.find('p', class_='star-rating')['class'][1]\n",
+        "\n",
+        "            # Extracting the genre from the third breadcrumb entry\n",
+        "            genre = soup.find('ul', class_='breadcrumb').find_all('li')[2].get_text(strip=True)\n",
+        "\n",
+        "            records.append({\n",
+        "                'UPC': tds[0].get_text(strip=True),\n",
+        "                'Title': title,\n",
+        "                'Price (£)': tds[2].get_text(strip=True),\n",
+        "                'Rating': rating,\n",
+        "                'Genre': genre,\n",
+        "                'Availability': tds[5].get_text(strip=True),\n",
+        "                'Description': description,\n",
+        "            })\n",
+        "\n",
+        "    # Creating a DataFrame to store the scraped data; passing the columns\n",
+        "    # keeps the schema intact even when nothing was scraped\n",
+        "    df = pd.DataFrame(records, columns=columns)\n",
+        "\n",
+        "    # Cleaning and converting the price column to float. The column is named\n",
+        "    # 'Price (£)' from the start so this step and the filter below find it.\n",
+        "    df['Price (£)'] = df['Price (£)'].str.replace('£', '', regex=False).astype(float)\n",
+        "\n",
+        "    # Converting the rating words to numerical values\n",
+        "    df['Rating'] = df['Rating'].map(rating_words).astype('int64')\n",
+        "\n",
+        "    # Filtering the DataFrame based on the provided criteria\n",
+        "    filtered_df = df[(df['Rating'] >= min_rating) & (df['Price (£)'] <= max_price)]\n",
+        "\n",
+        "    return filtered_df"
       ]
     }
   ],
@@ -126,7 +222,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "Python 3",
       "language": "python",
       "name": "python3"
     },
@@ -140,7 +236,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.12.9"
     }
   },
   "nbformat": 4,