# Importing the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Root of the catalogue; listing pages and book pages both live under it.
_CATALOGUE_URL = 'https://books.toscrape.com/catalogue/'

# Listing pages are numbered starting at 1 (there is no page-0.html).
_FIRST_PAGE = 1
_LAST_PAGE = 50

# Seconds to wait for a response before giving up on a request.
_REQUEST_TIMEOUT = 30

# The site encodes the rating as a CSS class word on the star-rating <p>.
_RATING_WORDS = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5,
}

# Column order of the returned DataFrame.
_COLUMNS = [
    'UPC',
    'Title',
    'Price (£)',
    'Rating',
    'Genre',
    'Availability',
    'Description',
]


def _fetch_soup(session, url):
    """Download *url* and return its parsed HTML, raising on HTTP errors."""
    response = session.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly instead of parsing an error page as if it were a book page.
    response.raise_for_status()
    # Passing bytes lets BeautifulSoup detect the encoding itself.
    return BeautifulSoup(response.content, "html.parser")


def _collect_book_urls(session):
    """Return the absolute URL of every book linked from the listing pages."""
    book_urls = []
    for page in range(_FIRST_PAGE, _LAST_PAGE + 1):
        soup = _fetch_soup(session, f'{_CATALOGUE_URL}page-{page}.html')
        for container in soup.find_all(class_='image_container'):
            # The hrefs are relative to the catalogue directory.
            book_urls.extend(
                f"{_CATALOGUE_URL}{a['href']}" for a in container.find_all('a')
            )
    return book_urls


def _parse_book(soup):
    """Extract one book's fields from its parsed product page as a dict."""
    # The description is the <p> following the #product_description div;
    # some books have neither.
    description = "No description available."
    desc_header = soup.find('div', id='product_description')
    if desc_header:
        desc_paragraph = desc_header.find_next_sibling('p')
        if desc_paragraph:
            description = desc_paragraph.get_text(strip=True)

    # Cells of the product information table, read by position:
    # 0 is the UPC, 2 is the price and 5 is the availability.
    table = soup.find('table', class_='table table-striped')
    cells = table.find_all('td')

    title = soup.find('div', class_='product_main').find('h1')
    # The second CSS class on the star-rating element is the rating word.
    rating_word = soup.find('p', class_='star-rating')['class'][1]
    # The third breadcrumb entry is the book's genre.
    genre = soup.find('ul', class_='breadcrumb').find_all('li')[2]

    return {
        'UPC': cells[0].get_text(strip=True),
        'Title': title.get_text(strip=True),
        'Price (£)': cells[2].get_text(strip=True),
        'Rating': rating_word,
        'Genre': genre.get_text(strip=True),
        'Availability': cells[5].get_text(strip=True),
        'Description': description,
    }


# Function to scrape book data
def scrape_books(min_rating=1, max_price=float('inf')):
    """Scrape books.toscrape.com and return the books matching the filters.

    Every listing page is visited to collect the book URLs, then every book
    page is downloaded and parsed.

    Args:
        min_rating: Lowest star rating (1-5) a book may have to be kept.
        max_price: Highest price in pounds a book may have to be kept.

    Returns:
        A pandas DataFrame with the columns 'UPC', 'Title', 'Price (£)'
        (float), 'Rating' (int64), 'Genre', 'Availability' and
        'Description', restricted to rows with
        ``Rating >= min_rating`` and ``Price (£) <= max_price``.

    Raises:
        requests.HTTPError: If any page answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # One session reuses the connection across the many requests made here.
    with requests.Session() as session:
        rows = [
            _parse_book(_fetch_soup(session, url))
            for url in _collect_book_urls(session)
        ]

    # Explicit columns keep the frame well-formed even when no rows exist.
    df = pd.DataFrame(rows, columns=_COLUMNS)

    # Cleaning and converting the price column to float. The column is
    # created as 'Price (£)' so this step and the filter below address a
    # column that actually exists.
    df['Price (£)'] = (
        df['Price (£)'].str.replace('£', '', regex=False).astype(float)
    )

    # Converting the 'Rating' column to numerical values
    df['Rating'] = df['Rating'].map(_RATING_WORDS).astype('int64')

    # Filtering the DataFrame based on the provided criteria
    keep = (df['Rating'] >= min_rating) & (df['Price (£)'] <= max_price)
    return df[keep]