Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 103 additions & 7 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,109 @@
{
"cell_type": "code",
"execution_count": null,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"id": "39ebd249",
"metadata": {},
"outputs": [],
"source": [
"# Your solution goes here"
"# Importing the libraries\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"\n",
"# Function to scrape book data\n",
"def scrape_books(min_rating=1, max_price=float('inf')):\n",
"    \"\"\"Scrape the books.toscrape.com catalogue and filter it by rating and price.\n",
"\n",
"    Every catalogue page is fetched to collect the book detail URLs, then each\n",
"    detail page is fetched and parsed into one row of the result.\n",
"\n",
"    Args:\n",
"        min_rating: Lowest star rating (1-5) a book may have to be kept.\n",
"        max_price: Highest price in pounds a book may have to be kept.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with the columns 'UPC', 'Title', 'Price (£)' (float),\n",
"        'Rating' (int64), 'Genre', 'Availability' and 'Description', limited\n",
"        to the books that satisfy both criteria.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If a catalogue or book page answers with an error status.\n",
"        requests.Timeout: If a page does not answer within the timeout.\n",
"    \"\"\"\n",
"    base_url = 'https://books.toscrape.com/catalogue/'\n",
"    # requests waits indefinitely unless a timeout (in seconds) is given.\n",
"    timeout = 30\n",
"\n",
"    # Catalogue pages are numbered 1-50; asking for page 0 only returns an error page.\n",
"    web_urls = [f'{base_url}page-{pag}.html' for pag in range(1, 51)]\n",
"\n",
"    books_urls = []\n",
"    records = []\n",
"\n",
"    # A single session reuses the connection across all page requests.\n",
"    with requests.Session() as session:\n",
"        # Collecting the link of every book from each catalogue page\n",
"        for url in web_urls:\n",
"            response = session.get(url, timeout=timeout)\n",
"            response.raise_for_status()\n",
"            soup = BeautifulSoup(response.content, 'html.parser')\n",
"            for container in soup.find_all(class_='image_container'):\n",
"                # hrefs on the catalogue pages are relative to the catalogue folder\n",
"                books_urls.extend(base_url + a['href'] for a in container.find_all('a'))\n",
"\n",
"        # Scraping data from each book URL\n",
"        for url in books_urls:\n",
"            response = session.get(url, timeout=timeout)\n",
"            response.raise_for_status()\n",
"            soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
"            # The description text is the <p> following the #product_description block;\n",
"            # either element may be missing on a given page.\n",
"            desc_find = soup.find('div', id='product_description')\n",
"            desc_tag = desc_find.find_next_sibling('p') if desc_find else None\n",
"            if desc_tag:\n",
"                description = desc_tag.get_text(strip=True)\n",
"            else:\n",
"                description = 'No description available.'\n",
"\n",
"            # Product information table cells: 0 = UPC, 2 = price, 5 = availability\n",
"            tds = soup.find('table', class_='table table-striped').find_all('td')\n",
"\n",
"            records.append({\n",
"                'UPC': tds[0].get_text(strip=True),\n",
"                'Title': soup.find('div', class_='product_main').find('h1').get_text(strip=True),\n",
"                'Price (£)': tds[2].get_text(strip=True),\n",
"                # The second CSS class of the star-rating tag spells out the rating\n",
"                'Rating': soup.find('p', class_='star-rating')['class'][1],\n",
"                # The third breadcrumb entry is the genre\n",
"                'Genre': soup.find('ul', class_='breadcrumb').find_all('li')[2].get_text(strip=True),\n",
"                'Availability': tds[5].get_text(strip=True),\n",
"                'Description': description,\n",
"            })\n",
"\n",
"    # Passing the columns explicitly keeps the schema stable even with no rows\n",
"    columns = ['UPC', 'Title', 'Price (£)', 'Rating', 'Genre', 'Availability', 'Description']\n",
"    df = pd.DataFrame(records, columns=columns)\n",
"\n",
"    # Dropping the currency symbol (and any stray non-numeric characters) before the float conversion\n",
"    df['Price (£)'] = df['Price (£)'].str.replace(r'[^0-9.]', '', regex=True).astype(float)\n",
"\n",
"    # Converting the 'Rating' column to numerical values\n",
"    rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
"    df['Rating'] = df['Rating'].map(rating_map).astype('int64')\n",
"\n",
"    # Filtering the DataFrame based on the provided criteria\n",
"    filtered_df = df[(df['Rating'] >= min_rating) & (df['Price (£)'] <= max_price)]\n",
"\n",
"    return filtered_df"
]
}
],
Expand All @@ -126,7 +222,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down