Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 274 additions & 4 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,284 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 41,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"source": [
"# Your solution goes here"
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"\n",
"# Step 1: Send a GET request to the webpage\n",
"url = 'https://books.toscrape.com/'\n",
"response = requests.get(url) \n",
"response\n",
"\n",
"soup = BeautifulSoup(response.content) \n",
"\n",
"grid = soup.find(\"ol\", attrs = {\"class\":\"row\"})\n",
"books = grid.find_all(\"li\", attrs = {\"class\":\"col-xs-6 col-sm-4 col-md-3 col-lg-3\"})"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "4d1faef2",
"metadata": {},
"outputs": [],
"source": [
"# Define function to get book name\n",
"def get_book_name(book):\n",
" return book.find(\"h3\").get_text()\n",
"\n",
"# Define function to get book price and exclude books above 20 from inclusion in dataframe\n",
"def get_book_price(book):\n",
" price1 = book.find(\"p\", attrs = {\"class\":\"price_color\"}).get_text().replace(\"£\",\"\")\n",
" price = float(price1)\n",
" return price\n",
"\n",
"# Define function to get book rating, and filter out books with less than four stars\n",
"def get_book_rating(book):\n",
" rating_str = book.find(\"p\", attrs = {\"class\":\"star-rating\"})['class'][1]\n",
" if rating_str == \"One\":\n",
" rating = 1\n",
" elif rating_str == \"Two\":\n",
" rating = 2\n",
" elif rating_str == \"Three\":\n",
" rating = 3\n",
" elif rating_str == \"Four\":\n",
" rating = 4\n",
" elif rating_str == \"Five\":\n",
" rating = 5\n",
" return rating\n",
"\n",
"# Define function to get book URL\n",
"def get_book_url(book):\n",
" partial_link = book.find(\"h3\").find(\"a\")['href']\n",
" full_link = \"https://books.toscrape.com/\" + partial_link\n",
" return full_link\n",
"\n",
"\n",
"# Define function to add UPC code to dataframe\n",
"def get_book_upc(book_url):\n",
" book_response = requests.get(book_url)\n",
" book_soup = BeautifulSoup(book_response.content)\n",
" table = book_soup.find(\"table\", attrs = {\"class\":\"table table-striped\"})\n",
" upc = table.find(\"td\").get_text()\n",
" return upc\n",
"\n",
"# Define function to add book title:\n",
"def get_book_title(book_url):\n",
" book_response = requests.get(book_url)\n",
" book_soup = BeautifulSoup(book_response.content)\n",
" table = book_soup.find(\"div\", attrs = {\"class\":\"product_main\"})\n",
" book_title = table.find(\"h1\").get_text()\n",
" return book_title\n",
"\n",
"# Define function to add book genre:\n",
"def get_book_genre(book_url):\n",
" book_response = requests.get(book_url)\n",
" book_soup = BeautifulSoup(book_response.content)\n",
" breadcrumb = book_soup.find(\"ul\", attrs = {\"class\":\"breadcrumb\"})\n",
" genre = breadcrumb.find_all(\"li\")[2].get_text().strip()\n",
" return genre\n",
"\n",
"# Define function to check for availability:\n",
"def get_book_availability(book_url):\n",
" book_response = requests.get(book_url)\n",
" book_soup = BeautifulSoup(book_response.content)\n",
" table = book_soup.find(\"table\", attrs = {\"class\":\"table table-striped\"})\n",
" availability1 = table.find_all(\"td\")[5].get_text().strip().split(\"(\")[1].split()[0]\n",
" availability = int(availability1)\n",
" return availability\n",
"\n",
"# Define function to return book description:\n",
"def get_book_description(book_url):\n",
" book_response = requests.get(book_url)\n",
" book_soup = BeautifulSoup(book_response.content)\n",
" table = book_soup.find(\"article\", attrs = {\"class\":\"product_page\"})\n",
" description = table.find_all(\"p\")[3].get_text()\n",
" return description\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5da840a3",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "Title",
"rawType": "object",
"type": "string"
},
{
"name": "Price",
"rawType": "float64",
"type": "float"
},
{
"name": "Rating",
"rawType": "int64",
"type": "integer"
},
{
"name": "URL",
"rawType": "object",
"type": "string"
},
{
"name": "UPC",
"rawType": "object",
"type": "string"
},
{
"name": "Genre",
"rawType": "object",
"type": "string"
},
{
"name": "Availability",
"rawType": "int64",
"type": "integer"
},
{
"name": "Description",
"rawType": "object",
"type": "string"
}
],
"ref": "f05b2f53-6fc9-4394-8fca-114a582b8aff",
"rows": [
[
"0",
"Set Me Free",
"17.46",
"5",
"https://books.toscrape.com/catalogue/set-me-free_988/index.html",
"ce6396b0f23f6ecc",
"Young Adult",
"19",
"Aaron Ledbetter’s future had been planned out for him since before he was born. Each year, the Ledbetter family vacation on Tybee Island gave Aaron a chance to briefly free himself from his family’s expectations. When he meets Jonas “Lucky” Luckett, a caricature artist in town with the traveling carnival, he must choose between the life that’s been mapped out for him, and Aaron Ledbetter’s future had been planned out for him since before he was born. Each year, the Ledbetter family vacation on Tybee Island gave Aaron a chance to briefly free himself from his family’s expectations. When he meets Jonas “Lucky” Luckett, a caricature artist in town with the traveling carnival, he must choose between the life that’s been mapped out for him, and the chance at true love. ...more"
]
],
"shape": {
"columns": 8,
"rows": 1
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Title</th>\n",
" <th>Price</th>\n",
" <th>Rating</th>\n",
" <th>URL</th>\n",
" <th>UPC</th>\n",
" <th>Genre</th>\n",
" <th>Availability</th>\n",
" <th>Description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Set Me Free</td>\n",
" <td>17.46</td>\n",
" <td>5</td>\n",
" <td>https://books.toscrape.com/catalogue/set-me-fr...</td>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>Young Adult</td>\n",
" <td>19</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Title Price Rating \\\n",
"0 Set Me Free 17.46 5 \n",
"\n",
" URL UPC \\\n",
"0 https://books.toscrape.com/catalogue/set-me-fr... ce6396b0f23f6ecc \n",
"\n",
" Genre Availability \\\n",
"0 Young Adult 19 \n",
"\n",
" Description \n",
"0 Aaron Ledbetter’s future had been planned out ... "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"# Define function to only append books with price <=20 and rating >=4 to dataframe\n",
"def books_to_scrape(max_price, min_rating):\n",
" book_dict = {}\n",
" index = 0\n",
" for book in books:\n",
" price = get_book_price(book)\n",
" rating = get_book_rating(book)\n",
" url = get_book_url(book)\n",
" \n",
"\n",
" if price <= max_price and rating >= min_rating:\n",
" upc = get_book_upc(url)\n",
" title = get_book_title(url)\n",
" genre = get_book_genre(url)\n",
" availability = get_book_availability(url)\n",
" description = get_book_description(url)\n",
" \n",
" book_dict[index] = {\"Title\": title,\n",
" \"Price\": price,\n",
" \"Rating\": rating,\n",
" \"URL\": url,\n",
" \"UPC\": upc,\n",
" \"Genre\": genre,\n",
" \"Availability\": availability,\n",
" \"Description\": description\n",
" }\n",
" index += 1\n",
" return book_dict\n",
"\n",
"book_dict1 = books_to_scrape(20, 4)\n",
"\n",
"df = pd.DataFrame.from_dict(book_dict1, orient=\"index\")\n",
"df\n",
"\n"
]
}
],
Expand All @@ -126,7 +396,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +410,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down