# --- Cell 1: download the landing page and collect the book cards ----------
import re
from functools import lru_cache
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Send a GET request to the webpage.
# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops us from silently parsing an HTTP error page.
url = 'https://books.toscrape.com/'
response = requests.get(url, timeout=30)
response.raise_for_status()

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed, which varies between machines and emits a warning.
soup = BeautifulSoup(response.content, "html.parser")

grid = soup.find("ol", attrs={"class": "row"})
books = grid.find_all(
    "li", attrs={"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"}
)


# --- Cell 2: helpers that extract one field each ----------------------------

# Star ratings are encoded as a CSS class name next to "star-rating".
_RATING_WORDS = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

# Matches the stock count inside e.g. "In stock (19 available)".
_STOCK_COUNT_RE = re.compile(r"\((\d+)")

# Matches the numeric part of a price such as "£51.77".
_PRICE_RE = re.compile(r"\d+(?:\.\d+)?")


@lru_cache(maxsize=64)
def _get_book_soup(book_url):
    """Download and parse a book detail page, caching the parsed result.

    All detail-page getters below need the same page, so caching avoids
    issuing one HTTP request per field for every book. The cached soup is
    shared between callers and must be treated as read-only.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status.
    """
    book_response = requests.get(book_url, timeout=30)
    book_response.raise_for_status()
    return BeautifulSoup(book_response.content, "html.parser")


def _get_product_table(book_url):
    """Return the product-information table of a book detail page."""
    return _get_book_soup(book_url).find(
        "table", attrs={"class": "table table-striped"}
    )


def get_book_name(book):
    """Return the book name shown on a listing card.

    The visible link text may be shortened with an ellipsis, so the link's
    ``title`` attribute is preferred when it is present; otherwise the text
    of the ``<h3>`` is returned as before.
    """
    heading = book.find("h3")
    link = heading.find("a")
    if link is not None and link.get("title"):
        return link["title"]
    return heading.get_text()


def get_book_price(book):
    """Return the price shown on a listing card as a float.

    No filtering happens here; callers decide which prices to keep.

    Raises:
        ValueError: if the price text contains no number.
    """
    price_text = book.find("p", attrs={"class": "price_color"}).get_text()
    # Pull out just the number so a currency symbol (or a mis-decoded one)
    # in front of it cannot break the float conversion.
    match = _PRICE_RE.search(price_text)
    if match is None:
        raise ValueError(f"no numeric price found in {price_text!r}")
    return float(match.group())


def get_book_rating(book):
    """Return the star rating of a listing card as an integer from 1 to 5.

    No filtering happens here; callers decide which ratings to keep.

    Raises:
        ValueError: if none of the tag's classes is a known rating word.
    """
    rating_classes = book.find("p", attrs={"class": "star-rating"})["class"]
    for class_name in rating_classes:
        if class_name in _RATING_WORDS:
            return _RATING_WORDS[class_name]
    raise ValueError(f"unrecognised star rating classes: {rating_classes!r}")


def get_book_url(book, base_url="https://books.toscrape.com/"):
    """Return the absolute URL of a book's detail page.

    Args:
        book: listing card containing an ``<h3><a href=...>`` link.
        base_url: URL of the page the card was scraped from. The card's
            ``href`` is resolved relative to it, so pass the page's own URL
            when scraping anything other than the landing page.
    """
    partial_link = book.find("h3").find("a")["href"]
    return urljoin(base_url, partial_link)


def get_book_upc(book_url):
    """Return the UPC code: the first cell of the product-information table."""
    return _get_product_table(book_url).find("td").get_text()


def get_book_title(book_url):
    """Return the full book title from the detail page's ``<h1>``."""
    product_main = _get_book_soup(book_url).find(
        "div", attrs={"class": "product_main"}
    )
    return product_main.find("h1").get_text()


def get_book_genre(book_url):
    """Return the genre: the third entry of the breadcrumb trail."""
    breadcrumb = _get_book_soup(book_url).find(
        "ul", attrs={"class": "breadcrumb"}
    )
    return breadcrumb.find_all("li")[2].get_text().strip()


def get_book_availability(book_url):
    """Return the number of copies available, or 0 if no count is shown.

    The count is read from the sixth cell of the product-information table,
    which holds text such as "In stock (19 available)".
    """
    availability_text = _get_product_table(book_url).find_all("td")[5].get_text()
    match = _STOCK_COUNT_RE.search(availability_text)
    # No "(N ..." part means no stock figure is advertised; report zero
    # instead of crashing on the missing parenthesis.
    return int(match.group(1)) if match else 0


def get_book_description(book_url):
    """Return the product description text, or "" if the page has none.

    The description paragraph is located through the "product_description"
    header that precedes it rather than by counting ``<p>`` tags, so a page
    without a description cannot yield an unrelated paragraph.
    """
    article = _get_book_soup(book_url).find(
        "article", attrs={"class": "product_page"}
    )
    header = article.find("div", attrs={"id": "product_description"})
    if header is None:
        return ""
    paragraph = header.find_next_sibling("p")
    return paragraph.get_text() if paragraph is not None else ""
...more" + ] + ], + "shape": { + "columns": 8, + "rows": 1 + } + }, + "text/html": [ + "
| \n", + " | Title | \n", + "Price | \n", + "Rating | \n", + "URL | \n", + "UPC | \n", + "Genre | \n", + "Availability | \n", + "Description | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "Set Me Free | \n", + "17.46 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/set-me-fr... | \n", + "ce6396b0f23f6ecc | \n", + "Young Adult | \n", + "19 | \n", + "Aaron Ledbetter’s future had been planned out ... | \n", + "