diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb index e552783..42494d7 100644 --- a/lab-web-scraping.ipynb +++ b/lab-web-scraping.ipynb @@ -110,15 +110,668 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "40359eee-9cd7-4884-bfa4-83344c222305", "metadata": { "id": "40359eee-9cd7-4884-bfa4-83344c222305" }, "outputs": [], "source": [ - "# Your solution goes here" + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "4368425e", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://books.toscrape.com/catalogue/page-1.html\"\n", + "response = requests.get(url)\n", + "\n", + "soup = BeautifulSoup(response.content)\n", + "\n", + "#identify the grid\n", + "grid = soup.find(\"ol\", attrs = {\"class\":\"row\"})\n", + "\n", + "#within the grid, identify all books\n", + "books = grid.find_all(\"article\", attrs = {\"class\":\"product_pod\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "a985fe3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Three'" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#title\n", + "books[0].find_all(\"a\")[-1][\"title\"]\n", + "\n", + "#price\n", + "books[0].find(\"p\", attrs = {\"class\":\"price_color\"}).get_text()\n", + "\n", + "#rating\n", + "books[0].find(\"p\", attrs = {\"class\":\"star-rating\"})[\"class\"][-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "6a157e27", + "metadata": {}, + "outputs": [], + "source": [ + "def get_title(book):\n", + " return book.find_all(\"a\")[-1][\"title\"]\n", + "\n", + "def get_price(book):\n", + " raw_price = book.find(\"p\", attrs = {\"class\":\"price_color\"}).get_text()\n", + " clean_price = float(raw_price.replace(\"£\", \"\"))\n", + " return float(clean_price)\n", + "\n", + "def get_rating(book):\n", + " raw_rating = book.find(\"p\", attrs = {\"class\":\"star-rating\"})[\"class\"][-1]\n", + " if raw_rating == \"One\":\n", + " return 1\n", + " elif raw_rating == \"Two\":\n", + " return 2\n", + " elif raw_rating == \"Three\":\n", + " return 3\n", + " elif raw_rating == \"Four\":\n", + " return 4\n", + " else:\n", + " return 5\n", + " \n", + "def get_link(book):\n", + " domain = \"https://books.toscrape.com/catalogue/\"\n", + " href = book.find_all(\"a\")[-1][\"href\"]\n", + "\n", + " return domain + href\n", + "\n", + "\n", + "def get_upc(soup_book):\n", + " upc = soup_book.find(\"td\").get_text()\n", + " return upc\n", + "\n", + "\n", + "def get_availability(soup_book):\n", + " return soup_book.find(\"p\", attrs={\"class\":\"instock availability\"}).get_text().strip()\n", + "\n", + "def get_description(soup_book):\n", + " return soup_book.find(\"div\", id=\"product_description\").find_next(\"p\").get_text().strip()\n", + "\n", + "def get_genre(soup_book):\n", + " return soup_book.find(\"ul\", attrs = {\"class\":\"breadcrumb\"}).find_all(\"li\")[-2].get_text().strip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1168e2c4", + "metadata": {}, + "outputs": [], + "source": [ + "def scrape_books(max_price, min_rating, page_number):\n", + "\n", + " url = f\"https://books.toscrape.com/catalogue/page-{page_number}.html\"\n", + " response = requests.get(url)\n", + "\n", + " soup = BeautifulSoup(response.content)\n", + "\n", + " #identify the grid\n", + " grid = soup.find(\"ol\", attrs = {\"class\":\"row\"})\n", + "\n", + " #within the grid, identify all books\n", + " books = grid.find_all(\"article\", attrs = {\"class\":\"product_pod\"})\n", + "\n", + " dict = {}\n", + " index = 0\n", + "\n", + " for book in books:\n", + " title = get_title(book)\n", + " price = get_price(book)\n", + " rating = get_rating(book)\n", + " book_url = get_link(book)\n", + "\n", + "\n", + "\n", + " if max_price >= price and min_rating <= rating:\n", + " #if both conditions above are met, we need to extract more data (UPC, Genre, etc)\n", + " r_book = requests.get(book_url)\n", + " soup_book = BeautifulSoup(r_book.content)\n", + "\n", + " #extra fields to scrap\n", + " upc = get_upc(soup_book)\n", + " availability = get_availability(soup_book)\n", + " description = get_description(soup_book)\n", + " genre = get_genre(soup_book)\n", + "\n", + "\n", + " dict[index] = {\"title\": title,\n", + " \"price\":price,\n", + " \"rating\":rating,\n", + " \"url\": book_url,\n", + " \"upc\":upc,\n", + " \"availability\": availability,\n", + " \"description\":description,\n", + " \"genre\": genre}\n", + " \n", + " index +=1\n", + " else:\n", + " pass\n", + "\n", + " return pd.DataFrame.from_dict(dict, orient = \"index\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e51553bb", + "metadata": {}, + "outputs": [], + "source": [ + "max_price = 20\n", + "min_rating = 4\n", + "\n", + "list_of_dfs = []\n", + "for i in range(1, 51):\n", + " df = scrape_books(max_price= max_price, \n", + " min_rating= min_rating, \n", + " page_number=i)\n", + " \n", + " list_of_dfs.append(df)\n", + " print(f\"Scraping page number {i}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "54cc8f93", + "metadata": {}, + "outputs": [], + "source": [ + "full_df = pd.concat(list_of_dfs, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "181d5703", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | title | \n", + "price | \n", + "rating | \n", + "url | \n", + "upc | \n", + "availability | \n", + "description | \n", + "genre | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "Set Me Free | \n", + "17.46 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/set-me-fr... | \n", + "ce6396b0f23f6ecc | \n", + "In stock (19 available) | \n", + "Aaron Ledbetter’s future had been planned out ... | \n", + "Young Adult | \n", + "
| 1 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "17.66 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/the-four-... | \n", + "6258a1f6a6dcfe50 | \n", + "In stock (18 available) | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "Spirituality | \n", + "
| 2 | \n", + "Sophie's World | \n", + "15.94 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/sophies-w... | \n", + "6be3beb0793a53e7 | \n", + "In stock (18 available) | \n", + "A page-turning novel that is also an explorati... | \n", + "Philosophy | \n", + "
| 3 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "14.27 | \n", + "4 | \n", + "https://books.toscrape.com/catalogue/untitled-... | \n", + "657fe5ead67a7767 | \n", + "In stock (16 available) | \n", + "More than thirty-five years ago, when the weat... | \n", + "Poetry | \n", + "
| 4 | \n", + "This One Summer | \n", + "19.49 | \n", + "4 | \n", + "https://books.toscrape.com/catalogue/this-one-... | \n", + "51653ef291ab7ddc | \n", + "In stock (16 available) | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "Sequential Art | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 70 | \n", + "The Zombie Room | \n", + "19.69 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/the-zombi... | \n", + "9c96cd1329fbd82d | \n", + "In stock (1 available) | \n", + "An unlikely bond is forged between three men f... | \n", + "Default | \n", + "
| 71 | \n", + "The Silent Wife | \n", + "12.34 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/the-silen... | \n", + "b78deb463531d078 | \n", + "In stock (1 available) | \n", + "A chilling psychological thriller about a marr... | \n", + "Fiction | \n", + "
| 72 | \n", + "The Girl You Lost | \n", + "12.29 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/the-girl-... | \n", + "4280ac3eab57aa5d | \n", + "In stock (1 available) | \n", + "Eighteen years ago your baby daughter was snat... | \n", + "Mystery | \n", + "
| 73 | \n", + "The Edge of Reason (Bridget Jones #2) | \n", + "19.18 | \n", + "4 | \n", + "https://books.toscrape.com/catalogue/the-edge-... | \n", + "29fc016c459aeb14 | \n", + "In stock (1 available) | \n", + "Monday 27 January“7:15 a.m. Hurrah! The wilder... | \n", + "Womens Fiction | \n", + "
| 74 | \n", + "A Spy's Devotion (The Regency Spies of London #1) | \n", + "16.97 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/a-spys-de... | \n", + "19fec36a1dfb4c16 | \n", + "In stock (1 available) | \n", + "In England’s Regency era, manners and elegance... | \n", + "Historical Fiction | \n", + "
75 rows × 8 columns
\n", + "| \n", + " | title | \n", + "price | \n", + "rating | \n", + "url | \n", + "upc | \n", + "availability | \n", + "description | \n", + "genre | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "Set Me Free | \n", + "17.46 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/set-me-fr... | \n", + "ce6396b0f23f6ecc | \n", + "In stock (19 available) | \n", + "Aaron Ledbetter’s future had been planned out ... | \n", + "Young Adult | \n", + "
| 1 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "17.66 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/the-four-... | \n", + "6258a1f6a6dcfe50 | \n", + "In stock (18 available) | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "Spirituality | \n", + "
| 2 | \n", + "Sophie's World | \n", + "15.94 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/sophies-w... | \n", + "6be3beb0793a53e7 | \n", + "In stock (18 available) | \n", + "A page-turning novel that is also an explorati... | \n", + "Philosophy | \n", + "
| 3 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "14.27 | \n", + "4 | \n", + "https://books.toscrape.com/catalogue/untitled-... | \n", + "657fe5ead67a7767 | \n", + "In stock (16 available) | \n", + "More than thirty-five years ago, when the weat... | \n", + "Poetry | \n", + "
| 4 | \n", + "This One Summer | \n", + "19.49 | \n", + "4 | \n", + "https://books.toscrape.com/catalogue/this-one-... | \n", + "51653ef291ab7ddc | \n", + "In stock (16 available) | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "Sequential Art | \n", + "
| 5 | \n", + "Thirst | \n", + "17.27 | \n", + "5 | \n", + "https://books.toscrape.com/catalogue/thirst_94... | \n", + "709822d0b5bcb7f4 | \n", + "In stock (16 available) | \n", + "On a searing summer Friday, Eddie Chapman has ... | \n", + "Fiction | \n", + "