diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb index e552783..42494d7 100644 --- a/lab-web-scraping.ipynb +++ b/lab-web-scraping.ipynb @@ -110,15 +110,668 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "40359eee-9cd7-4884-bfa4-83344c222305", "metadata": { "id": "40359eee-9cd7-4884-bfa4-83344c222305" }, "outputs": [], "source": [ - "# Your solution goes here" + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "4368425e", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://books.toscrape.com/catalogue/page-1.html\"\n", + "response = requests.get(url)\n", + "\n", + "soup = BeautifulSoup(response.content)\n", + "\n", + "#identify the grid\n", + "grid = soup.find(\"ol\", attrs = {\"class\":\"row\"})\n", + "\n", + "#within the grid, identify all books\n", + "books = grid.find_all(\"article\", attrs = {\"class\":\"product_pod\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "a985fe3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Three'" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#title\n", + "books[0].find_all(\"a\")[-1][\"title\"]\n", + "\n", + "#price\n", + "books[0].find(\"p\", attrs = {\"class\":\"price_color\"}).get_text()\n", + "\n", + "#rating\n", + "books[0].find(\"p\", attrs = {\"class\":\"star-rating\"})[\"class\"][-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "6a157e27", + "metadata": {}, + "outputs": [], + "source": [ + "def get_title(book):\n", + " return book.find_all(\"a\")[-1][\"title\"]\n", + "\n", + "def get_price(book):\n", + " raw_price = book.find(\"p\", attrs = {\"class\":\"price_color\"}).get_text()\n", + " clean_price = float(raw_price.replace(\"£\", \"\"))\n", + " return float(clean_price)\n", + "\n", + "def get_rating(book):\n", + " raw_rating = book.find(\"p\", attrs = {\"class\":\"star-rating\"})[\"class\"][-1]\n", + " if raw_rating == \"One\":\n", + " return 1\n", + " elif raw_rating == \"Two\":\n", + " return 2\n", + " elif raw_rating == \"Three\":\n", + " return 3\n", + " elif raw_rating == \"Four\":\n", + " return 4\n", + " else:\n", + " return 5\n", + " \n", + "def get_link(book):\n", + " domain = \"https://books.toscrape.com/catalogue/\"\n", + " href = book.find_all(\"a\")[-1][\"href\"]\n", + "\n", + " return domain + href\n", + "\n", + "\n", + "def get_upc(soup_book):\n", + " upc = soup_book.find(\"td\").get_text()\n", + " return upc\n", + "\n", + "\n", + "def get_availability(soup_book):\n", + " return soup_book.find(\"p\", attrs={\"class\":\"instock availability\"}).get_text().strip()\n", + "\n", + "def get_description(soup_book):\n", + " return soup_book.find(\"div\", id=\"product_description\").find_next(\"p\").get_text().strip()\n", + "\n", + "def get_genre(soup_book):\n", + " return soup_book.find(\"ul\", attrs = {\"class\":\"breadcrumb\"}).find_all(\"li\")[-2].get_text().strip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1168e2c4", + "metadata": {}, + "outputs": [], + "source": [ + "def scrape_books(max_price, min_rating, page_number):\n", + "\n", + " url = f\"https://books.toscrape.com/catalogue/page-{page_number}.html\"\n", + " response = requests.get(url)\n", + "\n", + " soup = BeautifulSoup(response.content)\n", + "\n", + " #identify the grid\n", + " grid = soup.find(\"ol\", attrs = {\"class\":\"row\"})\n", + "\n", + " #within the grid, identify all books\n", + " books = grid.find_all(\"article\", attrs = {\"class\":\"product_pod\"})\n", + "\n", + " dict = {}\n", + " index = 0\n", + "\n", + " for book in books:\n", + " title = get_title(book)\n", + " price = get_price(book)\n", + " rating = get_rating(book)\n", + " book_url = get_link(book)\n", + "\n", + "\n", + "\n", + " if max_price >= price and min_rating <= rating:\n", + " #if both conditions above are met, we need to extract more data (UPC, Genre, etc)\n", + " r_book = requests.get(book_url)\n", + " soup_book = BeautifulSoup(r_book.content)\n", + "\n", + " #extra fields to scrap\n", + " upc = get_upc(soup_book)\n", + " availability = get_availability(soup_book)\n", + " description = get_description(soup_book)\n", + " genre = get_genre(soup_book)\n", + "\n", + "\n", + " dict[index] = {\"title\": title,\n", + " \"price\":price,\n", + " \"rating\":rating,\n", + " \"url\": book_url,\n", + " \"upc\":upc,\n", + " \"availability\": availability,\n", + " \"description\":description,\n", + " \"genre\": genre}\n", + " \n", + " index +=1\n", + " else:\n", + " pass\n", + "\n", + " return pd.DataFrame.from_dict(dict, orient = \"index\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e51553bb", + "metadata": {}, + "outputs": [], + "source": [ + "max_price = 20\n", + "min_rating = 4\n", + "\n", + "list_of_dfs = []\n", + "for i in range(1, 51):\n", + " df = scrape_books(max_price= max_price, \n", + " min_rating= min_rating, \n", + " page_number=i)\n", + " \n", + " list_of_dfs.append(df)\n", + " print(f\"Scraping page number {i}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "54cc8f93", + "metadata": {}, + "outputs": [], + "source": [ + "full_df = pd.concat(list_of_dfs, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "181d5703", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlepriceratingurlupcavailabilitydescriptiongenre
0Set Me Free17.465https://books.toscrape.com/catalogue/set-me-fr...ce6396b0f23f6eccIn stock (19 available)Aaron Ledbetter’s future had been planned out ...Young Adult
1The Four Agreements: A Practical Guide to Pers...17.665https://books.toscrape.com/catalogue/the-four-...6258a1f6a6dcfe50In stock (18 available)In The Four Agreements, don Miguel Ruiz reveal...Spirituality
2Sophie's World15.945https://books.toscrape.com/catalogue/sophies-w...6be3beb0793a53e7In stock (18 available)A page-turning novel that is also an explorati...Philosophy
3Untitled Collection: Sabbath Poems 201414.274https://books.toscrape.com/catalogue/untitled-...657fe5ead67a7767In stock (16 available)More than thirty-five years ago, when the weat...Poetry
4This One Summer19.494https://books.toscrape.com/catalogue/this-one-...51653ef291ab7ddcIn stock (16 available)Every summer, Rose goes with her mom and dad t...Sequential Art
...........................
70The Zombie Room19.695https://books.toscrape.com/catalogue/the-zombi...9c96cd1329fbd82dIn stock (1 available)An unlikely bond is forged between three men f...Default
71The Silent Wife12.345https://books.toscrape.com/catalogue/the-silen...b78deb463531d078In stock (1 available)A chilling psychological thriller about a marr...Fiction
72The Girl You Lost12.295https://books.toscrape.com/catalogue/the-girl-...4280ac3eab57aa5dIn stock (1 available)Eighteen years ago your baby daughter was snat...Mystery
73The Edge of Reason (Bridget Jones #2)19.184https://books.toscrape.com/catalogue/the-edge-...29fc016c459aeb14In stock (1 available)Monday 27 January“7:15 a.m. Hurrah! The wilder...Womens Fiction
74A Spy's Devotion (The Regency Spies of London #1)16.975https://books.toscrape.com/catalogue/a-spys-de...19fec36a1dfb4c16In stock (1 available)In England’s Regency era, manners and elegance...Historical Fiction
\n", + "

75 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " title price rating \\\n", + "0 Set Me Free 17.46 5 \n", + "1 The Four Agreements: A Practical Guide to Pers... 17.66 5 \n", + "2 Sophie's World 15.94 5 \n", + "3 Untitled Collection: Sabbath Poems 2014 14.27 4 \n", + "4 This One Summer 19.49 4 \n", + ".. ... ... ... \n", + "70 The Zombie Room 19.69 5 \n", + "71 The Silent Wife 12.34 5 \n", + "72 The Girl You Lost 12.29 5 \n", + "73 The Edge of Reason (Bridget Jones #2) 19.18 4 \n", + "74 A Spy's Devotion (The Regency Spies of London #1) 16.97 5 \n", + "\n", + " url upc \\\n", + "0 https://books.toscrape.com/catalogue/set-me-fr... ce6396b0f23f6ecc \n", + "1 https://books.toscrape.com/catalogue/the-four-... 6258a1f6a6dcfe50 \n", + "2 https://books.toscrape.com/catalogue/sophies-w... 6be3beb0793a53e7 \n", + "3 https://books.toscrape.com/catalogue/untitled-... 657fe5ead67a7767 \n", + "4 https://books.toscrape.com/catalogue/this-one-... 51653ef291ab7ddc \n", + ".. ... ... \n", + "70 https://books.toscrape.com/catalogue/the-zombi... 9c96cd1329fbd82d \n", + "71 https://books.toscrape.com/catalogue/the-silen... b78deb463531d078 \n", + "72 https://books.toscrape.com/catalogue/the-girl-... 4280ac3eab57aa5d \n", + "73 https://books.toscrape.com/catalogue/the-edge-... 29fc016c459aeb14 \n", + "74 https://books.toscrape.com/catalogue/a-spys-de... 19fec36a1dfb4c16 \n", + "\n", + " availability \\\n", + "0 In stock (19 available) \n", + "1 In stock (18 available) \n", + "2 In stock (18 available) \n", + "3 In stock (16 available) \n", + "4 In stock (16 available) \n", + ".. ... \n", + "70 In stock (1 available) \n", + "71 In stock (1 available) \n", + "72 In stock (1 available) \n", + "73 In stock (1 available) \n", + "74 In stock (1 available) \n", + "\n", + " description genre \n", + "0 Aaron Ledbetter’s future had been planned out ... Young Adult \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... Spirituality \n", + "2 A page-turning novel that is also an explorati... Philosophy \n", + "3 More than thirty-five years ago, when the weat... Poetry \n", + "4 Every summer, Rose goes with her mom and dad t... Sequential Art \n", + ".. ... ... \n", + "70 An unlikely bond is forged between three men f... Default \n", + "71 A chilling psychological thriller about a marr... Fiction \n", + "72 Eighteen years ago your baby daughter was snat... Mystery \n", + "73 Monday 27 January“7:15 a.m. Hurrah! The wilder... Womens Fiction \n", + "74 In England’s Regency era, manners and elegance... Historical Fiction \n", + "\n", + "[75 rows x 8 columns]" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "full_df" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaff14db", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dae506e", + "metadata": {}, + "outputs": [], + "source": [ + "#alternative -> for loop inside the function\n", + "\n", + "def scrape_books(max_price, min_rating):\n", + "\n", + " list_of_dfs = []\n", + " for page_number in range(1, 51):\n", + "\n", + " url = f\"https://books.toscrape.com/catalogue/page-{page_number}.html\"\n", + " response = requests.get(url)\n", + "\n", + " soup = BeautifulSoup(response.content)\n", + "\n", + " #identify the grid\n", + " grid = soup.find(\"ol\", attrs = {\"class\":\"row\"})\n", + "\n", + " #within the grid, identify all books\n", + " books = grid.find_all(\"article\", attrs = {\"class\":\"product_pod\"})\n", + "\n", + " dict = {}\n", + " index = 0\n", + "\n", + " for book in books:\n", + " title = get_title(book)\n", + " price = get_price(book)\n", + " rating = get_rating(book)\n", + " book_url = get_link(book)\n", + "\n", + "\n", + "\n", + " if max_price >= price and min_rating <= rating:\n", + " #if both conditions above are met, we need to extract more data (UPC, Genre, etc)\n", + " r_book = requests.get(book_url)\n", + " soup_book = BeautifulSoup(r_book.content)\n", + "\n", + " #extra fields to scrap\n", + " upc = get_upc(soup_book)\n", + " availability = get_availability(soup_book)\n", + " description = get_description(soup_book)\n", + " genre = get_genre(soup_book)\n", + "\n", + "\n", + " dict[index] = {\"title\": title,\n", + " \"price\":price,\n", + " \"rating\":rating,\n", + " \"url\": book_url,\n", + " \"upc\":upc,\n", + " \"availability\": availability,\n", + " \"description\":description,\n", + " \"genre\": genre}\n", + " \n", + " index +=1\n", + " else:\n", + " pass\n", + "\n", + " page_df = pd.DataFrame.from_dict(dict, orient = \"index\")\n", + " list_of_dfs.append(page_df)\n", + "\n", + " return pd.concat(list_of_dfs, ignore_index=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "ed2938a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlepriceratingurlupcavailabilitydescriptiongenre
0Set Me Free17.465https://books.toscrape.com/catalogue/set-me-fr...ce6396b0f23f6eccIn stock (19 available)Aaron Ledbetter’s future had been planned out ...Young Adult
1The Four Agreements: A Practical Guide to Pers...17.665https://books.toscrape.com/catalogue/the-four-...6258a1f6a6dcfe50In stock (18 available)In The Four Agreements, don Miguel Ruiz reveal...Spirituality
2Sophie's World15.945https://books.toscrape.com/catalogue/sophies-w...6be3beb0793a53e7In stock (18 available)A page-turning novel that is also an explorati...Philosophy
3Untitled Collection: Sabbath Poems 201414.274https://books.toscrape.com/catalogue/untitled-...657fe5ead67a7767In stock (16 available)More than thirty-five years ago, when the weat...Poetry
4This One Summer19.494https://books.toscrape.com/catalogue/this-one-...51653ef291ab7ddcIn stock (16 available)Every summer, Rose goes with her mom and dad t...Sequential Art
5Thirst17.275https://books.toscrape.com/catalogue/thirst_94...709822d0b5bcb7f4In stock (16 available)On a searing summer Friday, Eddie Chapman has ...Fiction
\n", + "
" + ], + "text/plain": [ + " title price rating \\\n", + "0 Set Me Free 17.46 5 \n", + "1 The Four Agreements: A Practical Guide to Pers... 17.66 5 \n", + "2 Sophie's World 15.94 5 \n", + "3 Untitled Collection: Sabbath Poems 2014 14.27 4 \n", + "4 This One Summer 19.49 4 \n", + "5 Thirst 17.27 5 \n", + "\n", + " url upc \\\n", + "0 https://books.toscrape.com/catalogue/set-me-fr... ce6396b0f23f6ecc \n", + "1 https://books.toscrape.com/catalogue/the-four-... 6258a1f6a6dcfe50 \n", + "2 https://books.toscrape.com/catalogue/sophies-w... 6be3beb0793a53e7 \n", + "3 https://books.toscrape.com/catalogue/untitled-... 657fe5ead67a7767 \n", + "4 https://books.toscrape.com/catalogue/this-one-... 51653ef291ab7ddc \n", + "5 https://books.toscrape.com/catalogue/thirst_94... 709822d0b5bcb7f4 \n", + "\n", + " availability description \\\n", + "0 In stock (19 available) Aaron Ledbetter’s future had been planned out ... \n", + "1 In stock (18 available) In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 In stock (18 available) A page-turning novel that is also an explorati... \n", + "3 In stock (16 available) More than thirty-five years ago, when the weat... \n", + "4 In stock (16 available) Every summer, Rose goes with her mom and dad t... \n", + "5 In stock (16 available) On a searing summer Friday, Eddie Chapman has ... \n", + "\n", + " genre \n", + "0 Young Adult \n", + "1 Spirituality \n", + "2 Philosophy \n", + "3 Poetry \n", + "4 Sequential Art \n", + "5 Fiction " + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scrape_books(20, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e92603f8", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -126,7 +779,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -140,7 +793,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.7" } }, "nbformat": 4,