# Maps the word used in the listing's "star-rating <Word>" CSS class to its
# numeric value.
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5,
}

# Column order of the DataFrame returned by scrape_books. Passing it explicitly
# keeps the schema stable even when no book matches the filters.
_BOOK_COLUMNS = [
    "UPC",
    "Title",
    "Price (£)",
    "Rating",
    "Genre",
    "Availability",
    "Description",
]


def _fetch_soup(session, url, timeout):
    """Download *url* with *session* and return it parsed by BeautifulSoup.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = session.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as catalogue HTML.
    response.raise_for_status()
    # Decode explicitly as UTF-8 so the "£" sign and typographic quotes in the
    # page text are not garbled by a wrongly guessed response encoding.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")


def _listing_rating(book):
    """Return the numeric star rating of a listing entry, or 0 if unknown.

    The rating word is looked up among all CSS classes of the
    ``p.star-rating`` tag rather than assuming it is the second one.
    """
    tag = book.find("p", class_="star-rating")
    classes = tag.get("class", []) if tag is not None else []
    return next((rating_map[c] for c in classes if c in rating_map), 0)


def _parse_book_page(book_soup):
    """Extract the detail-page fields of one book into a dict (no rating)."""
    table = book_soup.find("table", class_="table-striped")
    # Product-information table: header cell -> value cell.
    rows = {row.th.text: row.td.text for row in table.find_all("tr")}

    price_text = rows["Price (incl. tax)"].replace("£", "").strip()

    # The third breadcrumb link is used as the genre.
    genre = book_soup.select("ul.breadcrumb li a")[2].text.strip()

    # The description is the <p> that follows the "product_description"
    # heading; some pages have neither.
    desc_header = book_soup.find("div", id="product_description")
    desc_para = (
        desc_header.find_next_sibling("p") if desc_header is not None else None
    )
    description = (
        desc_para.text.strip() if desc_para is not None else "No disponible"
    )

    return {
        "UPC": rows.get("UPC"),
        "Title": book_soup.find("h1").text.strip(),
        "Price (£)": float(price_text),
        "Genre": genre,
        "Availability": rows.get("Availability"),
        "Description": description,
    }


def scrape_books(min_rating, max_price, timeout=10):
    """Scrape books.toscrape.com and return the books matching the filters.

    Walks the paginated catalogue starting at ``page-1.html``. For every
    listing entry whose star rating is at least *min_rating*, the book's
    detail page is fetched; the book is kept when its tax-inclusive price is
    at most *max_price*.

    Args:
        min_rating: Minimum star rating (1-5) a book must have.
        max_price: Maximum "Price (incl. tax)" in pounds a book may have.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A pandas DataFrame with the columns UPC, Title, Price (£), Rating,
        Genre, Availability and Description, one row per matching book.
        The columns are present even when no book matches.

    Raises:
        requests.RequestException: On connection errors, timeouts or HTTP
            error statuses.
        KeyError: If a detail page has no "Price (incl. tax)" row.
    """
    from urllib.parse import urljoin

    page_url = "https://books.toscrape.com/catalogue/page-1.html"
    books_data = []

    # One session for the whole crawl so connections are reused across the
    # many listing and detail requests.
    with requests.Session() as session:
        while True:
            print(f"Scrapeando: {page_url}")
            soup = _fetch_soup(session, page_url, timeout)

            for book in soup.select("article.product_pod"):
                rating = _listing_rating(book)
                # Filter on rating first: it is available on the listing, so
                # low-rated books never cost a detail-page request.
                if rating < min_rating:
                    continue

                # Resolve the (relative) link against the page it came from.
                book_url = urljoin(page_url, book.find("a")["href"])
                record = _parse_book_page(
                    _fetch_soup(session, book_url, timeout)
                )

                if record["Price (£)"] > max_price:
                    continue

                record["Rating"] = rating
                books_data.append(record)

            next_button = soup.find("li", class_="next")
            if not next_button:
                break

            page_url = urljoin(page_url, next_button.find("a")["href"])

    return pd.DataFrame(books_data, columns=_BOOK_COLUMNS)
https://books.toscrape.com/catalogue/page-12.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-13.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-14.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-15.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-16.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-17.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-18.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-19.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-20.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-21.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-22.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-23.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-24.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-25.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-26.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-27.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-28.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-29.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-30.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-31.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-32.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-33.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-34.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-35.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-36.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-37.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-38.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-39.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-40.html\n", + 
"Scrapeando: https://books.toscrape.com/catalogue/page-41.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-42.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-43.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-44.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-45.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-46.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-47.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-48.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-49.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-50.html\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UPCTitlePrice (£)RatingGenreAvailabilityDescription
0ce6396b0f23f6eccSet Me Free17.465Young AdultIn stock (19 available)Aaron Ledbetter’s future had been planned out ...
16258a1f6a6dcfe50The Four Agreements: A Practical Guide to Pers...17.665SpiritualityIn stock (18 available)In The Four Agreements, don Miguel Ruiz reveal...
26be3beb0793a53e7Sophie's World15.945PhilosophyIn stock (18 available)A page-turning novel that is also an explorati...
3657fe5ead67a7767Untitled Collection: Sabbath Poems 201414.274PoetryIn stock (16 available)More than thirty-five years ago, when the weat...
451653ef291ab7ddcThis One Summer19.494Sequential ArtIn stock (16 available)Every summer, Rose goes with her mom and dad t...
........................
709c96cd1329fbd82dThe Zombie Room19.695DefaultIn stock (1 available)An unlikely bond is forged between three men f...
71b78deb463531d078The Silent Wife12.345FictionIn stock (1 available)A chilling psychological thriller about a marr...
724280ac3eab57aa5dThe Girl You Lost12.295MysteryIn stock (1 available)Eighteen years ago your baby daughter was snat...
7329fc016c459aeb14The Edge of Reason (Bridget Jones #2)19.184Womens FictionIn stock (1 available)Monday 27 January“7:15 a.m. Hurrah! The wilder...
7419fec36a1dfb4c16A Spy's Devotion (The Regency Spies of London #1)16.975Historical FictionIn stock (1 available)In England’s Regency era, manners and elegance...
\n", + "

75 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " UPC Title \\\n", + "0 ce6396b0f23f6ecc Set Me Free \n", + "1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n", + "2 6be3beb0793a53e7 Sophie's World \n", + "3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n", + "4 51653ef291ab7ddc This One Summer \n", + ".. ... ... \n", + "70 9c96cd1329fbd82d The Zombie Room \n", + "71 b78deb463531d078 The Silent Wife \n", + "72 4280ac3eab57aa5d The Girl You Lost \n", + "73 29fc016c459aeb14 The Edge of Reason (Bridget Jones #2) \n", + "74 19fec36a1dfb4c16 A Spy's Devotion (The Regency Spies of London #1) \n", + "\n", + " Price (£) Rating Genre Availability \\\n", + "0 17.46 5 Young Adult In stock (19 available) \n", + "1 17.66 5 Spirituality In stock (18 available) \n", + "2 15.94 5 Philosophy In stock (18 available) \n", + "3 14.27 4 Poetry In stock (16 available) \n", + "4 19.49 4 Sequential Art In stock (16 available) \n", + ".. ... ... ... ... \n", + "70 19.69 5 Default In stock (1 available) \n", + "71 12.34 5 Fiction In stock (1 available) \n", + "72 12.29 5 Mystery In stock (1 available) \n", + "73 19.18 4 Womens Fiction In stock (1 available) \n", + "74 16.97 5 Historical Fiction In stock (1 available) \n", + "\n", + " Description \n", + "0 Aaron Ledbetter’s future had been planned out ... \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 A page-turning novel that is also an explorati... \n", + "3 More than thirty-five years ago, when the weat... \n", + "4 Every summer, Rose goes with her mom and dad t... \n", + ".. ... \n", + "70 An unlikely bond is forged between three men f... \n", + "71 A chilling psychological thriller about a marr... \n", + "72 Eighteen years ago your baby daughter was snat... \n", + "73 Monday 27 January“7:15 a.m. Hurrah! The wilder... \n", + "74 In England’s Regency era, manners and elegance... 
\n", + "\n", + "[75 rows x 7 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "df = scrape_books(min_rating=4, max_price=20)\n", + "df\n", + "\n" ] } ], @@ -126,7 +465,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -140,7 +479,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.9" } }, "nbformat": 4,