from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Maps the word used in a book's ``star-rating`` CSS class to its numeric value.
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5,
}

_START_URL = "https://books.toscrape.com/catalogue/page-1.html"

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 10

# Fixed column order, so the result has the same schema even when it is empty.
_COLUMNS = [
    "UPC",
    "Title",
    "Price (£)",
    "Rating",
    "Genre",
    "Availability",
    "Description",
]

# Placeholder stored when a product page carries no description text.
_NO_DESCRIPTION = "No disponible"


def _fetch_soup(session, url):
    """Download *url* through *session* and return the parsed HTML document."""
    response = session.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail here with a clear HTTPError instead of parsing an error page.
    response.raise_for_status()
    # Decode explicitly as UTF-8 so "£" and typographic quotes are not
    # mangled if requests guesses a different charset.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")


def _product_info(book_soup):
    """Return the product-information table as a ``{header: value}`` dict."""
    table = book_soup.find("table", class_="table-striped")
    return {row.th.text: row.td.text for row in table.find_all("tr")}


def _description(book_soup):
    """Return the description paragraph, or the placeholder when absent."""
    heading = book_soup.find("div", id="product_description")
    if heading is None:
        return _NO_DESCRIPTION
    paragraph = heading.find_next_sibling("p")
    if paragraph is None:
        return _NO_DESCRIPTION
    return paragraph.text.strip()


def scrape_books(min_rating, max_price):
    """Scrape books.toscrape.com and return the books matching both filters.

    Every catalogue page is visited in order by following the "next" link.
    A book's detail page is only downloaded when its listing rating is at
    least *min_rating*; the book is kept when its tax-inclusive price is at
    most *max_price*.

    Args:
        min_rating: Minimum star rating (1-5), inclusive.
        max_price: Maximum price in pounds (tax included), inclusive.

    Returns:
        A ``pandas.DataFrame`` with the columns UPC, Title, Price (£),
        Rating, Genre, Availability and Description, one row per book.
        The columns are present even when no book matches.

    Raises:
        requests.RequestException: On a timeout, connection failure, or a
            non-success HTTP status for any page.
    """
    books_data = []

    # One session reuses the connection across all requests to the same host.
    with requests.Session() as session:
        page_url = _START_URL
        while page_url is not None:
            print(f"Scrapeando: {page_url}")
            soup = _fetch_soup(session, page_url)

            for book in soup.select("article.product_pod"):
                # The rating word is the second class, e.g. "star-rating Three".
                rating_class = book.find("p", class_="star-rating")["class"][1]
                rating = rating_map[rating_class]
                if rating < min_rating:
                    continue

                # Links are relative to the current page; let urljoin resolve
                # them instead of concatenating strings by hand.
                book_url = urljoin(page_url, book.find("a")["href"])
                book_soup = _fetch_soup(session, book_url)

                info = _product_info(book_soup)
                price_text = info["Price (incl. tax)"].replace("£", "").strip()
                price = float(price_text)
                if price > max_price:
                    continue

                # The third breadcrumb link is read as the book's genre.
                breadcrumb = book_soup.select("ul.breadcrumb li a")
                books_data.append({
                    "UPC": info.get("UPC"),
                    "Title": book_soup.find("h1").text.strip(),
                    "Price (£)": price,
                    "Rating": rating,
                    "Genre": breadcrumb[2].text.strip(),
                    "Availability": info.get("Availability"),
                    "Description": _description(book_soup),
                })

            # No "next" link means this was the last catalogue page.
            next_link = soup.select_one("li.next a")
            if next_link is None:
                page_url = None
            else:
                page_url = urljoin(page_url, next_link["href"])

    return pd.DataFrame(books_data, columns=_COLUMNS)
https://books.toscrape.com/catalogue/page-12.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-13.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-14.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-15.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-16.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-17.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-18.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-19.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-20.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-21.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-22.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-23.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-24.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-25.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-26.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-27.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-28.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-29.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-30.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-31.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-32.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-33.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-34.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-35.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-36.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-37.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-38.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-39.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-40.html\n", + 
"Scrapeando: https://books.toscrape.com/catalogue/page-41.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-42.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-43.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-44.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-45.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-46.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-47.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-48.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-49.html\n", + "Scrapeando: https://books.toscrape.com/catalogue/page-50.html\n" + ] + }, + { + "data": { + "text/html": [ + "
| \n", + " | UPC | \n", + "Title | \n", + "Price (£) | \n", + "Rating | \n", + "Genre | \n", + "Availability | \n", + "Description | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "ce6396b0f23f6ecc | \n", + "Set Me Free | \n", + "17.46 | \n", + "5 | \n", + "Young Adult | \n", + "In stock (19 available) | \n", + "Aaron Ledbetter’s future had been planned out ... | \n", + "
| 1 | \n", + "6258a1f6a6dcfe50 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "17.66 | \n", + "5 | \n", + "Spirituality | \n", + "In stock (18 available) | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "
| 2 | \n", + "6be3beb0793a53e7 | \n", + "Sophie's World | \n", + "15.94 | \n", + "5 | \n", + "Philosophy | \n", + "In stock (18 available) | \n", + "A page-turning novel that is also an explorati... | \n", + "
| 3 | \n", + "657fe5ead67a7767 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "14.27 | \n", + "4 | \n", + "Poetry | \n", + "In stock (16 available) | \n", + "More than thirty-five years ago, when the weat... | \n", + "
| 4 | \n", + "51653ef291ab7ddc | \n", + "This One Summer | \n", + "19.49 | \n", + "4 | \n", + "Sequential Art | \n", + "In stock (16 available) | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 70 | \n", + "9c96cd1329fbd82d | \n", + "The Zombie Room | \n", + "19.69 | \n", + "5 | \n", + "Default | \n", + "In stock (1 available) | \n", + "An unlikely bond is forged between three men f... | \n", + "
| 71 | \n", + "b78deb463531d078 | \n", + "The Silent Wife | \n", + "12.34 | \n", + "5 | \n", + "Fiction | \n", + "In stock (1 available) | \n", + "A chilling psychological thriller about a marr... | \n", + "
| 72 | \n", + "4280ac3eab57aa5d | \n", + "The Girl You Lost | \n", + "12.29 | \n", + "5 | \n", + "Mystery | \n", + "In stock (1 available) | \n", + "Eighteen years ago your baby daughter was snat... | \n", + "
| 73 | \n", + "29fc016c459aeb14 | \n", + "The Edge of Reason (Bridget Jones #2) | \n", + "19.18 | \n", + "4 | \n", + "Womens Fiction | \n", + "In stock (1 available) | \n", + "Monday 27 January“7:15 a.m. Hurrah! The wilder... | \n", + "
| 74 | \n", + "19fec36a1dfb4c16 | \n", + "A Spy's Devotion (The Regency Spies of London #1) | \n", + "16.97 | \n", + "5 | \n", + "Historical Fiction | \n", + "In stock (1 available) | \n", + "In England’s Regency era, manners and elegance... | \n", + "
75 rows × 7 columns
\n", + "