Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
355 changes: 347 additions & 8 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,353 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"execution_count": 7,
"id": "9be10ca2",
"metadata": {},
"outputs": [],
"source": [
"# Your solution goes here"
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"\n",
"rating_map = {\n",
" \"One\": 1,\n",
" \"Two\": 2,\n",
" \"Three\": 3,\n",
" \"Four\": 4,\n",
" \"Five\": 5\n",
"}\n",
"\n",
"def scrape_books(min_rating, max_price):\n",
" base_url = \"https://books.toscrape.com/catalogue/\"\n",
" page_url = \"https://books.toscrape.com/catalogue/page-1.html\"\n",
" \n",
" books_data = []\n",
"\n",
" while True:\n",
" print(f\"Scrapeando: {page_url}\")\n",
" response = requests.get(page_url)\n",
" response.encoding = 'utf-8' # ← FIX IMPORTANTE\n",
" soup = BeautifulSoup(response.text, \"html.parser\")\n",
"\n",
" books = soup.select(\"article.product_pod\")\n",
"\n",
" for book in books:\n",
" rating_class = book.find(\"p\", class_=\"star-rating\")[\"class\"][1]\n",
" rating = rating_map[rating_class]\n",
"\n",
" if rating < min_rating:\n",
" continue\n",
"\n",
" book_url = base_url + book.find(\"a\")[\"href\"].replace(\"../\", \"\")\n",
"\n",
" book_resp = requests.get(book_url)\n",
" book_resp.encoding = 'utf-8' # ← FIX IMPORTANTE\n",
" book_soup = BeautifulSoup(book_resp.text, \"html.parser\")\n",
"\n",
" title = book_soup.find(\"h1\").text.strip()\n",
"\n",
" table = book_soup.find(\"table\", class_=\"table-striped\")\n",
" rows = {row.th.text: row.td.text for row in table.find_all(\"tr\")}\n",
" upc = rows.get(\"UPC\")\n",
"\n",
" price_text = rows.get(\"Price (incl. tax)\").replace(\"£\", \"\").strip()\n",
" price = float(price_text)\n",
"\n",
" if price > max_price:\n",
" continue\n",
"\n",
" availability = rows.get(\"Availability\")\n",
"\n",
" genre = book_soup.select(\"ul.breadcrumb li a\")[2].text.strip()\n",
"\n",
" desc_tag = book_soup.find(\"div\", id=\"product_description\")\n",
" if desc_tag:\n",
" description = desc_tag.find_next_sibling(\"p\").text.strip()\n",
" else:\n",
" description = \"No disponible\"\n",
"\n",
" books_data.append({\n",
" \"UPC\": upc,\n",
" \"Title\": title,\n",
" \"Price (£)\": price,\n",
" \"Rating\": rating,\n",
" \"Genre\": genre,\n",
" \"Availability\": availability,\n",
" \"Description\": description\n",
" })\n",
"\n",
" next_button = soup.find(\"li\", class_=\"next\")\n",
" if not next_button:\n",
" break\n",
"\n",
" next_page = next_button.find(\"a\")[\"href\"]\n",
" page_url = \"https://books.toscrape.com/catalogue/\" + next_page\n",
"\n",
" return pd.DataFrame(books_data)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6b29dbf2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Scrapeando: https://books.toscrape.com/catalogue/page-1.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-2.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-3.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-4.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-5.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-6.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-7.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-8.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-9.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-10.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-11.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-12.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-13.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-14.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-15.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-16.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-17.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-18.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-19.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-20.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-21.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-22.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-23.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-24.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-25.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-26.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-27.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-28.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-29.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-30.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-31.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-32.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-33.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-34.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-35.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-36.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-37.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-38.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-39.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-40.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-41.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-42.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-43.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-44.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-45.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-46.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-47.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-48.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-49.html\n",
"Scrapeando: https://books.toscrape.com/catalogue/page-50.html\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>UPC</th>\n",
" <th>Title</th>\n",
" <th>Price (£)</th>\n",
" <th>Rating</th>\n",
" <th>Genre</th>\n",
" <th>Availability</th>\n",
" <th>Description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>Set Me Free</td>\n",
" <td>17.46</td>\n",
" <td>5</td>\n",
" <td>Young Adult</td>\n",
" <td>In stock (19 available)</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6258a1f6a6dcfe50</td>\n",
" <td>The Four Agreements: A Practical Guide to Pers...</td>\n",
" <td>17.66</td>\n",
" <td>5</td>\n",
" <td>Spirituality</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>In The Four Agreements, don Miguel Ruiz reveal...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6be3beb0793a53e7</td>\n",
" <td>Sophie's World</td>\n",
" <td>15.94</td>\n",
" <td>5</td>\n",
" <td>Philosophy</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>A page-turning novel that is also an explorati...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>657fe5ead67a7767</td>\n",
" <td>Untitled Collection: Sabbath Poems 2014</td>\n",
" <td>14.27</td>\n",
" <td>4</td>\n",
" <td>Poetry</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>More than thirty-five years ago, when the weat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>51653ef291ab7ddc</td>\n",
" <td>This One Summer</td>\n",
" <td>19.49</td>\n",
" <td>4</td>\n",
" <td>Sequential Art</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>Every summer, Rose goes with her mom and dad t...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>9c96cd1329fbd82d</td>\n",
" <td>The Zombie Room</td>\n",
" <td>19.69</td>\n",
" <td>5</td>\n",
" <td>Default</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>An unlikely bond is forged between three men f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>b78deb463531d078</td>\n",
" <td>The Silent Wife</td>\n",
" <td>12.34</td>\n",
" <td>5</td>\n",
" <td>Fiction</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>A chilling psychological thriller about a marr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>4280ac3eab57aa5d</td>\n",
" <td>The Girl You Lost</td>\n",
" <td>12.29</td>\n",
" <td>5</td>\n",
" <td>Mystery</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Eighteen years ago your baby daughter was snat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>29fc016c459aeb14</td>\n",
" <td>The Edge of Reason (Bridget Jones #2)</td>\n",
" <td>19.18</td>\n",
" <td>4</td>\n",
" <td>Womens Fiction</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Monday 27 January“7:15 a.m. Hurrah! The wilder...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>19fec36a1dfb4c16</td>\n",
" <td>A Spy's Devotion (The Regency Spies of London #1)</td>\n",
" <td>16.97</td>\n",
" <td>5</td>\n",
" <td>Historical Fiction</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>In England’s Regency era, manners and elegance...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>75 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" UPC Title \\\n",
"0 ce6396b0f23f6ecc Set Me Free \n",
"1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n",
"2 6be3beb0793a53e7 Sophie's World \n",
"3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n",
"4 51653ef291ab7ddc This One Summer \n",
".. ... ... \n",
"70 9c96cd1329fbd82d The Zombie Room \n",
"71 b78deb463531d078 The Silent Wife \n",
"72 4280ac3eab57aa5d The Girl You Lost \n",
"73 29fc016c459aeb14 The Edge of Reason (Bridget Jones #2) \n",
"74 19fec36a1dfb4c16 A Spy's Devotion (The Regency Spies of London #1) \n",
"\n",
" Price (£) Rating Genre Availability \\\n",
"0 17.46 5 Young Adult In stock (19 available) \n",
"1 17.66 5 Spirituality In stock (18 available) \n",
"2 15.94 5 Philosophy In stock (18 available) \n",
"3 14.27 4 Poetry In stock (16 available) \n",
"4 19.49 4 Sequential Art In stock (16 available) \n",
".. ... ... ... ... \n",
"70 19.69 5 Default In stock (1 available) \n",
"71 12.34 5 Fiction In stock (1 available) \n",
"72 12.29 5 Mystery In stock (1 available) \n",
"73 19.18 4 Womens Fiction In stock (1 available) \n",
"74 16.97 5 Historical Fiction In stock (1 available) \n",
"\n",
" Description \n",
"0 Aaron Ledbetter’s future had been planned out ... \n",
"1 In The Four Agreements, don Miguel Ruiz reveal... \n",
"2 A page-turning novel that is also an explorati... \n",
"3 More than thirty-five years ago, when the weat... \n",
"4 Every summer, Rose goes with her mom and dad t... \n",
".. ... \n",
"70 An unlikely bond is forged between three men f... \n",
"71 A chilling psychological thriller about a marr... \n",
"72 Eighteen years ago your baby daughter was snat... \n",
"73 Monday 27 January“7:15 a.m. Hurrah! The wilder... \n",
"74 In England’s Regency era, manners and elegance... \n",
"\n",
"[75 rows x 7 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"df = scrape_books(min_rating=4, max_price=20)\n",
"df\n",
"\n"
]
}
],
Expand All @@ -126,7 +465,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +479,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down