From 93fa8bbe3d6906718d0b573e445a4964ffba9628 Mon Sep 17 00:00:00 2001 From: claudiasanchez07 Date: Sun, 19 Oct 2025 23:32:41 +0200 Subject: [PATCH] scraping done --- lab-web-scraping.ipynb | 403 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 398 insertions(+), 5 deletions(-) diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb index e552783..4c84056 100644 --- a/lab-web-scraping.ipynb +++ b/lab-web-scraping.ipynb @@ -58,6 +58,401 @@ "- [Books to Scrape](https://books.toscrape.com/)\n" ] }, + { + "cell_type": "code", + "execution_count": 287, + "id": "8c059983", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import pandas as pd " + ] + }, + { + "cell_type": "code", + "execution_count": 288, + "id": "c7a818f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + " All products | Books to Scrape - Sandbox\n", + "" + ] + }, + "execution_count": 288, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url= \"https://books.toscrape.com/\"\n", + "response=requests.get(url)\n", + "response\n", + "soup=BeautifulSoup(response.content)\n", + "soup.find(\"title\")" + ] + }, + { + "cell_type": "code", + "execution_count": 289, + "id": "058cfd40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nA Light in the ...\\n\\n£51.77\\n\\n\\n \\n In stock\\n \\n\\n\\nAdd to basket\\n\\n\\n\\n'" + ] + }, + "execution_count": 289, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid= soup.find(\"ol\",attrs={\"class\":\"row\"})\n", + "products=grid.find_all(\"li\",attrs={\"class\":\"col-xs-6 col-sm-4 col-md-3 col-lg-3\"})\n", + "products[0].get_text()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 290, + "id": "6281ada8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A Light in the ...£51.77 In stock Add to basket\n", + "-----\n", + "Tipping the Velvet£53.74 In stock Add to basket\n", + "-----\n", + "Soumission£50.10 In stock Add to basket\n", + "-----\n", + "Sharp Objects£47.82 In stock Add to basket\n", + "-----\n", + "Sapiens: A Brief History ...£54.23 In stock Add to basket\n", + "-----\n", + "The Requiem Red£22.65 In stock Add to basket\n", + "-----\n", + "The Dirty Little Secrets ...£33.34 In stock Add to basket\n", + "-----\n", + "The Coming Woman: A ...£17.93 In stock Add to basket\n", + "-----\n", + "The Boys in the ...£22.60 In stock Add to basket\n", + "-----\n", + "The Black Maria£52.15 In stock Add to basket\n", + "-----\n", + "Starving Hearts (Triangular Trade ...£13.99 In stock Add to basket\n", + "-----\n", + "Shakespeare's Sonnets£20.66 In stock Add to basket\n", + "-----\n", + "Set Me Free£17.46 In stock Add to basket\n", + "-----\n", + "Scott Pilgrim's Precious Little ...£52.29 In stock Add to basket\n", + "-----\n", + "Rip it Up and ...£35.02 In stock Add to basket\n", + "-----\n", + "Our Band Could Be ...£57.25 In stock Add to basket\n", + "-----\n", + "Olio£23.88 In stock Add to basket\n", + "-----\n", + "Mesaerion: The Best Science ...£37.59 In stock Add to basket\n", + "-----\n", + "Libertarianism for Beginners£51.33 In stock Add to basket\n", + "-----\n", + "It's Only the Himalayas£45.17 In stock Add to basket\n", + "-----\n" + ] + } + ], + "source": [ + "for prod in products:\n", + " print(prod.get_text().strip().replace(\" \",\" \").replace(\"\\n\",\"\"))\n", + " print(\"-----\")" + ] + }, + { + "cell_type": "code", + "execution_count": 291, + "id": "c550b367", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 291, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products[0].find(\"a\",attrs={\"title\":\"A Light in the Attic\"}).get_text()\n", + "def get_title(product_item):\n", + " title=product_item.find(\"h3\").find(\"a\").get_text()\n", + " return title\n", + "get_title" + ] + }, + { + "cell_type": "code", + "execution_count": 292, + "id": "f8bedf16", + "metadata": {}, + "outputs": [], + "source": [ + "products[0].find(\"p\",attrs={\"class\":\"price_color\"}).get_text()\n", + "def get_price(product_item):\n", + " price=product_item.find(\"p\",attrs={\"class\":\"price_color\"}).get_text()\n", + " price=float(price.replace(\"£\", \"\"))\n", + " return price" + ] + }, + { + "cell_type": "code", + "execution_count": 293, + "id": "27e5f1c1", + "metadata": {}, + "outputs": [], + "source": [ + "products[0].find(\"p\",attrs={\"class\":\"instock availability\"}).get_text()\n", + "def get_availability(product_item):\n", + " available=product_item.find(\"p\",attrs={\"class\":\"instock availability\"}).get_text()\n", + " return available" + ] + }, + { + "cell_type": "code", + "execution_count": 294, + "id": "cbf9021f", + "metadata": {}, + "outputs": [], + "source": [ + "products[0].find(\"p\",attrs={\"class\":\"star-rating\"}).get_text()\n", + "def get_star(product_item):\n", + "\n", + " stars=product_item.find(\"p\",class_=\"star-rating\")\n", + " rating_map = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n", + " if stars:\n", + " \n", + " for key, val in rating_map.items():\n", + " \n", + " if key in stars[\"class\"]:\n", + " return val\n", + " return \"No ratting\"" + ] + }, + { + "cell_type": "code", + "execution_count": 295, + "id": "04076649", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'" + ] + }, + "execution_count": 295, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_url(product_item):\n", + " domain=\"https://books.toscrape.com/\"\n", + " partial_url=product_item.find(\"a\")[\"href\"]\n", + " link_total= domain+partial_url\n", + " return link_total\n", + "get_url(products[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "485532f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--\n", + "It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. ...more\n" + ] + } + ], + "source": [ + "\n", + "def get_description (soup1):\n", + " \n", + " description=soup1.find(\"div\",id=\"content_inner\")\n", + " \n", + " description1=description.find_all(\"p\")\n", + " \n", + " \n", + "\n", + " return description1[3].text\n", + "prod_link=get_url(products[0])\n", + "response_1=requests.get(prod_link)\n", + "soup1=BeautifulSoup(response_1.content)\n", + "print(get_description(soup1))" + ] + }, + { + "cell_type": "code", + "execution_count": 297, + "id": "31944388", + "metadata": {}, + "outputs": [], + "source": [ + "def get_upc(soup1):\n", + " upc_table=soup1.find(\"table\",attrs={\"class\",\"table table-striped\"})\n", + " upc=upc_table.find(\"td\")\n", + " return upc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad0a9272", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n", + "--\n", + "---------------\n" + ] + } + ], + "source": [ + "\n", + "data=[]\n", + "for prod in products:\n", + " prod_link=get_url(prod)\n", + " response_1=requests.get(prod_link)\n", + " soup1=BeautifulSoup(response_1.content)\n", + " title=get_title(prod)\n", + " price=get_price(prod)\n", + " stock=get_availability(prod)\n", + " link=get_url(prod)\n", + " upc=get_upc(soup1)\n", + " rating=get_star(prod)\n", + " description=get_description(soup1)\n", + " \n", + " data.append({\n", + " \"Título\": get_title(prod),\n", + " \"Precio (£)\": price,\n", + " \"Disponibilidad\": get_availability(prod),\n", + " \"Estrellas\": rating,\n", + " \"UPC\": upc,\n", + " \"Descripción\": description,\n", + " \"URL\": prod_link\n", + " })\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 301, + "id": "d1fbf9b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Libros encontrados: 20\n", + " Título Precio (£) Estrellas\n", + "0 A Light in the ... 51.77 3\n", + "1 Tipping the Velvet 53.74 1\n", + "2 Soumission 50.10 1\n", + "3 Sharp Objects 47.82 4\n", + "4 Sapiens: A Brief History ... 54.23 5\n", + "5 The Requiem Red 22.65 1\n", + "6 The Dirty Little Secrets ... 33.34 4\n", + "7 The Coming Woman: A ... 17.93 3\n", + "8 The Boys in the ... 22.60 4\n", + "9 The Black Maria 52.15 1\n", + "10 Starving Hearts (Triangular Trade ... 13.99 2\n", + "11 Shakespeare's Sonnets 20.66 4\n", + "12 Set Me Free 17.46 5\n", + "13 Scott Pilgrim's Precious Little ... 52.29 5\n", + "14 Rip it Up and ... 35.02 5\n", + "15 Our Band Could Be ... 57.25 3\n", + "16 Olio 23.88 1\n", + "17 Mesaerion: The Best Science ... 37.59 1\n", + "18 Libertarianism for Beginners 51.33 2\n", + "19 It's Only the Himalayas 45.17 2\n" + ] + } + ], + "source": [ + "df = pd.DataFrame(data)\n", + "print(f\"✅ Libros encontrados: {len(df)}\")\n", + "print(df[[\"Título\", \"Precio (£)\", \"Estrellas\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db0b0a7e", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "3519921d-5890-445b-9a33-934ed8ee378c", @@ -116,9 +511,7 @@ "id": "40359eee-9cd7-4884-bfa4-83344c222305" }, "outputs": [], - "source": [ - "# Your solution goes here" - ] + "source": [] } ], "metadata": { @@ -126,7 +519,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -140,7 +533,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,