diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb index e552783..dd42a53 100644 --- a/lab-web-scraping.ipynb +++ b/lab-web-scraping.ipynb @@ -110,14 +110,112 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "40359eee-9cd7-4884-bfa4-83344c222305", "metadata": { "id": "40359eee-9cd7-4884-bfa4-83344c222305" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " UPC Title \\\n", + "0 ce6396b0f23f6ecc Set Me Free \n", + "1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n", + "2 6be3beb0793a53e7 Sophie's World \n", + "3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n", + "4 51653ef291ab7ddc This One Summer \n", + "\n", + " Price (£) Rating Genre Availability \\\n", + "0 17.46 5 Young Adult In stock (19 available) \n", + "1 17.66 5 Spirituality In stock (18 available) \n", + "2 15.94 5 Philosophy In stock (18 available) \n", + "3 14.27 4 Poetry In stock (16 available) \n", + "4 19.49 4 Sequential Art In stock (16 available) \n", + "\n", + " Description \n", + "0 Aaron Ledbetter’s future had been planned ou... \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 A page-turning novel that is also an explorati... \n", + "3 More than thirty-five years ago, when the weat... \n", + "4 Every summer, Rose goes with her mom and dad t... 
import re
import textwrap
import time
from urllib.parse import urljoin

import pandas as pd

# Listing pages live under this prefix; detail-page links are relative to it.
CATALOGUE_URL = 'http://books.toscrape.com/catalogue/'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Maps the rating word found in the "star-rating" CSS classes to a number.
rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# Column order of the returned DataFrame (kept even when no book matches).
BOOK_COLUMNS = [
    'UPC',
    'Title',
    'Price (£)',
    'Rating',
    'Genre',
    'Availability',
    'Description',
]

# First decimal number in a price label such as "£51.77".
_PRICE_RE = re.compile(r'\d+(?:\.\d+)?')


def _fetch_soup(url):
    """Download *url* and return the page parsed with BeautifulSoup.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no answer arrives within REQUEST_TIMEOUT seconds.
    """
    # Imported here so that this module can be imported (and its pure
    # parsing helpers used) without the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Hand the raw bytes to BeautifulSoup instead of ``response.text``: the
    # previous run decoded the page with the wrong charset, which produced
    # mojibake in the descriptions ("â€™") and a two-character "Â£" prefix
    # on prices. From bytes, the parser works out the encoding itself.
    return BeautifulSoup(response.content, 'html.parser')


def _parse_price(text):
    """Return the numeric price contained in *text* (e.g. "£51.77")."""
    match = _PRICE_RE.search(text)
    if match is None:
        raise ValueError(f'No price found in {text!r}')
    return float(match.group())


def _parse_rating(book):
    """Return the 1-5 rating encoded in a listing entry's CSS classes."""
    classes = book.find('p', class_='star-rating')['class']
    # Look the rating word up by value rather than by position, so the
    # order of the CSS classes does not matter.
    for name in classes:
        if name in rating_map:
            return rating_map[name]
    raise ValueError(f'No rating word found in classes {classes!r}')


def _scrape_details(detail_url):
    """Return the UPC, genre, availability and description of one book."""
    soup = _fetch_soup(detail_url)

    desc_tag = soup.find('div', id='product_description')
    if desc_tag is not None:
        description = desc_tag.find_next('p').text.strip()
    else:
        description = 'No description available'

    return {
        'UPC': soup.find('th', string='UPC').find_next('td').text,
        # The third breadcrumb entry is the book's category.
        'Genre': soup.select('ul.breadcrumb li')[2].text.strip(),
        'Availability': soup.find(
            'p', class_='instock availability'
        ).text.strip(),
        # Wrap the description so it reads nicely when printed.
        'Description': textwrap.fill(description, width=60),
    }


def scrape_books(min_rating, max_price, *, max_pages=50, delay=0.5):
    """Scrape books.toscrape.com and return the books matching the filters.

    Every listing page is fetched; the detail page is fetched only for
    books that pass both filters.

    Args:
        min_rating: Minimum rating (1-5) a book must have, inclusive.
        max_price: Maximum price a book may have, inclusive.
        max_pages: Number of listing pages to visit, starting at page 1.
            Defaults to 50, the value the original code hard-coded.
        delay: Seconds to sleep after each detail-page request so the
            server is not flooded.

    Returns:
        A pandas DataFrame with the columns listed in BOOK_COLUMNS, one row
        per matching book. The columns are present even when nothing matches.

    Raises:
        requests.RequestException: if a page cannot be downloaded.
        ValueError: if a price or rating cannot be parsed from a listing.
    """
    all_books = []

    for page in range(1, max_pages + 1):
        page_url = urljoin(CATALOGUE_URL, f'page-{page}.html')
        listing = _fetch_soup(page_url)

        for book in listing.find_all('article', class_='product_pod'):
            price = _parse_price(book.find('p', class_='price_color').text)
            rating = _parse_rating(book)

            # Skip books outside the requested range before paying for
            # the extra detail-page request.
            if not (rating >= min_rating and price <= max_price):
                continue

            link = book.h3.a
            # Detail links are relative to the listing page they appear on.
            details = _scrape_details(urljoin(page_url, link['href']))

            all_books.append({
                'UPC': details['UPC'],
                'Title': link['title'],
                'Price (£)': price,
                'Rating': rating,
                'Genre': details['Genre'],
                'Availability': details['Availability'],
                'Description': details['Description'],
            })

            time.sleep(delay)

    return pd.DataFrame(all_books, columns=BOOK_COLUMNS)


# Example usage. In a notebook ``__name__`` is "__main__", so this still
# runs there, but importing the module no longer triggers a full scrape.
if __name__ == "__main__":
    books_df = scrape_books(min_rating=4, max_price=20)
    print(books_df.head())  # Show the first few books