# %%
# All imports for the notebook live in this first cell.
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# %%
# Fetch the catalogue landing page once; the exploration cells that follow
# reuse `response` instead of hitting the site again.
url = "https://books.toscrape.com/"
response = requests.get(url, timeout=10)  # requests waits forever without a timeout
response.raise_for_status()  # stop here on 4xx/5xx instead of parsing an error page
response
\\n
\\n
\\n
Books to Scrape We love being scraped!\\n
\\n\\n \\n
\\n
\\n
\\n\\n \\n \\n
\\n
\\n \\n
    \\n
  • \\n Home\\n
  • \\n
  • All products
  • \\n
\\n\\n
\\n\\n \\n\\n
\\n \\n
\\n

All products

\\n
\\n \\n\\n \\n\\n\\n\\n
\\n\\n
\\n\\n\\n
\\n \\n
\\n\\n \\n
\\n \\n
\\n \\n \\n
\\n\\n \\n \\n \\n 1000 results - showing 1 to 20.\\n \\n \\n \\n \\n
\\n \\n
\\n
Warning! This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning.
\\n\\n
\\n
    \\n \\n
  1. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"A\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    A Light in the ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa351.77

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  2. \\n \\n
  3. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Tipping\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Tipping the Velvet

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa353.74

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  4. \\n \\n
  5. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Soumission\"\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Soumission

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa350.10

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  6. \\n \\n
  7. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Sharp\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Sharp Objects

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa347.82

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  8. \\n \\n
  9. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Sapiens:\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Sapiens: A Brief History ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa354.23

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  10. \\n \\n
  11. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"The\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    The Requiem Red

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa322.65

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  12. \\n \\n
  13. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"The\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    The Dirty Little Secrets ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa333.34

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  14. \\n \\n
  15. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"The\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    The Coming Woman: A ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa317.93

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  16. \\n \\n
  17. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"The\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    The Boys in the ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa322.60

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  18. \\n \\n
  19. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"The\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    The Black Maria

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa352.15

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  20. \\n \\n
  21. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Starving\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Starving Hearts (Triangular Trade ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa313.99

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  22. \\n \\n
  23. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Shakespeare's\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Shakespeare's Sonnets

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa320.66

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  24. \\n \\n
  25. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Set\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Set Me Free

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa317.46

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  26. \\n \\n
  27. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Scott\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Scott Pilgrim's Precious Little ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa352.29

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  28. \\n \\n
  29. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Rip\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Rip it Up and ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa335.02

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  30. \\n \\n
  31. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Our\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Our Band Could Be ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa357.25

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  32. \\n \\n
  33. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Olio\"\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Olio

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa323.88

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  34. \\n \\n
  35. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Mesaerion:\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Mesaerion: The Best Science ...

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa337.59

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  36. \\n \\n
  37. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"Libertarianism\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    Libertarianism for Beginners

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa351.33

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  38. \\n \\n
  39. \\n\\n\\n\\n\\n\\n\\n
    \\n \\n
    \\n \\n \\n \"It's\\n \\n \\n
    \\n \\n\\n \\n \\n

    \\n \\n \\n \\n \\n \\n

    \\n \\n \\n\\n \\n

    It's Only the Himalayas

    \\n \\n\\n \\n
    \\n \\n\\n\\n\\n\\n\\n\\n \\n

    \\xc2\\xa345.17

    \\n \\n\\n

    \\n \\n \\n In stock\\n \\n

    \\n\\n \\n \\n\\n\\n\\n\\n\\n\\n \\n
    \\n \\n
    \\n\\n\\n \\n
    \\n \\n
    \\n\\n
  40. \\n \\n
\\n \\n\\n\\n\\n
\\n
    \\n \\n
  • \\n \\n Page 1 of 50\\n \\n
  • \\n \\n
  • next
  • \\n \\n
\\n
\\n\\n\\n
\\n
\\n \\n\\n\\n
\\n\\n
\\n
\\n
# %%
# Peek at the raw body. Only the first bytes are shown: the full page is about
# 50 KB and printing all of it buries the rest of the notebook.
response.content[:500]

# %%
# Headers returned by the server.
response.headers

# %%
# MIME type of the payload.
print(response.headers['Content-Type'])

# %%
# Parse the raw HTML bytes into a navigable BeautifulSoup tree.
soup = BeautifulSoup(response.content, "html.parser")

# %%
# Every book on a listing page is an <article class="product_pod">;
# count how many the landing page shows.
books = soup.find_all("article", class_="product_pod")
print(len(books))
"execution_count": 7, + "id": "59933ca0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "
\n", + "
\n", + " \n", + " \"A\n", + " \n", + "
\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + "

\n", + " \n", + " A Light in the ...\n", + " \n", + "

\n", + "
\n", + "

\n", + " £51.77\n", + "

\n", + "

\n", + " \n", + " \n", + " In stock\n", + "

\n", + "
\n", + " \n", + "
\n", + "
\n", + "
# %%
def scrape_books(min_rating: int = 4, max_price: float = 20.0) -> pd.DataFrame:
    """Scrape books.toscrape.com and return the books that pass both filters.

    Starts at the first catalogue listing page and follows each page's
    "next" link until a page has none, so the number of pages is not
    hard-coded.

    Args:
        min_rating: Lowest star rating to keep (inclusive), 0-5.
        max_price: Highest price to keep (inclusive), in pounds.

    Returns:
        A DataFrame with the columns ``Title``, ``Price`` (float) and
        ``Rating`` (int), one row per matching book. The columns are present
        even when no book matches.

    Raises:
        requests.RequestException: If a page cannot be fetched (timeout,
            connection error, or an HTTP error status).
        ValueError: If a price label contains no number.
    """
    import re  # local import: only the price parsing below needs it

    columns = ['Title', 'Price', 'Rating']
    # The star rating is encoded as a CSS class word next to "star-rating".
    rating_by_word = {
        word: stars
        for stars, word in enumerate(['Zero', 'One', 'Two', 'Three', 'Four', 'Five'])
    }
    books = []

    url = "https://books.toscrape.com/catalogue/page-1.html"
    while url is not None:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for item in soup.find_all('article', class_='product_pod'):
            title = item.h3.a['title']

            # Pull the number out of the label rather than slicing off one
            # character, so a multi-character currency prefix cannot break it.
            price_text = item.find('p', class_='price_color').text
            number = re.search(r'\d+(?:\.\d+)?', price_text)
            if number is None:
                raise ValueError(f'no price found in {price_text!r}')
            price = float(number.group())

            # Look the rating word up by value, not by its position in the class list.
            classes = item.find('p', class_='star-rating')['class']
            rating = next((rating_by_word[c] for c in classes if c in rating_by_word), 0)

            if rating >= min_rating and price <= max_price:
                books.append({'Title': title, 'Price': price, 'Rating': rating})

        # The last listing page has no <li class="next">; that ends the walk.
        next_item = soup.find('li', class_='next')
        url = urljoin(url, next_item.a['href']) if next_item is not None else None

    return pd.DataFrame(books, columns=columns)


# %%
# Demo call. Guarded so that importing this code as a module does not start a
# full scrape; inside a notebook kernel __name__ is "__main__", so it still runs.
if __name__ == "__main__":
    print(scrape_books(4, 20))
# %%
def scrape_books_detailed(min_rating: int = 4, max_price: float = 20.0) -> pd.DataFrame:
    """Scrape books.toscrape.com, adding detail-page fields for matching books.

    Walks the catalogue listing pages by following each page's "next" link.
    A book's detail page is fetched only when the book already passes both
    filters on the listing page, which keeps the number of requests down.

    Args:
        min_rating: Lowest star rating to keep (inclusive), 0-5.
        max_price: Highest price to keep (inclusive), in pounds.

    Returns:
        A DataFrame with the columns ``UPC``, ``Title``, ``Price (£)``,
        ``Rating``, ``Genre``, ``Availability`` and ``Description``, one row
        per matching book. The columns are present even when no book matches.

    Raises:
        requests.RequestException: If a listing or detail page cannot be
            fetched (timeout, connection error, or an HTTP error status).
        ValueError: If a price label contains no number.
    """
    import re  # local import: only the price parsing below needs it

    columns = ['UPC', 'Title', 'Price (£)', 'Rating', 'Genre', 'Availability', 'Description']
    # The star rating is encoded as a CSS class word next to "star-rating".
    rating_by_word = {
        word: stars
        for stars, word in enumerate(['Zero', 'One', 'Two', 'Three', 'Four', 'Five'])
    }
    books = []

    page_url = 'https://books.toscrape.com/catalogue/page-1.html'
    while page_url is not None:
        response = requests.get(page_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for item in soup.find_all('article', class_='product_pod'):
            title = item.h3.a['title']

            # Pull the number out of the label rather than slicing off one
            # character, so a multi-character currency prefix cannot break it.
            price_text = item.find('p', class_='price_color').text
            number = re.search(r'\d+(?:\.\d+)?', price_text)
            if number is None:
                raise ValueError(f'no price found in {price_text!r}')
            price = float(number.group())

            # Look the rating word up by value, not by its position in the class list.
            classes = item.find('p', class_='star-rating')['class']
            rating = next((rating_by_word[c] for c in classes if c in rating_by_word), 0)

            if rating >= min_rating and price <= max_price:
                # hrefs on listing pages are relative to the listing page itself.
                detail_url = urljoin(page_url, item.h3.a['href'])
                detail_response = requests.get(detail_url, timeout=10)
                detail_response.raise_for_status()
                detail_soup = BeautifulSoup(detail_response.content, 'html.parser')

                # UPC and availability are rows of the product information table.
                upc = detail_soup.find('th', string='UPC').find_next_sibling('td').text
                # The third breadcrumb link is the book's category.
                genre = detail_soup.find('ul', class_='breadcrumb').find_all('a')[2].text
                availability = detail_soup.find('th', string='Availability').find_next_sibling('td').text.strip()
                # Some books have no description section at all.
                description_tag = detail_soup.find('div', id='product_description')
                if description_tag is not None:
                    description = description_tag.find_next_sibling('p').text
                else:
                    description = 'No description available'

                books.append({
                    'UPC': upc,
                    'Title': title,
                    'Price (£)': price,
                    'Rating': rating,
                    'Genre': genre,
                    'Availability': availability,
                    'Description': description,
                })

        # The last listing page has no <li class="next">; that ends the walk.
        next_item = soup.find('li', class_='next')
        page_url = urljoin(page_url, next_item.a['href']) if next_item is not None else None

    return pd.DataFrame(books, columns=columns)


# %%
# Run the scraper. Guarded so that importing this code as a module does not
# start a full scrape; inside a notebook kernel __name__ is "__main__", so
# `result` is still defined for the cells that follow.
if __name__ == "__main__":
    result = scrape_books_detailed(4, 20)
    print(result)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UPCTitlePrice (£)RatingGenreAvailabilityDescription
0ce6396b0f23f6eccSet Me Free17.465Young AdultIn stock (19 available)Aaron Ledbetter’s future had been planned out ...
16258a1f6a6dcfe50The Four Agreements: A Practical Guide to Pers...17.665SpiritualityIn stock (18 available)In The Four Agreements, don Miguel Ruiz reveal...
26be3beb0793a53e7Sophie's World15.945PhilosophyIn stock (18 available)A page-turning novel that is also an explorati...
3657fe5ead67a7767Untitled Collection: Sabbath Poems 201414.274PoetryIn stock (16 available)More than thirty-five years ago, when the weat...
451653ef291ab7ddcThis One Summer19.494Sequential ArtIn stock (16 available)Every summer, Rose goes with her mom and dad t...
\n", + "
" + ], + "text/plain": [ + " UPC Title \\\n", + "0 ce6396b0f23f6ecc Set Me Free \n", + "1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n", + "2 6be3beb0793a53e7 Sophie's World \n", + "3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n", + "4 51653ef291ab7ddc This One Summer \n", + "\n", + " Price (£) Rating Genre Availability \\\n", + "0 17.46 5 Young Adult In stock (19 available) \n", + "1 17.66 5 Spirituality In stock (18 available) \n", + "2 15.94 5 Philosophy In stock (18 available) \n", + "3 14.27 4 Poetry In stock (16 available) \n", + "4 19.49 4 Sequential Art In stock (16 available) \n", + "\n", + " Description \n", + "0 Aaron Ledbetter’s future had been planned out ... \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 A page-turning novel that is also an explorati... \n", + "3 More than thirty-five years ago, when the weat... \n", + "4 Every summer, Rose goes with her mom and dad t... " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#imprimir los primeros 5 libros\n", + "result.head()" ] } ], @@ -126,7 +592,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -140,7 +606,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.9" } }, "nbformat": 4,