Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions Lab_web_scraping_books.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f929914c",
"metadata": {},
"source": [
"Instalación de librerías necesarias"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "00ca7bd7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: requests in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (2.32.5)\n",
"Requirement already satisfied: beautifulsoup4 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (4.14.2)\n",
"Requirement already satisfied: pandas in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (2.3.2)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (3.4.3)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (3.11)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (2.5.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from requests) (2025.10.5)\n",
"Requirement already satisfied: soupsieve>1.2 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from beautifulsoup4) (2.8)\n",
"Requirement already satisfied: typing-extensions>=4.0.0 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from beautifulsoup4) (4.15.0)\n",
"Requirement already satisfied: numpy>=1.26.0 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from pandas) (2.2.6)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\cex-laguna\\appdata\\local\\programs\\python\\python313\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 25.2 -> 25.3\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [
"# Install the scraping dependencies into the environment of the running kernel.\n",
"%pip install requests beautifulsoup4 pandas\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "911bcd57",
"metadata": {},
"outputs": [],
"source": [
"import requests \n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cca840f5",
"metadata": {},
"outputs": [],
"source": [
"# Filter thresholds read by the scraping cell: a book is kept only when\n",
"# rating >= min_rating and price <= max_price.\n",
"# min_rating is compared with the integer returned by rating_to_int (1-5, 0 if unknown).\n",
"min_rating = 3\n",
"# max_price is compared with the price parsed as a float after removing the pound sign.\n",
"max_price = 30.0 "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f3618791",
"metadata": {},
"outputs": [],
"source": [
"def rating_to_int(r):\n",
"    \"\"\"Translate a rating word (\"One\" .. \"Five\") into its number; return 0 for anything else.\"\"\"\n",
"    words = (\"One\", \"Two\", \"Three\", \"Four\", \"Five\")\n",
"    # Position in the tuple (counted from 1) is the numeric rating.\n",
"    lookup = {word: value for value, word in enumerate(words, start=1)}\n",
"    try:\n",
"        return lookup[r]\n",
"    except KeyError:\n",
"        # Unrecognised rating word.\n",
"        return 0"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1a89ffe9",
"metadata": {},
"outputs": [],
"source": [
"# Walk the paginated catalogue and collect the books that pass the\n",
"# rating/price filter. `books` is rebuilt from scratch on every run.\n",
"books = []\n",
"base_url = \"http://books.toscrape.com/catalogue/page-{}.html\"\n",
"last_page = 50  # pages 1..last_page are requested\n",
"request_timeout = 10  # seconds; requests waits indefinitely when no timeout is given\n",
"\n",
"# A single Session reuses the HTTP connection across all page requests.\n",
"with requests.Session() as session:\n",
"    for page in range(1, last_page + 1):\n",
"        response = session.get(base_url.format(page), timeout=request_timeout)\n",
"        # Fail loudly on 4xx/5xx instead of parsing an error page into zero books.\n",
"        response.raise_for_status()\n",
"        # Force UTF-8 decoding; the price text contains a pound sign that is stripped below.\n",
"        response.encoding = \"utf-8\"\n",
"\n",
"        soup = BeautifulSoup(response.text, \"html.parser\")\n",
"\n",
"        for article in soup.find_all(\"article\", class_=\"product_pod\"):\n",
"            title = article.h3.a[\"title\"]\n",
"\n",
"            # Rating: look the tag up by its \"star-rating\" class instead of assuming\n",
"            # it is the first <p>; its remaining class is the rating word.\n",
"            rating_tag = article.find(\"p\", class_=\"star-rating\")\n",
"            rating_classes = rating_tag.get(\"class\", []) if rating_tag else []\n",
"            rating_word = next((c for c in rating_classes if c != \"star-rating\"), \"\")\n",
"            rating = rating_to_int(rating_word)  # 0 when no known rating word is found\n",
"\n",
"            # Price: text such as \"£17.93\" -> 17.93\n",
"            price_text = article.find(\"p\", class_=\"price_color\").text\n",
"            price = float(price_text.replace(\"£\", \"\"))\n",
"\n",
"            # Keep only the books that satisfy both thresholds.\n",
"            if rating >= min_rating and price <= max_price:\n",
"                books.append({\n",
"                    \"Título\": title,\n",
"                    \"Rating\": rating,\n",
"                    \"Precio (£)\": price\n",
"                })"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "2cae24cb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Título Rating Precio (£)\n",
"0 The Coming Woman: A Novel Based on the Life of... 3 17.93\n",
"1 The Boys in the Boat: Nine Americans and Their... 4 22.60\n",
"2 Shakespeare's Sonnets 4 20.66\n",
"3 Set Me Free 5 17.46\n",
"4 Chase Me (Paris Nights #2) 5 25.27\n",
".. ... ... ...\n",
"222 Choosing Our Religion: The Spiritual Lives of ... 4 28.42\n",
"223 Charlie and the Chocolate Factory (Charlie Buc... 3 22.85\n",
"224 Blood Defense (Samantha Brinkman #1) 3 20.30\n",
"225 A Spy's Devotion (The Regency Spies of London #1) 5 16.97\n",
"226 1,000 Places to See Before You Die 5 26.08\n",
"\n",
"[227 rows x 3 columns]\n"
]
}
],
"source": [
"# Build the result table; naming the columns keeps the schema even when no book matched.\n",
"df = pd.DataFrame(books, columns=[\"Título\", \"Rating\", \"Precio (£)\"])\n",
"# Leave the frame as the last expression so Jupyter renders it as a table.\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}