diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
index e552783..78bb941 100644
--- a/lab-web-scraping.ipynb
+++ b/lab-web-scraping.ipynb
@@ -110,14 +110,219 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "75 books found with a rating of at least 4 stars and a price less than £20!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " UPC | \n",
+ " Title | \n",
+ " Price (£) | \n",
+ " Rating | \n",
+ " Genre | \n",
+ " Availability | \n",
+ " Description | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " ce6396b0f23f6ecc | \n",
+ " Set Me Free | \n",
+ " £17.46 | \n",
+ " 5 | \n",
+ " Young Adult | \n",
+ " In stock (19 available) | \n",
+ " Aaron Ledbetter’s future had been planned out ... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 6258a1f6a6dcfe50 | \n",
+ " The Four Agreements: A Practical Guide to Pers... | \n",
+ " £17.66 | \n",
+ " 5 | \n",
+ " Spirituality | \n",
+ " In stock (18 available) | \n",
+ " In The Four Agreements, don Miguel Ruiz reveal... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 6be3beb0793a53e7 | \n",
+ " Sophie's World | \n",
+ " £15.94 | \n",
+ " 5 | \n",
+ " Philosophy | \n",
+ " In stock (18 available) | \n",
+ " A page-turning novel that is also an explorati... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 657fe5ead67a7767 | \n",
+ " Untitled Collection: Sabbath Poems 2014 | \n",
+ " £14.27 | \n",
+ " 4 | \n",
+ " Poetry | \n",
+ " In stock (16 available) | \n",
+ " More than thirty-five years ago, when the weat... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 51653ef291ab7ddc | \n",
+ " This One Summer | \n",
+ " £19.49 | \n",
+ " 4 | \n",
+ " Sequential Art | \n",
+ " In stock (16 available) | \n",
+ " Every summer, Rose goes with her mom and dad t... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " UPC Title \\\n",
+ "0 ce6396b0f23f6ecc Set Me Free \n",
+ "1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n",
+ "2 6be3beb0793a53e7 Sophie's World \n",
+ "3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n",
+ "4 51653ef291ab7ddc This One Summer \n",
+ "\n",
+ " Price (£) Rating Genre Availability \\\n",
+ "0 £17.46 5 Young Adult In stock (19 available) \n",
+ "1 £17.66 5 Spirituality In stock (18 available) \n",
+ "2 £15.94 5 Philosophy In stock (18 available) \n",
+ "3 £14.27 4 Poetry In stock (16 available) \n",
+ "4 £19.49 4 Sequential Art In stock (16 available) \n",
+ "\n",
+ " Description \n",
+ "0 Aaron Ledbetter’s future had been planned out ... \n",
+ "1 In The Four Agreements, don Miguel Ruiz reveal... \n",
+ "2 A page-turning novel that is also an explorati... \n",
+ "3 More than thirty-five years ago, when the weat... \n",
+ "4 Every summer, Rose goes with her mom and dad t... "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your solution goes here"
+ "import requests\n",
+ "from bs4 import BeautifulSoup\n",
+ "import pandas as pd\n",
+ "\n",
+ "def scrape_books(min_rating, max_price):\n",
+ "\n",
+ " domain = \"https://books.toscrape.com/catalogue/\"\n",
+ " page_url = domain + \"page-1.html\"\n",
+ " book_list = []\n",
+ " rating_dict = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n",
+ "\n",
+ " while True:\n",
+ "\n",
+ " response = requests.get(page_url) # We send a request to the page URL\n",
+ " soup = BeautifulSoup(response.content, \"html.parser\") # We parse the content of the page with BeautifulSoup\n",
+ "\n",
+ " products = soup.find_all(\"article\", class_=\"product_pod\")\n",
+ "\n",
+ " for product in products:\n",
+ "\n",
+ " # Rating\n",
+ " rating_class = product.find(\"p\")[\"class\"] # We retrieve the class list in order to get the rating\n",
+ " book_rating = rating_dict.get(rating_class[1], 0) # We convert the rating from string to integer by using the dictionary\n",
+ "\n",
+ " # Price\n",
+ " price = product.find(\"p\", class_=\"price_color\").get_text() # We retrieve the price string\n",
+ " price_clean = float(price.replace(\"£\", \"\")) # We clean the price string and convert it to float\n",
+ "\n",
+ " # Then we use an if statement to filter books based on rating and price\n",
+ " if book_rating >= min_rating and price_clean <= max_price:\n",
+ "\n",
+ " # Title and book URL\n",
+ " title = product.h3.a[\"title\"] # We retrieve the book title\n",
+ " partial_url = product.h3.a[\"href\"] # We retrieve the partial URL of the book\n",
+ " book_url = domain + partial_url # We create the full URL of the book by concatenating the domain and the partial URL\n",
+ "\n",
+ " # Book details\n",
+ " book_response = requests.get(book_url) # We send a request to the book URL\n",
+ " book_soup = BeautifulSoup(book_response.content, \"html.parser\") # We parse the content of the book page with BeautifulSoup\n",
+ " \n",
+ " # UPC, Genre, Availability\n",
+ " upc = book_soup.find(\"th\", string=\"UPC\").find_next(\"td\").get_text() # We retrieve the UPC\n",
+ " genre = book_soup.find(\"ul\", class_=\"breadcrumb\").find_all(\"li\")[2].get_text(strip=True) # We retrieve the genre\n",
+ " availability = book_soup.find(\"p\", class_=\"instock availability\").get_text(strip=True) # We retrieve the availability\n",
+ "\n",
+ " # Description\n",
+ " desc_tag = book_soup.find(\"div\", id=\"product_description\") # We need to check if the description exists by looking for the tag first\n",
+ " if desc_tag:\n",
+ " description = desc_tag.find_next(\"p\").get_text() # If it exists, we retrieve the description\n",
+ " else:\n",
+ " description = \"No description available.\"\n",
+ " \n",
+ " # We compile all the information into a dictionary with the name of the columns we want in our DataFrame\n",
+ " book_infos = {\n",
+ " \"UPC\": upc,\n",
+ " \"Title\": title,\n",
+ " \"Price (£)\": price,\n",
+ " \"Rating\": book_rating,\n",
+ " \"Genre\": genre,\n",
+ " \"Availability\": availability,\n",
+ " \"Description\": description\n",
+ " }\n",
+ "\n",
+ " book_list.append(book_infos) # We add the dictionary to the list\n",
+ "\n",
+ " # We check if there is a next page\n",
+ " next_btn = soup.find(\"li\", class_=\"next\")\n",
+ " if next_btn:\n",
+ " next_page = next_btn.a[\"href\"]\n",
+ " page_url = domain + next_page # We update the page URL to the next page by concatenating the domain and the next page URL\n",
+ " else:\n",
+ " break\n",
+ "\n",
+ " return pd.DataFrame(book_list) # We convert the list of dictionaries to a DataFrame and return it\n",
+ "\n",
+ "\n",
+ "df_books = scrape_books(min_rating=4, max_price=20.00)\n",
+ "print(f\"{len(df_books)} books found with a rating of at least 4 stars and a price less than £20!\")\n",
+ "df_books.head()\n",
+ "\n",
+ "# The most difficult challenge in this exercise for me was first to understand why I was getting only one book in my final DataFrame.\n",
+ "# I realized after a long time that I was checking only the first page of the website and not iterating through all the pages.\n",
+ "# Then, I had to figure out how to navigate to the next page by looking for the \"next\" button and updating the page URL accordingly.\n",
+ "# Also, I was getting errors when trying to retrieve the book description because some books did not have a description.\n",
+ "# I had to add a check to see if the description tag existed before trying to retrieve the description text.\n",
+ "# Compared to what we did in class, I found web scraping pretty challenging because it's not really intuitive at first.\n"
]
}
],
@@ -126,7 +331,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -140,7 +345,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.13.5"
}
},
"nbformat": 4,