Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 210 additions & 5 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,219 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"75 books found with a rating of at least 4 stars and a price less than £20!\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>UPC</th>\n",
" <th>Title</th>\n",
" <th>Price (£)</th>\n",
" <th>Rating</th>\n",
" <th>Genre</th>\n",
" <th>Availability</th>\n",
" <th>Description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>Set Me Free</td>\n",
" <td>£17.46</td>\n",
" <td>5</td>\n",
" <td>Young Adult</td>\n",
" <td>In stock (19 available)</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6258a1f6a6dcfe50</td>\n",
" <td>The Four Agreements: A Practical Guide to Pers...</td>\n",
" <td>£17.66</td>\n",
" <td>5</td>\n",
" <td>Spirituality</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>In The Four Agreements, don Miguel Ruiz reveal...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6be3beb0793a53e7</td>\n",
" <td>Sophie's World</td>\n",
" <td>£15.94</td>\n",
" <td>5</td>\n",
" <td>Philosophy</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>A page-turning novel that is also an explorati...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>657fe5ead67a7767</td>\n",
" <td>Untitled Collection: Sabbath Poems 2014</td>\n",
" <td>£14.27</td>\n",
" <td>4</td>\n",
" <td>Poetry</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>More than thirty-five years ago, when the weat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>51653ef291ab7ddc</td>\n",
" <td>This One Summer</td>\n",
" <td>£19.49</td>\n",
" <td>4</td>\n",
" <td>Sequential Art</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>Every summer, Rose goes with her mom and dad t...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" UPC Title \\\n",
"0 ce6396b0f23f6ecc Set Me Free \n",
"1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n",
"2 6be3beb0793a53e7 Sophie's World \n",
"3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n",
"4 51653ef291ab7ddc This One Summer \n",
"\n",
" Price (£) Rating Genre Availability \\\n",
"0 £17.46 5 Young Adult In stock (19 available) \n",
"1 £17.66 5 Spirituality In stock (18 available) \n",
"2 £15.94 5 Philosophy In stock (18 available) \n",
"3 £14.27 4 Poetry In stock (16 available) \n",
"4 £19.49 4 Sequential Art In stock (16 available) \n",
"\n",
" Description \n",
"0 Aaron Ledbetter’s future had been planned out ... \n",
"1 In The Four Agreements, don Miguel Ruiz reveal... \n",
"2 A page-turning novel that is also an explorati... \n",
"3 More than thirty-five years ago, when the weat... \n",
"4 Every summer, Rose goes with her mom and dad t... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your solution goes here"
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"\n",
"def _fetch_soup(session, url, timeout=10):\n",
"    \"\"\"Download `url` through `session` and return the page parsed with BeautifulSoup.\n",
"\n",
"    Raises requests.HTTPError on a 4xx/5xx response, so an error page is never parsed as a real page.\n",
"    \"\"\"\n",
"    response = session.get(url, timeout=timeout) # The timeout keeps a stalled connection from hanging the notebook\n",
"    response.raise_for_status()\n",
"    return BeautifulSoup(response.content, \"html.parser\")\n",
"\n",
"\n",
"def _parse_book_details(book_soup):\n",
"    \"\"\"Extract the UPC, genre, availability and description from a parsed book page.\"\"\"\n",
"    upc = book_soup.find(\"th\", string=\"UPC\").find_next(\"td\").get_text()\n",
"\n",
"    # The genre is the third item of the breadcrumb list\n",
"    genre = book_soup.find(\"ul\", class_=\"breadcrumb\").find_all(\"li\")[2].get_text(strip=True)\n",
"\n",
"    # We match on the single \"availability\" class so the lookup does not depend on the exact class string\n",
"    availability_tag = book_soup.find(\"p\", class_=\"availability\")\n",
"    availability = availability_tag.get_text(strip=True) if availability_tag else \"Unknown\"\n",
"\n",
"    # Some books have no description, so we look for the heading tag before reading the paragraph after it\n",
"    desc_tag = book_soup.find(\"div\", id=\"product_description\")\n",
"    description = desc_tag.find_next(\"p\").get_text() if desc_tag else \"No description available.\"\n",
"\n",
"    return {\n",
"        \"UPC\": upc,\n",
"        \"Genre\": genre,\n",
"        \"Availability\": availability,\n",
"        \"Description\": description\n",
"    }\n",
"\n",
"\n",
"def scrape_books(min_rating, max_price):\n",
"    \"\"\"Scrape books.toscrape.com and return the books that match both filters.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    min_rating : int or float\n",
"        Minimum star rating to keep (inclusive).\n",
"    max_price : int or float\n",
"        Maximum price in pounds to keep (inclusive).\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per matching book with the columns UPC, Title, Price (£), Rating,\n",
"        Genre, Availability and Description. The price is kept as the string shown\n",
"        on the site (e.g. \"£17.46\"). The columns are present even when no book matches.\n",
"\n",
"    Raises\n",
"    ------\n",
"    requests.RequestException\n",
"        If a page cannot be downloaded (connection error, timeout or HTTP error status).\n",
"    \"\"\"\n",
"    domain = \"https://books.toscrape.com/catalogue/\"\n",
"    page_url = domain + \"page-1.html\"\n",
"    columns = [\"UPC\", \"Title\", \"Price (£)\", \"Rating\", \"Genre\", \"Availability\", \"Description\"]\n",
"    rating_dict = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n",
"    book_list = []\n",
"\n",
"    # A single session reuses the connection for the listing pages and for every book page\n",
"    with requests.Session() as session:\n",
"\n",
"        while page_url:\n",
"\n",
"            soup = _fetch_soup(session, page_url)\n",
"\n",
"            for product in soup.find_all(\"article\", class_=\"product_pod\"):\n",
"\n",
"                # Rating: stored as a word (\"One\" ... \"Five\") in the class list of the star-rating tag\n",
"                rating_tag = product.find(\"p\", class_=\"star-rating\")\n",
"                rating_classes = rating_tag.get(\"class\", []) if rating_tag else []\n",
"                book_rating = next((rating_dict[name] for name in rating_classes if name in rating_dict), 0) # 0 when no rating word is found\n",
"\n",
"                # Price: the original string is kept for the table, the float is only used for filtering\n",
"                price = product.find(\"p\", class_=\"price_color\").get_text()\n",
"                price_clean = float(price.replace(\"£\", \"\"))\n",
"\n",
"                # We skip the book before requesting its page if it does not pass the filters\n",
"                if book_rating < min_rating or price_clean > max_price:\n",
"                    continue\n",
"\n",
"                # Title and book URL (the href is relative to the catalogue folder)\n",
"                title = product.h3.a[\"title\"]\n",
"                book_url = domain + product.h3.a[\"href\"]\n",
"\n",
"                # Book details come from the book's own page\n",
"                details = _parse_book_details(_fetch_soup(session, book_url))\n",
"\n",
"                book_list.append({\n",
"                    \"Title\": title,\n",
"                    \"Price (£)\": price,\n",
"                    \"Rating\": book_rating,\n",
"                    **details\n",
"                })\n",
"\n",
"            # We follow the \"next\" button until there is none left\n",
"            next_btn = soup.find(\"li\", class_=\"next\")\n",
"            page_url = (domain + next_btn.a[\"href\"]) if next_btn else None\n",
"\n",
"    # Passing the columns fixes their order and keeps them when the list is empty\n",
"    return pd.DataFrame(book_list, columns=columns)\n",
"\n",
"\n",
"MIN_RATING = 4 # Minimum star rating to keep (inclusive)\n",
"MAX_PRICE = 20.00 # Maximum price in pounds to keep (inclusive)\n",
"\n",
"df_books = scrape_books(min_rating=MIN_RATING, max_price=MAX_PRICE)\n",
"# The thresholds are interpolated so the message always matches the filter that was applied\n",
"print(f\"{len(df_books)} books found with a rating of at least {MIN_RATING} stars and a price of at most £{MAX_PRICE:.2f}!\")\n",
"df_books.head()\n",
"\n",
"# The most difficult challenge in this exercise for me was first to understand why I was getting only one book in my final DataFrame.\n",
"# After a long time, I realized that I was only checking the first page of the website and not iterating through all the pages.\n",
"# Then, I had to figure out how to navigate to the next page by looking for the \"next\" button and updating the page URL accordingly.\n",
"# Also, I was getting errors when trying to retrieve the book description because some books did not have a description.\n",
"# I had to add a check to see if the description tag existed before trying to retrieve the description text.\n",
"# Compared to what we did in class, I found web scraping pretty challenging because it's not really intuitive at first.\n"
]
}
],
Expand All @@ -126,7 +331,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +345,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down