data-bootcamp-v4 · jmpbusiness2023-commits · Oct 17, 2025 · Oct 17, 2025
diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
@@ -110,14 +110,219 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 7,
       "id": "40359eee-9cd7-4884-bfa4-83344c222305",
       "metadata": {
         "id": "40359eee-9cd7-4884-bfa4-83344c222305"
       },
-      "outputs": [],
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "75 books found with a rating of at least 4 stars and a price less than £20!\n"
+          ]
+        },
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>UPC</th>\n",
+              "      <th>Title</th>\n",
+              "      <th>Price (£)</th>\n",
+              "      <th>Rating</th>\n",
+              "      <th>Genre</th>\n",
+              "      <th>Availability</th>\n",
+              "      <th>Description</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>ce6396b0f23f6ecc</td>\n",
+              "      <td>Set Me Free</td>\n",
+              "      <td>£17.46</td>\n",
+              "      <td>5</td>\n",
+              "      <td>Young Adult</td>\n",
+              "      <td>In stock (19 available)</td>\n",
+              "      <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>6258a1f6a6dcfe50</td>\n",
+              "      <td>The Four Agreements: A Practical Guide to Pers...</td>\n",
+              "      <td>£17.66</td>\n",
+              "      <td>5</td>\n",
+              "      <td>Spirituality</td>\n",
+              "      <td>In stock (18 available)</td>\n",
+              "      <td>In The Four Agreements, don Miguel Ruiz reveal...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>6be3beb0793a53e7</td>\n",
+              "      <td>Sophie's World</td>\n",
+              "      <td>£15.94</td>\n",
+              "      <td>5</td>\n",
+              "      <td>Philosophy</td>\n",
+              "      <td>In stock (18 available)</td>\n",
+              "      <td>A page-turning novel that is also an explorati...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>657fe5ead67a7767</td>\n",
+              "      <td>Untitled Collection: Sabbath Poems 2014</td>\n",
+              "      <td>£14.27</td>\n",
+              "      <td>4</td>\n",
+              "      <td>Poetry</td>\n",
+              "      <td>In stock (16 available)</td>\n",
+              "      <td>More than thirty-five years ago, when the weat...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>51653ef291ab7ddc</td>\n",
+              "      <td>This One Summer</td>\n",
+              "      <td>£19.49</td>\n",
+              "      <td>4</td>\n",
+              "      <td>Sequential Art</td>\n",
+              "      <td>In stock (16 available)</td>\n",
+              "      <td>Every summer, Rose goes with her mom and dad t...</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "                UPC                                              Title  \\\n",
+              "0  ce6396b0f23f6ecc                                        Set Me Free   \n",
+              "1  6258a1f6a6dcfe50  The Four Agreements: A Practical Guide to Pers...   \n",
+              "2  6be3beb0793a53e7                                     Sophie's World   \n",
+              "3  657fe5ead67a7767            Untitled Collection: Sabbath Poems 2014   \n",
+              "4  51653ef291ab7ddc                                    This One Summer   \n",
+              "\n",
+              "  Price (£)  Rating           Genre             Availability  \\\n",
+              "0    £17.46       5     Young Adult  In stock (19 available)   \n",
+              "1    £17.66       5    Spirituality  In stock (18 available)   \n",
+              "2    £15.94       5      Philosophy  In stock (18 available)   \n",
+              "3    £14.27       4          Poetry  In stock (16 available)   \n",
+              "4    £19.49       4  Sequential Art  In stock (16 available)   \n",
+              "\n",
+              "                                         Description  \n",
+              "0  Aaron Ledbetter’s future had been planned out ...  \n",
+              "1  In The Four Agreements, don Miguel Ruiz reveal...  \n",
+              "2  A page-turning novel that is also an explorati...  \n",
+              "3  More than thirty-five years ago, when the weat...  \n",
+              "4  Every summer, Rose goes with her mom and dad t...  "
+            ]
+          },
+          "execution_count": 7,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
       "source": [
-        "# Your solution goes here"
+        "import requests\n",
+        "from bs4 import BeautifulSoup\n",
+        "import pandas as pd\n",
+        "\n",
+        "def _fetch_soup(session, url, timeout):\n",
+        "    \"\"\"Download ``url`` through ``session`` and return the parsed page.\n",
+        "\n",
+        "    Raises requests.RequestException if the request fails, times out, or\n",
+        "    comes back with an HTTP error status.\n",
+        "    \"\"\"\n",
+        "    response = session.get(url, timeout=timeout)\n",
+        "    response.raise_for_status()  # Fail loudly rather than parse an error page\n",
+        "    # Hand over the raw bytes so BeautifulSoup works out the encoding on its own\n",
+        "    return BeautifulSoup(response.content, \"html.parser\")\n",
+        "\n",
+        "\n",
+        "def scrape_books(min_rating, max_price, timeout=10):\n",
+        "    \"\"\"Scrape books.toscrape.com and return the books that pass both filters.\n",
+        "\n",
+        "    Walks the catalogue listing pages, starting at page-1.html and following\n",
+        "    the \"next\" link until there is none. For every book whose rating is at\n",
+        "    least ``min_rating`` and whose price is at most ``max_price``, the book's\n",
+        "    own page is fetched to collect its UPC, genre, availability and description.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    min_rating : int\n",
+        "        Minimum star rating to keep (inclusive).\n",
+        "    max_price : float\n",
+        "        Maximum price in pounds to keep (inclusive).\n",
+        "    timeout : float, optional\n",
+        "        Seconds to wait on each HTTP request before giving up (default 10).\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    pandas.DataFrame\n",
+        "        One row per matching book with the columns UPC, Title, Price (£),\n",
+        "        Rating, Genre, Availability and Description. The columns are present\n",
+        "        even when no book matches.\n",
+        "\n",
+        "    Raises\n",
+        "    ------\n",
+        "    requests.RequestException\n",
+        "        If any request fails, times out, or returns an HTTP error status.\n",
+        "    \"\"\"\n",
+        "    domain = \"https://books.toscrape.com/catalogue/\"\n",
+        "    page_url = domain + \"page-1.html\"\n",
+        "    rating_dict = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n",
+        "    columns = [\"UPC\", \"Title\", \"Price (£)\", \"Rating\", \"Genre\", \"Availability\", \"Description\"]\n",
+        "    book_list = []\n",
+        "\n",
+        "    # A single session reuses the connection for every listing and detail request\n",
+        "    with requests.Session() as session:\n",
+        "        while True:\n",
+        "            soup = _fetch_soup(session, page_url, timeout)\n",
+        "\n",
+        "            for product in soup.find_all(\"article\", class_=\"product_pod\"):\n",
+        "\n",
+        "                # Rating: the rating word (One..Five) is one of the tag's CSS classes.\n",
+        "                # Look it up by value instead of by position; unknown or missing -> 0.\n",
+        "                rating_tag = product.find(\"p\", class_=\"star-rating\")\n",
+        "                rating_classes = rating_tag.get(\"class\", []) if rating_tag else []\n",
+        "                book_rating = next((rating_dict[c] for c in rating_classes if c in rating_dict), 0)\n",
+        "\n",
+        "                # Price: keep the display string for the table, parse a float for the filter\n",
+        "                price = product.find(\"p\", class_=\"price_color\").get_text()\n",
+        "                price_clean = float(price.replace(\"£\", \"\"))\n",
+        "\n",
+        "                # Skip non-matching books before paying for a detail-page request\n",
+        "                if book_rating < min_rating or price_clean > max_price:\n",
+        "                    continue\n",
+        "\n",
+        "                # Title and book URL (the href is relative to the catalogue folder)\n",
+        "                title = product.h3.a[\"title\"]\n",
+        "                book_url = domain + product.h3.a[\"href\"]\n",
+        "\n",
+        "                # The remaining fields are read from the book's own page\n",
+        "                book_soup = _fetch_soup(session, book_url, timeout)\n",
+        "\n",
+        "                upc = book_soup.find(\"th\", string=\"UPC\").find_next(\"td\").get_text()\n",
+        "                # The third breadcrumb entry is the genre\n",
+        "                genre = book_soup.find(\"ul\", class_=\"breadcrumb\").find_all(\"li\")[2].get_text(strip=True)\n",
+        "                availability = book_soup.find(\"p\", class_=\"instock availability\").get_text(strip=True)\n",
+        "\n",
+        "                # Some books have no description block, so look for the tag first\n",
+        "                desc_tag = book_soup.find(\"div\", id=\"product_description\")\n",
+        "                if desc_tag:\n",
+        "                    description = desc_tag.find_next(\"p\").get_text()\n",
+        "                else:\n",
+        "                    description = \"No description available.\"\n",
+        "\n",
+        "                book_list.append({\n",
+        "                    \"UPC\": upc,\n",
+        "                    \"Title\": title,\n",
+        "                    \"Price (£)\": price,\n",
+        "                    \"Rating\": book_rating,\n",
+        "                    \"Genre\": genre,\n",
+        "                    \"Availability\": availability,\n",
+        "                    \"Description\": description,\n",
+        "                })\n",
+        "\n",
+        "            # Follow the \"next\" link; the last page has none\n",
+        "            next_btn = soup.find(\"li\", class_=\"next\")\n",
+        "            if not next_btn:\n",
+        "                break\n",
+        "            page_url = domain + next_btn.a[\"href\"]\n",
+        "\n",
+        "    # Passing the column list keeps the schema stable even when nothing matched\n",
+        "    return pd.DataFrame(book_list, columns=columns)\n",
+        "\n",
+        "\n",
+        "# Collect every book rated 4 stars or more that costs at most £20, then preview the first rows\n",
+        "df_books = scrape_books(max_price=20.00, min_rating=4)\n",
+        "match_count = len(df_books)\n",
+        "print(f\"{match_count} books found with a rating of at least 4 stars and a price less than £20!\")\n",
+        "df_books.head()\n",
+        "\n",
+        "# The most difficult challenge in this exercise for me was first to understand why I was getting only one book in my final DataFrame.\n",
+        "# After a long time, I realized that I was checking only the first page of the website and not iterating through all the pages.\n",
+        "# Then, I had to figure out how to navigate to the next page by looking for the \"next\" button and updating the page URL accordingly.\n",
+        "# Also, I was getting errors when trying to retrieve the book description because some books did not have a description.\n",
+        "# I had to add a check to see if the description tag existed before trying to retrieve the description text.\n",
+        "# Compared to what we did in class, I found web scraping pretty challenging because it's not really intuitive at first.\n"
       ]
     }
   ],
@@ -126,7 +331,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "base",
       "language": "python",
       "name": "python3"
     },
@@ -140,7 +345,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.13.5"
     }
   },
   "nbformat": 4,