-
Notifications
You must be signed in to change notification settings - Fork 0
/
zara-mens-clearance-scraper.py
171 lines (136 loc) · 5.56 KB
/
zara-mens-clearance-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from bs4 import BeautifulSoup, Tag
from playwright.sync_api import Page, expect
from playwright.sync_api import Playwright, sync_playwright
import pytest
import re
import sqlite3
from datetime import datetime
import tkinter as tk
from tkinter import messagebox
# Landing page for Zara US men's "special prices" (clearance) listings.
url = 'https://www.zara.com/us/en/man-special-prices-l806.html?v1=2203954'
# SQLite database file (created in the working directory on first run) and
# the table that scraped products are written into.
DATABASE_NAME = "clearance_items.db"
TABLE_NAME = "items"
# DDL run on every scrape; IF NOT EXISTS makes it idempotent. All product
# fields are stored as raw display text (prices keep their currency symbol).
CREATE_TABLE_QUERY = f"""
CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
name TEXT,
discount TEXT,
link TEXT,
curr_price TEXT,
prev_price TEXT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
"""
# Method to scrape clearance items from Zara website
def _text_or(parent, tag, css_class, fallback):
    """Return the stripped text of the first matching child tag, or *fallback*
    when the tag is absent (Zara's markup changes frequently, so every lookup
    must tolerate a miss instead of raising AttributeError)."""
    found = parent.find(tag, class_=css_class)
    return found.text.strip() if found is not None else fallback


def scrape_clearance_items(url):
    """Scrape Zara men's clearance products and persist new ones to SQLite.

    Renders the listing page with Playwright (Chromium), parses the product
    grid with BeautifulSoup, prints each product, inserts products whose name
    is not already in the ``items`` table, and returns everything scraped.

    Args:
        url: Listing page URL to scrape.

    Returns:
        list[dict]: One dict per product with keys ``name``, ``discount``,
        ``link``, ``curr_price``, ``prev_price``.
    """
    # --- Fetch the fully rendered page ---------------------------------
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=False)
        try:
            with browser.new_context() as context:
                page = context.new_page()
                page.goto(url)
                # The grid is rendered client-side; wait for network quiet
                # so the product markup is actually in the DOM.
                page.wait_for_load_state('networkidle')
                html = page.content()
        finally:
            # Original left browser.close() commented out, leaking the
            # Chromium process; always release it, even on failure.
            browser.close()

    # --- Parse the product grid ----------------------------------------
    soup = BeautifulSoup(html, 'html.parser')
    product_grids = soup.find_all('div', class_='product-grid-product-info__product-header')

    link_class = 'product-link _item product-grid-product-info__name link'
    items = []
    for p in product_grids:
        name = _text_or(p, 'a', link_class, "Name not found")
        # BUG FIX: the original searched the whole soup here, so every
        # product reported the first image on the page. Search within this
        # product's node only.
        # NOTE(review): if the image lives outside the header div, this
        # yields "Image not found" — confirm against live markup.
        image = p.find("img", class_=re.compile("media-image__image"))
        image_src = image.get("src") if image is not None else "Image not found"
        discount = _text_or(p, 'span', 'price-current__discount-percentage', "Discount not found")
        curr_price = _text_or(p, 'span', 'price-current__amount', "Price not found")
        prev_price = _text_or(p, 'span', 'price-old__amount', "Price not found")
        # Single anchor lookup serves the link (the name was read above).
        link_tag = p.find('a', class_=link_class)
        link = link_tag.get('href') if link_tag is not None else "Link not found"

        print(f"Name: {name}")
        print(f"Image: {image_src}")
        print(f"Discount: {discount}")
        print(f"Link: {link}")
        print(f"Current Price: {curr_price}")
        print(f"Previous Price: {prev_price}")
        print("----------------------")

        items.append({
            'name': name,
            'discount': discount,
            'link': link,
            'curr_price': curr_price,
            'prev_price': prev_price,
        })

    # --- Persist new items ----------------------------------------------
    conn = sqlite3.connect(DATABASE_NAME)
    try:
        c = conn.cursor()
        c.execute(CREATE_TABLE_QUERY)
        for item in items:
            # Deduplicate by product name: skip rows already recorded.
            c.execute(f"SELECT 1 FROM {TABLE_NAME} WHERE name = ?", (item['name'],))
            if c.fetchone() is None:
                c.execute(
                    f"INSERT INTO {TABLE_NAME} "
                    "(name, discount, link, curr_price, prev_price) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (item['name'], item['discount'], item['link'],
                     item['curr_price'], item['prev_price']),
                )
        conn.commit()
    finally:
        # Close the connection even if parsing/insertion raised.
        conn.close()
    return items
class MainWindow(tk.Tk):
    """Minimal Tk front-end: a single button that triggers the Zara scrape."""

    def __init__(self):
        super().__init__()
        # Window chrome.
        self.title("Zara Men's Clearance Scraper")
        self.geometry("350x200")

        # Instruction text.
        self.label = tk.Label(self, text="Click the button to start scraping Zara.")
        self.label.pack(pady=10)

        # The one action in the app.
        self.button = tk.Button(self, text="Scrape Zara", command=self.scrape)
        self.button.pack(pady=10)

        # Credit line above the logo.
        self.label = tk.Label(self, text="Developed by:")
        self.label.pack(pady=2)

        # Logo at half size; keep a reference on self so Tk doesn't
        # garbage-collect the image. NOTE: self.label is rebound each time,
        # so only this last Label stays reachable through the attribute.
        self.logo = tk.PhotoImage(file="logo.png").subsample(2, 2)
        self.label = tk.Label(self, image=self.logo)
        self.label.pack(pady=10)

    def scrape(self):
        """Run the scraper against the module-level URL and report the count."""
        scraped = scrape_clearance_items(url)
        scrape_count = len(scraped)
        print(f"Scraped {scrape_count} items.")
if __name__ == "__main__":
app = MainWindow()
app.mainloop()