 import requests
 from bs4 import BeautifulSoup
 import csv
-import time
 import os
 import re
-import dearpygui.dearpygui as dpg
+import threading
+import tkinter as tk
+from tkinter import ttk
+from tkinter import messagebox
+from tkinter import filedialog
+from urllib.parse import urljoin

-def scrape_links(url):
-    response = requests.get(url)
-    if response.status_code != 200:
-        if response.status_code == 403:
-            raise Exception("Access to the website is forbidden (403 error).")
-        else:
-            raise Exception(f"Failed to retrieve the page. Status code: {response.status_code}")
-
-    soup = BeautifulSoup(response.content, 'html.parser')
-    a_tags = soup.find_all('a')
-    return [a.get('href') for a in a_tags if a.get('href')]
-
-def save_links_to_csv(links, filename):
-    with open(filename, mode='w', newline='') as file:
-        writer = csv.writer(file)
-        writer.writerow(['Link'])
-        for link in links:
-            writer.writerow([link])
-
-def save_links_to_txt(links, filename):
-    with open(filename, 'w') as file:
-        for link in links:
-            file.write(link + '\n')
-
-def start_scraping():
-    url = dpg.get_value("url_input")
-    if not url.startswith("http://") and not url.startswith("https://"):
-        if ":" in url:
-            dpg.configure_item("status_label", default_value="Error: Invalid URL. Please enter a valid URL starting with 'http://' or 'https://'.")
-            return
-        url = "http://" + url
-
-    try:
-        dpg.configure_item("status_label", default_value="Scraping...")
-        dpg.configure_item("loading_indicator", show=True)
-        links = scrape_links(url)
-        dpg.configure_item("status_label", default_value="Scraping complete.")
+# LinkScraper class to handle the web scraping functionality
+class LinkScraper:
+    @staticmethod
+    def scrape_links(url):
+        response = requests.get(url)
+        if response.status_code != 200:
+            if response.status_code == 403:
+                raise Exception("Access to the website is forbidden (403 error).")
+            else:
+                raise Exception(f"Failed to retrieve the page. Status code: {response.status_code}")
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+        a_tags = soup.find_all('a')
+        links = []
+        for a in a_tags:
+            href = a.get('href')
+            if href:
+                full_url = urljoin(url, href)
+                links.append(full_url)
+        return links
+
+    @staticmethod
+    def save_links_to_csv(links, filename):
+        with open(filename, mode='w', newline='', encoding='utf-8') as file:
+            writer = csv.writer(file)
+            writer.writerow(['Link'])
+            for link in links:
+                writer.writerow([link])
+
+    @staticmethod
+    def save_links_to_txt(links, filename):
+        with open(filename, 'w', encoding='utf-8') as file:
+            for link in links:
+                file.write(link + '\n')
+
+# LinkScraperApp class to handle the GUI and user interactions
+class LinkScraperApp(tk.Tk):
+    def __init__(self):
+        super().__init__()
+        self.title("Link Scraper")
+        self.geometry("420x350")
+        self.resizable(False, False)
+
+        # Load the Azure theme
+        try:
+            self.tk.call("source", "azure.tcl")
+            self.tk.call("set_theme", "dark")  # You can set 'light' or 'dark'
+        except tk.TclError as e:
+            print(f"Error loading Azure theme: {e}")
+            # Fall back to the default theme if the Azure theme fails to load
+            self.style = ttk.Style(self)
+            self.style.theme_use('clam')
+
+        self.create_widgets()
+
+    def create_widgets(self):
+        # URL input
+        url_label = ttk.Label(self, text="Enter the URL to scrape links from:")
+        url_label.pack(pady=(10, 0))
+
+        url_frame = ttk.Frame(self)
+        url_frame.pack(pady=(5, 0))
+        self.url_var = tk.StringVar()
+        url_entry = ttk.Entry(url_frame, textvariable=self.url_var, width=40)
+        url_entry.pack(side=tk.LEFT, padx=(0, 5))
+        url_entry.focus()
+        clear_button = ttk.Button(url_frame, text="Clear", command=self.clear_input)
+        clear_button.pack(side=tk.LEFT)
+
+        # Export format selection
+        format_label = ttk.Label(self, text="Select export format:")
+        format_label.pack(pady=(10, 0))
+        self.format_var = tk.StringVar(value='TXT')
+        format_combo = ttk.Combobox(self, textvariable=self.format_var, values=['TXT', 'CSV'], state='readonly')
+        format_combo.pack(pady=(5, 0))
+
+        # Overwrite checkbox
+        self.overwrite_var = tk.BooleanVar()
+        overwrite_check = ttk.Checkbutton(self, text="Overwrite if file exists", variable=self.overwrite_var)
+        overwrite_check.pack(pady=(5, 0))
+
+        # Scrape Links button
+        scrape_button = ttk.Button(self, text="Scrape Links", command=self.start_scraping)
+        scrape_button.pack(pady=(10, 0))
+
+        # Status label
+        self.status_var = tk.StringVar()
+        status_label = ttk.Label(self, textvariable=self.status_var, wraplength=350)
+        status_label.pack(pady=(10, 0))
+
+        # Loading indicator
+        self.progress_bar = ttk.Progressbar(self, mode='indeterminate')
+        self.progress_bar.pack(pady=(10, 0))
+        self.progress_bar.stop()
+
+        # Created by label
+        created_by_label = ttk.Label(self, text="Created by DJ_Fox11")
+        created_by_label.pack(side=tk.BOTTOM, pady=(0, 10))
+
+    def clear_input(self):
+        self.url_var.set("")
+        self.status_var.set("")
+
+    def start_scraping(self):
+        url = self.url_var.get().strip()
+        if not url.startswith("http://") and not url.startswith("https://"):
+            if ":" in url:
+                self.status_var.set(
+                    "Error: Invalid URL. Please enter a valid URL starting with 'http://' or 'https://'.")
+                return
+            url = "http://" + url
+
+        self.status_var.set("Scraping...")
+        self.progress_bar.start()
+
+        # Start scraping in a separate thread
+        threading.Thread(target=self.scrape_and_save_links, args=(url,), daemon=True).start()
+
+    def scrape_and_save_links(self, url):
+        try:
+            links = LinkScraper.scrape_links(url)
+            # Schedule the save and GUI updates in the main thread
+            self.after(0, self.save_and_update, links, url)
+        except Exception as e:
+            self.after(0, self.scrape_failed, str(e))
+
+    def save_and_update(self, links, url):
+        self.progress_bar.stop()

         sanitized_url = re.sub(r'[^a-zA-Z0-9]', '_', url)
-        export_format = dpg.get_value("format_selector")
+        export_format = self.format_var.get()

         if export_format == "CSV":
-            filename = f"{sanitized_url}.csv"
-            save_function = save_links_to_csv
+            default_filename = f"{sanitized_url}.csv"
+            filetypes = [('CSV files', '*.csv')]
+            save_function = LinkScraper.save_links_to_csv
         else:
-            filename = f"{sanitized_url}.txt"
-            save_function = save_links_to_txt
-
-        if os.path.exists(filename):
-            overwrite = dpg.get_value("overwrite_checkbox")
-            if not overwrite:
-                dpg.configure_item("status_label", default_value="File not saved: File already exists.")
-                dpg.configure_item("loading_indicator", show=False)
-                return
+            default_filename = f"{sanitized_url}.txt"
+            filetypes = [('Text files', '*.txt')]
+            save_function = LinkScraper.save_links_to_txt
+
+        # Ask the user where to save the file
+        filename = filedialog.asksaveasfilename(
+            defaultextension=os.path.splitext(default_filename)[1], filetypes=filetypes, initialfile=default_filename)
+
+        if filename:
+            if os.path.exists(filename):
+                overwrite = self.overwrite_var.get()
+                if not overwrite:
+                    self.status_var.set("File not saved: File already exists.")
+                    return
+
+            save_function(links, filename)
+            self.status_var.set(f"Saved {len(links)} links to {filename}")
+        else:
+            self.status_var.set("File save cancelled.")
+
+    def scrape_failed(self, error_message):
+        self.progress_bar.stop()
+        self.status_var.set(f"Error: {error_message}")

-        save_function(links, filename)
-        dpg.configure_item("status_label", default_value=f"Saved {len(links)} links to {filename}")
-    except Exception as e:
-        dpg.configure_item("status_label", default_value=f"Error: {str(e)}")
-    finally:
-        dpg.configure_item("loading_indicator", show=False)
-
-def loading_animation():
-    spinner_states = ['/', '-', '\\', '|']
-    while dpg.get_item_configuration("loading_indicator")['show']:
-        for state in spinner_states:
-            dpg.configure_item("loading_indicator", default_value=state)
-            time.sleep(0.2)
-
-def clear_input():
-    dpg.set_value("url_input", "")
-    dpg.configure_item("status_label", default_value="")
-
-# GUI setup
-dpg.create_context()
-
-with dpg.window(label="Link Scraper", width=420, height=260, no_close=True):
-    dpg.add_text("Enter the URL to scrape links from:")
-    with dpg.group(horizontal=True):
-        dpg.add_input_text(tag="url_input", width=250)
-        dpg.add_button(label="Clear", callback=clear_input)
-    dpg.add_text("Select export format:")
-    dpg.add_combo(['TXT', 'CSV'], default_value='TXT', tag="format_selector")
-    dpg.add_checkbox(label="Overwrite if file exists", tag="overwrite_checkbox")
-    dpg.add_button(label="Scrape Links", callback=start_scraping)
-    dpg.add_text("", tag="status_label", wrap=350)
-    with dpg.group(horizontal=True):
-        dpg.add_text("", tag="scraping_text")
-        dpg.add_text("", tag="loading_indicator")
-    dpg.configure_item("loading_indicator", show=False)
-    dpg.add_spacer(height=5)
-    dpg.add_text("Created by DJ_Fox11", pos=(10, 230))
-
-dpg.create_viewport(title='Link Scraper', width=420, height=300)
-dpg.setup_dearpygui()
-dpg.show_viewport()
-dpg.start_dearpygui()
-dpg.destroy_context()
+if __name__ == "__main__":
+    app = LinkScraperApp()
+    app.mainloop()