main.py
import os
import re
import threading
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

def save_file(soup, pagefolder, session, url, tag, inner):
    """Download every resource referenced by `tag` into `pagefolder` and
    rewrite the tag so it points at the local copy."""
    # exist_ok makes this safe when several threads create the folder at once
    os.makedirs(pagefolder, exist_ok=True)
    for res in soup.find_all(tag):
        if tag == 'link' and res.has_attr('crossorigin'):
            del res.attrs['crossorigin']  # local copies would fail CORS/integrity checks
        if tag == 'base':
            res.extract()  # a <base> tag would break the rewritten relative paths
        elif tag == 'style':
            if res.string:
                text = res.string.strip()
                try:
                    if 'url' in text:
                        # rewrite every unquoted url(...) reference in inline CSS
                        index = 0
                        s = re.search(r"(url\(+)(?!\")([^)]*)", text)
                        while s:
                            urls = text[s.start() + 4 + index: s.end() + index]
                            filename = urls.split('/')[-1]
                            filepath = os.path.join(pagefolder, filename)
                            fileurl = urljoin(url, urls)
                            localpath = '../' + os.path.join(pagefolder, filename).replace('\\', '/')
                            text = text[:s.start() + 4 + index] + localpath + text[s.end() + index:]
                            if not os.path.isfile(filepath):  # has not been downloaded yet
                                with open(filepath, 'wb') as f:
                                    f.write(session.get(fileurl).content)
                            # resume the search just after the substituted local path
                            index += s.end() - (len(urls) - len(localpath))
                            s = re.search(r"(url\(+)(?!\")([^)]*)", text[index:])
                    res.string = text
                except Exception:
                    res.string = text  # keep whatever was rewritten before the failure
        elif res.has_attr(inner):
            try:
                filename, ext = os.path.splitext(os.path.basename(res[inner]))
                if '?' in ext:  # drop a cache-busting query string from the extension
                    ext = ext[:ext.find('?')]
                filename = re.sub(r'\W+', '', filename) + ext  # sanitise the file name
                fileurl = urljoin(url, res.get(inner))
                filepath = os.path.join(pagefolder, filename)
                # point the tag at the local copy, relative to the templates/ folder
                res[inner] = '../' + os.path.join(pagefolder, filename).replace('\\', '/')
                if tag == 'img' and res.has_attr('srcset'):
                    res.attrs['srcset'] = ''  # force the browser to use the rewritten src
                if not os.path.isfile(filepath):  # has not been downloaded yet
                    with open(filepath, 'wb') as file:
                        file.write(session.get(fileurl).content)
            except Exception:
                pass  # one failed resource should not abort the whole pass
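
# For illustration (the asset path below is made up, not from a real page):
# the CSS rewrite in save_file turns
#     background: url(assets/bg.png);
# into
#     background: url(../sites/<name>_files/bg.png);
# where <name> is the page name passed to save_page. Quoted references such
# as url("assets/bg.png") are deliberately skipped by the (?!\") lookahead.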

def save_page(url, pagepath):
    """Fetch `url`, mirror its resources under sites/, and write the rewritten
    HTML to templates/<pagepath>.html."""
    path, _ = os.path.splitext(pagepath)
    pagefolder = f'sites/{path}_files'
    session = requests.Session()
    response = session.get(url)
    response.raise_for_status()  # surface HTTP errors; a ConnectionError from get() already propagates
    # let BeautifulSoup handle the decoding; the page is not guaranteed to be UTF-8
    soup = BeautifulSoup(response.content, "html.parser")
    tags_inner = {'img': 'src', 'link': 'href', 'script': 'src', 'style': '', 'base': ''}
    threads = []
    for tag, inner in tags_inner.items():  # save and rename resource files
        # one worker per tag type; each thread rewrites a disjoint set of tags
        thread = threading.Thread(target=save_file, args=(soup, pagefolder, session, url, tag, inner))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    os.makedirs('templates', exist_ok=True)
    with open(f'templates/{path}.html', 'wb') as file:
        file.write(soup.prettify('utf-8'))
# examples
# save_page('https://github.com/', 'github')
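
# A minimal usage sketch (the URL and page name here are assumptions for
# illustration, not part of the original script). save_page() writes the
# HTML to templates/example.html and the assets to sites/example_files/:
if __name__ == '__main__':
    save_page('https://example.com/', 'example')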