grab.py
#!/usr/bin/env python3
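"""Scrape pinboard.in/popular for twitter.com links and write them to output.html."""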
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import urlopen
import re
# grab URL, parse links
a_tags = SoupStrainer('a')  # prefilter: only parse <a> tags
# testing data to prevent ban from live site while running tests
# url = open('html.html', 'r')
# soup = BeautifulSoup(url, 'html.parser', parse_only=a_tags)
# live data
url = 'https://pinboard.in/popular'
soup = BeautifulSoup(urlopen(url), 'html.parser', parse_only=a_tags)
# filter all but twitter.com links
# pop_tweets = soup.find_all(href=re.compile('//twitter.com/'), class_='url_display') # for test file
pop_tweets = soup.find_all(href=re.compile('//twitter.com/')) # for live site
# format links to HTML page
pop_clean = '<html>\n<body>\n<ul>\n'
for x in pop_tweets:
    del x['class']  # strip the class attribute so only the bare link remains
    pop_clean += '<li> %s </li>' % (x)
pop_clean += '</ul>\n</body>\n</html>'
soup_clean = BeautifulSoup(pop_clean, 'html.parser')
soup_clean_pretty = soup_clean.prettify()
# write to file
with open('output.html', 'w') as f:
    f.write(soup_clean_pretty)
# print(soup_clean_pretty)
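
The commented-out lines above hint at an offline test path that parses a saved copy of the page instead of hitting the live site. A minimal sketch of that path, assuming a local html.html snapshot exists and that its twitter links carry the url_display class mentioned in the comments:

#!/usr/bin/env python3
# Offline test sketch: assumes a saved snapshot named html.html sits next to this script.
from bs4 import BeautifulSoup, SoupStrainer
import re

a_tags = SoupStrainer('a')  # prefilter: only parse <a> tags
with open('html.html', 'r') as fh:
    soup = BeautifulSoup(fh, 'html.parser', parse_only=a_tags)

# Same href filter as the live path, narrowed by the url_display class used in the test file.
pop_tweets = soup.find_all(href=re.compile('//twitter.com/'), class_='url_display')
print('%d twitter links found' % len(pop_tweets))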