find_edges.py
from urllib.parse import urlparse
from get_page import *    # provides get_page()
from htmlparser import *  # provides the module-level parser instance


def find_edges(g, md):
    """
    Fetch each URL in md and feed its HTML to the parser, which collects
    the hyperlinks found in the page. Each link is validated and, if it
    points to another vertex of the graph, added as an edge.
    """
    for key in md.keys():
        try:
            filename, html, headers = get_page(key)
        except Exception:
            # Report URLs that could not be fetched
            print(key)
            continue
        try:
            # Read and parse the HTML file
            s = str(html.read())
            parser.links = []
            parser.feed(s)
        except Exception:
            print("Invalid HTML file")
            print(key)
            continue
        for link in parser.links:
            if link == '':
                continue
            # Ignore fragment-only links (in-page anchors)
            if link[0] == '#':
                continue
            # Ignore javascript: links
            elif link[0:10] == 'javascript':
                continue
            # Ignore protocol-relative URLs
            elif link[0:2] == '//':
                continue
            # Resolve relative paths against the page's own URL
            elif link[0] == '/':
                scheme, netloc, path, params, query, fragment = urlparse(key)
                link = scheme + '://' + netloc + link
            # Ignore schemes other than http/https
            elif link[0:4] != 'http':
                continue
            if link in g.vertices():
                if key != link and not g.is_edge((key, link)):
                    g.add_edge((key, link))
    return g
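

# Usage sketch. The graph class is not defined in this file; the calls above
# only require vertices(), is_edge(), and add_edge(), so a minimal stand-in
# (an illustrative assumption, not the project's actual graph class) could
# look like this:
#
# class SimpleGraph:
#     def __init__(self, urls):
#         self._vertices = set(urls)  # seed URLs become the vertices
#         self._edges = set()
#     def vertices(self):
#         return self._vertices
#     def is_edge(self, e):
#         return e in self._edges
#     def add_edge(self, e):
#         self._edges.add(e)
#
# seeds = {'http://example.com/': None, 'http://example.com/about': None}
# g = find_edges(SimpleGraph(seeds.keys()), seeds)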