forked from controversial/wikipedia-map
-
Notifications
You must be signed in to change notification settings - Fork 0
/
python-prototype.py
83 lines (65 loc) · 2.68 KB
/
python-prototype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""This is my original code, which I spent about 2 hours on. I thought I'd
include it just for fun. As you can see, the code has been completely rewritten
since this version. The `wikipedia` module only provides support for getting all
page links. Once I decided I wanted to work with only the first-paragraph links,
I wrote my own implementation in BeautifulSoup. This version generates a static
PNG. I think an interactive JavaScript front-end is much more fun ;)
"""
# Scrapes wikipedia. Start with the name of a page. Then it follows a sample of
# 5 links on this page. For each of these links, it follows 5 links on *that*
# page. It will not stray more than 5 pages away from the start page. These
# attributes can be adjusted by changing BREADTH and MAXDEPTH. This script
# will output a Graphviz .dot file of your wikipedia map.
#REQUIREMENTS: `wikipedia` and `pydot`
import wikipedia as wp
import pydot
def ascii(inp):
    """Return `inp` with every non-ASCII character silently dropped.

    NOTE(review): this shadows the builtin ``ascii``; the name is kept so
    existing callers in this file keep working.
    """
    # Round-trip through an ASCII encode/decode. The original
    # ``str(inp.encode(...))`` form produces a "b'...'" repr string on
    # Python 3 instead of the stripped text.
    return inp.encode("ascii", errors="ignore").decode("ascii")
class WikiScraper:
    """Build a pydot graph of Wikipedia pages by recursively following links.

    Starting from `startpage`, each explored page contributes up to
    `maxbreadth` outgoing edges; recursion stops at the depth passed to
    :meth:`start`.
    """

    def __init__(self, startpage, maxbreadth=10):
        """Remember the start page and fan-out; create an empty graph."""
        self.startpage = startpage
        self.maxbreadth = maxbreadth
        self.maxdepth = 0          # overwritten by start()
        self.visited = set()       # page titles already expanded
        self.graph = pydot.Dot()

    def connect(self, parent, children):
        """Mark `parent` as visited and add a parent->child edge per child."""
        self.visited.add(parent)
        for child in children:
            edge = pydot.Edge(ascii(parent), ascii(child))
            self.graph.add_edge(edge)

    def pickLinks(self, page, n):
        """Pick up to `n` roughly evenly spaced links from `page.links`."""
        links = page.links
        # Integer step through the link list. max(1, ...) guards against a
        # zero step (range() raises ValueError on step 0) when the page has
        # fewer than n links; the original Py2 `len(links)/n` crashed there.
        step = max(1, len(links) // n)
        return [links[i] for i in range(1, len(links), step)]

    def explore(self, pagename, depth):
        """Recursively expand `pagename`, stopping at maxdepth or on revisit.

        Disambiguation pages and missing pages are skipped silently / with a
        message rather than aborting the whole crawl.
        """
        # NOTE(review): recursion only stops when depth reaches maxdepth
        # exactly; with the default maxdepth=0 this never triggers (depth
        # starts at 1) — preserved from the original, callers pass a limit.
        if depth == self.maxdepth:
            return
        # Don't expand a page twice.
        if pagename in self.visited:
            return
        try:
            page = wp.page(pagename)
        except wp.exceptions.DisambiguationError:
            # Disambiguation pages have no single article to follow.
            return
        except wp.exceptions.PageError:
            # The requested article does not exist.
            print("The page {} could not be found".format(pagename))
            return
        print('Exploring "{}" at depth {}'.format(pagename, depth))
        links = self.pickLinks(page, self.maxbreadth)
        self.connect(pagename, links)
        for link in links:
            self.explore(link, depth + 1)

    def start(self, maxdepth=0):
        """Kick off the crawl from the start page with the given depth limit."""
        self.maxdepth = maxdepth
        self.explore(self.startpage, 1)
if __name__ == "__main__":
    # Crawl parameters: root article, links followed per page, depth limit.
    STARTPAGE = "Cats"
    BREADTH = 5
    MAXDEPTH = 5

    scraper = WikiScraper(STARTPAGE, BREADTH)
    scraper.start(MAXDEPTH)
    # Persist the crawl as a Graphviz dot file named after the start page.
    scraper.graph.write(STARTPAGE + ".dot")