-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlink_spam.py
80 lines (67 loc) · 3.1 KB
/
link_spam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from UnitaryTest.test_tools import TestTools
#Combatting Link Spam. Exercise from Udacity course.
# One of the problems with our page ranking system is pages can
# collude with each other to improve their page ranks. We consider
# A->B a reciprocal link if there is a link path from B to A of length
# equal to or below the collusion level, k. The length of a link path
# is the number of links which are taken to travel from one page to the
# other.
# If k = 0, then a link from A to A is a reciprocal link for node A,
# since no links needs to be taken to get from A to A.
# If k=1, B->A would count as a reciprocal link if there is a link
# A->B, which includes one link and so is of length 1. (it requires
# two parties, A and B, to collude to increase each others page rank).
# If k=2, B->A would count as a reciprocal link for node A if there is
# a path A->C->B, for some page C, (link path of length 2),
# or a direct link A-> B (link path of length 1).
# Modify the compute_ranks code to
# - take an extra input k, which is a non-negative integer, and
# - exclude reciprocal links of length up to and including k from
# helping the page rank.
def is_reciprocal_link(graph, source, destination, k):
if k == 0:
if destination == source:
return True
return False
if source in graph[destination]:
return True
for node in graph[destination]:
if is_reciprocal_link(graph, source, node, k-1):
return True
return False
def compute_ranks(graph, k):
d = 0.8 # damping factor
numloops = 10
ranks = {}
npages = len(graph)
for page in graph:
ranks[page] = 1.0 / npages
for _ in range(0, numloops):
newranks = {}
for page in graph:
newrank = (1 - d) / npages
for node in graph:
if page in graph[node]:
if not is_reciprocal_link(graph, node, page, k):
newrank = newrank + d * (ranks[node]/len(graph[node]))
newranks[page] = newrank
ranks = newranks
return ranks
def main():
t = TestTools()
g = {'a': ['a', 'b', 'c'], 'b':['a'], 'c':['d'], 'd':['a']}
# the a->a link is reciprocal
t.new_test(func=compute_ranks)
t.evaluate_result(compute_ranks(g, 0), expected={'a': 0.26676872354238684, 'c': 0.1216391112164609,
'b': 0.1216391112164609, 'd': 0.1476647842238683})
# a->a, a->b, b->a links are reciprocal
t.new_test(func=compute_ranks)
t.evaluate_result(compute_ranks(g, 1), expected={'a': 0.14761759762962962, 'c': 0.08936469270123457,
'b': 0.04999999999999999, 'd': 0.12202199703703702})
# a->a, a->b, b->a, a->c, c->d, d->a links are reciprocal
# (so all pages end up with the same rank)
t.new_test(func=compute_ranks)
t.evaluate_result(compute_ranks(g, 2), expected={'a': 0.04999999999999999, 'c': 0.04999999999999999,
'b': 0.04999999999999999, 'd': 0.04999999999999999})
if __name__ == '__main__':
main()