crawler.py
import json
import urllib2


class Crawler:
    SHOW_REQUESTED_URLS_FOR_DEBUGGING = True

    def __init__(self, service_url, logger, db_cursor):
        self.service_url = service_url  # e.g., "https://gerrit.wikimedia.org/r"
        self.api_url = "{}/changes/".format(self.service_url)
        self.logger = logger
        self.https_request_count = 0
        self.db_cursor = db_cursor

    def request(self, relative_url):
        url = "{}{}".format(self.api_url, relative_url)
        if Crawler.SHOW_REQUESTED_URLS_FOR_DEBUGGING:
            self.logger.debug("Visiting %s" % url)
        self.https_request_count += 1
        if self.https_request_count % 5 == 0:
            self.logger.warning("Total HTTPS requests made: %d" % self.https_request_count)
        return urllib2.urlopen(url).read()

    # The parameter file_path should NOT be slash escaped.
    def build_diff_path(self, change_id, revision_no, file_path):
        # e.g., https://gerrit.wikimedia.org/r/#/c/356586/6/modules/gerrit/templates/gerrit.config.erb
        pattern = "{}/#/c/{}/{}/{}"
        return pattern.format(self.service_url, change_id, revision_no, file_path)

    def request_json(self, relative_url):
        response = self.request(relative_url)
        # Gerrit prefixes JSON responses with ")]}'" to prevent XSSI; strip it before parsing.
        if response[:4] == ")]}'":
            response = response[4:]
        return json.loads(response)

    def is_merged(self, change_id):
        response = self.request_json(change_id)
        return response["status"] == "MERGED"

    def revision_numbers(self, change_id):
        relative_url = "{}/?o=ALL_REVISIONS".format(change_id)
        response = self.request_json(relative_url)
        numbers = [value["_number"] for value in response["revisions"].values()]
        numbers.sort()
        return numbers

    def revision_count(self, change_id):
        return len(self.revision_numbers(change_id))

    # TODO: Use /content instead of /download (https://github.com/miyagilabs/revisionCrawler/issues/1)
    # The parameter file_path should be slash escaped.
    def download_base_file(self, change_id, file_path):
        valid_revision_number = self.revision_numbers(change_id)[0]
        # Note: "parent=1" specifies that we request the file in the parent commit.
        relative_url = "{}/revisions/{}/files/{}/download?parent=1".format(
            change_id,
            valid_revision_number,
            file_path)
        # e.g., https://gerrit.wikimedia.org/r/changes/356858/revisions/1/files/SpamBlacklistHooks.php/download?parent=1
        return self.request(relative_url)

    # TODO: Use /content instead of /download (https://github.com/miyagilabs/revisionCrawler/issues/1)
    # The parameter file_path should be slash escaped.
    def download_revision_file(self, change_id, revision_no, file_path):
        pattern = "{}/revisions/{}/files/{}/download"
        relative_url = pattern.format(change_id, revision_no, file_path)
        return self.request(relative_url)

    def files_in_revision(self, change_id, revision_no):
        # e.g., https://gerrit.wikimedia.org/r/changes/356858/revisions/1/files/
        relative_url = "{}/revisions/{}/files/".format(change_id, revision_no)
        response = self.request_json(relative_url)
        # Skip the pseudo-file that holds the commit message.
        condition = lambda file_name: file_name != "/COMMIT_MSG"
        return filter(condition, response.keys())

    # The parameter file_path should be slash escaped.
    def has_diff(self, change_id, file_path, revision_no, base_revision_no):
        pattern = "{}/revisions/{}/files/{}/diff?base={}"
        relative_url = pattern.format(
            change_id,
            revision_no,
            file_path,
            base_revision_no)
        response = self.request_json(relative_url)
        # Gerrit returns the diff as a list of "content" sections; treat more
        # than one section as a real difference between the two revisions.
        return len(response["content"]) > 1

    def slash_escaped_file_path(self, path):
        return path.replace("/", "%2F")

    def unescape_slash(self, path):
        return path.replace("%2F", "/")

    # The parameter file_path should NOT be slash escaped.
    def print_file_comments_in_revision(self, change_id, revision_no, file_path):
        relative_url = "{}/revisions/{}/comments".format(change_id, revision_no)
        response = self.request_json(relative_url)
        for comment_map in response[file_path]:
            print comment_map["author"]["name"] + " says: \"\n " + comment_map["message"]
            print "\""

    # The parameter file_path should NOT be slash escaped.
    def has_comments_for_file_in_revision(self, change_id, revision_no, file_path):
        relative_url = "{}/revisions/{}/comments".format(change_id, revision_no)
        response = self.request_json(relative_url)
        return len(response) > 0 and file_path in response

    # The parameter file_path should NOT be slash escaped.
    def has_comments_for_file(self, change_id, file_path, cached_revision_nos=None):
        numbers = cached_revision_nos if cached_revision_nos else self.revision_numbers(change_id)
        for n in numbers:
            if self.has_comments_for_file_in_revision(change_id, n, file_path):
                return True
        return False

    def insert_status(self, change_id, status, detail):
        # Use DB-API placeholders instead of string formatting so quotes in
        # "detail" cannot break the statement. The "?" paramstyle assumes an
        # sqlite3-style cursor; adjust the placeholders if the driver differs.
        self.db_cursor.execute(
            "INSERT INTO changeIdStatus(change_id, status, detail) VALUES (?, ?, ?)",
            (change_id, status, detail))

    def insert_corrected_file_url(self, change_id, url):
        self.db_cursor.execute(
            "INSERT INTO correctedFileUrl(change_id, url) VALUES (?, ?)",
            (change_id, url))
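

# Usage sketch: one way a Crawler instance might be wired up. The sqlite3
# database name ("crawler.db"), the table column types, and the sample change
# id ("356858", taken from the example URL in the comments above) are
# illustrative assumptions, not values prescribed by this module.
if __name__ == "__main__":
    import logging
    import sqlite3

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("crawler")

    connection = sqlite3.connect("crawler.db")
    cursor = connection.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS changeIdStatus(change_id INTEGER, status TEXT, detail TEXT)")
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS correctedFileUrl(change_id INTEGER, url TEXT)")

    crawler = Crawler("https://gerrit.wikimedia.org/r", logger, cursor)
    change_id = "356858"
    if crawler.is_merged(change_id):
        # List the files touched by each revision of the change.
        for revision_no in crawler.revision_numbers(change_id):
            print crawler.files_in_revision(change_id, revision_no)

    connection.commit()
    connection.close()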