forked from GregorUT/vgchartzScrape
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvgchartz-full-crawler.py
339 lines (299 loc) · 12.5 KB
/
vgchartz-full-crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
from bs4 import BeautifulSoup, element
from random import randint, choice
import urllib
import urllib.request
import pandas as pd
import numpy as np
import logging
import sys
import time
import json
def create_random_header():
    """
    Build an HTTP header dict carrying a randomly assembled User-Agent.

    The major/minor version bounds are read from the module-level
    ``properties`` mapping (``minimum_major_version``,
    ``maximum_major_version``, ``minimum_minor_version``,
    ``maximum_minor_version``).

    :return: dict with a single ``User-Agent`` key whose value has the
             form ``<browser>/<major>.<minor> (<os>)``
    """
    logging.info("create_random_header >>>")
    candidate_browsers = ["Mozilla", "Chrome"]
    candidate_systems = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"]
    # Draw order (major, minor, browser, OS) is kept as-is so a seeded
    # random generator yields the same sequence of headers.
    major = randint(properties['minimum_major_version'],
                    properties['maximum_major_version'])
    minor = randint(properties['minimum_minor_version'],
                    properties['maximum_minor_version'])
    browser = choice(candidate_browsers)
    system = choice(candidate_systems)
    header = {'User-Agent': f"{browser}/{major}.{minor} ({system})"}
    logging.debug("Current user_agent: {}".format(header))
    logging.info("create_random_header <<<")
    return header
def generate_remaining_url(*, query_parameters):
    """
    Render the query parameters as the trailing part of the listing URL.

    Every entry becomes ``&<name>=<value>``; an entry whose value is
    ``None`` is rendered with an empty value (``&<name>=``).

    :param query_parameters: mapping of parameter name to value
    :return: the concatenated ``&name=value`` string (no page number)
    """
    logging.info("generate_remaining_url >>>")
    pieces = []
    for name, value in query_parameters.items():
        rendered = "" if value is None else value
        pieces.append(f"&{name}={rendered}")
    reply = "".join(pieces)
    # "N" stands in for the page number in the logged URL.
    logging.debug(f"Url Generated: {base_url}N{reply}")
    logging.info("generate_remaining_url <<<")
    return reply
def get_page(*, url):
    """
    Perform a GET request to the given URL and return the response body.

    A freshly generated random User-Agent header is sent with the request,
    and the function sleeps for a random number of seconds (bounds taken
    from ``properties['minimum_sleep_time']`` and
    ``properties['maximum_sleep_time']``) after the download; together
    these are meant to help avoid HTTP 429 responses.

    :param url: webpage URL
    :return: the raw response body (bytes)
    :raises urllib.error.URLError: propagated from ``urlopen`` on failure
    """
    logging.info("get_page >>>")
    logging.debug("Current URL: {}".format(url))
    header = create_random_header()
    request = urllib.request.Request(url, headers=header)
    # The context manager closes the HTTP response (and its socket) even
    # when read() raises, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(request) as response:
        result = response.read()
    time.sleep(randint(properties['minimum_sleep_time'], properties['maximum_sleep_time']))
    logging.info("get_page <<<")
    return result
def get_genre(*, game_url):
    """
    Return the game genre scraped from the game's own page.

    This costs one extra HTTP request per game. When the page has no
    ``gameGenInfoBox`` div, or the box has no ``<h2>`` whose text is
    exactly ``Genre``, a warning is logged and an empty string is
    returned so that one odd page does not abort the whole crawl.

    :param game_url: URL of the game's detail page
    :return: genre of the game, or ``""`` when it cannot be located
    """
    logging.info("get_genre >>>")
    logging.debug("Page to download: {}".format(game_url))
    site_raw = get_page(url=game_url)
    sub_soup = BeautifulSoup(site_raw, "html.parser")
    genre_value = ""
    info_box = sub_soup.find("div", {"id": "gameGenInfoBox"})
    if info_box is None:
        logging.warning("Info box not found at: {}".format(game_url))
    else:
        # The info box layout is inconsistent among games, so scan every
        # h2 for the "Genre" heading; the value is the node right after it.
        genre_headings = [h2 for h2 in info_box.find_all('h2') if h2.string == 'Genre']
        # The last matching heading wins, as with a plain overwrite loop.
        heading = genre_headings[-1] if genre_headings else None
        if heading is not None and heading.next_sibling is not None:
            genre_value = heading.next_sibling.string
        else:
            logging.warning("Genre heading not found at: {}".format(game_url))
    logging.debug("Game genre: {}".format(genre_value))
    logging.info("get_genre <<<")
    return genre_value
def parse_number(*, number_string):
    """
    Parse a table cell's text into a float.

    A trailing ``m`` means millions (``"1.5m"`` -> ``1500000.0``).
    ``None``, empty/blank text and text starting with ``N/A`` are treated
    as missing and yield ``numpy.nan``. Surrounding whitespace is ignored.

    :param number_string: cell text, or ``None`` for an empty cell
    :return: the parsed float, or ``numpy.nan`` when the value is missing
    :raises ValueError: if the text is neither missing nor numeric
    """
    logging.info("parse_number >>>")
    # An empty cell can arrive as None rather than "", so normalise first.
    text = "" if number_string is None else str(number_string).strip()
    if not text or text.startswith("N/A"):
        reply = np.nan
    elif text.endswith("m"):
        reply = float(text[:-1]) * 1000000
    else:
        reply = float(text)
    logging.info("parse_number <<<")
    return reply
def parse_date(*, date_string):
    """
    Convert a table cell's date text into a pandas timestamp.

    ``None``, empty/blank text and text starting with ``N/A`` are treated
    as missing and yield the literal string ``'N/A'``. Surrounding
    whitespace is ignored.

    :param date_string: cell text (e.g. ``'18th Feb 20'``), or ``None``
    :return: result of ``pandas.to_datetime`` for the text, or ``'N/A'``
    """
    logging.info("parse_date >>>")
    # An empty cell can arrive as None rather than "", so normalise first.
    text = "" if date_string is None else str(date_string).strip()
    if not text or text.startswith('N/A'):
        date_formatted = 'N/A'
    else:
        date_formatted = pd.to_datetime(text)
    logging.debug("Date parsed: {}".format(date_formatted))
    logging.info("parse_date <<<")
    return date_formatted
def add_current_game_data(*,
                          current_rank,
                          current_game_name,
                          current_game_genre,
                          current_platform,
                          current_publisher,
                          current_developer,
                          current_vgchartz_score,
                          current_critic_score,
                          current_user_score,
                          current_total_shipped,
                          current_total_sales,
                          current_sales_na,
                          current_sales_pal,
                          current_sales_jp,
                          current_sales_ot,
                          current_release_date,
                          current_last_update):
    """
    Append one game's fields to the module-level column buffers.

    Publisher and developer are stripped of surrounding whitespace when
    they are strings; a non-string value (e.g. ``None`` from an empty
    cell) is stored unchanged instead of raising.

    All values are prepared before any buffer is touched, so the buffers
    always grow together and keep equal lengths.
    """
    logging.info("add_current_game_data >>>")

    def _stripped(text):
        # Only strings can be stripped; pass anything else through as-is.
        return text.strip() if isinstance(text, str) else text

    row = (
        (game_name, current_game_name),
        (rank, current_rank),
        (platform, current_platform),
        (genre, current_game_genre),
        (publisher, _stripped(current_publisher)),
        (developer, _stripped(current_developer)),
        (vgchartz_score, current_vgchartz_score),
        (critic_score, current_critic_score),
        (user_score, current_user_score),
        (total_shipped, current_total_shipped),
        (total_sales, current_total_sales),
        (sales_na, current_sales_na),
        (sales_pal, current_sales_pal),
        (sales_jp, current_sales_jp),
        (sales_ot, current_sales_ot),
        (release_date, current_release_date),
        (last_update, current_last_update),
    )
    for buffer, value in row:
        buffer.append(value)
    logging.info("add_current_game_data <<<")
def download_data(*, start_page, end_page, include_genre):
    """
    Download games data from vgchartz and store it in the column buffers.

    Every listing page from ``start_page`` to ``end_page`` (both
    inclusive) is fetched; the page URL is ``base_url`` + page number +
    ``remaining_url``. Each game row found is parsed and handed to
    ``add_current_game_data``.

    :param start_page: first page number to download
    :param end_page: last page number to download (inclusive)
    :param include_genre: when truthy, fetch each game's own page to read
                          its genre (one extra HTTP request per game);
                          otherwise the genre is stored as ``""``
    :return: None
    """
    logging.info("download_data >>>")
    # NOTE(review): an earlier comment stated that results are ordered by
    # decreasing shipped units; that depends on the configured query
    # parameters - confirm against cfg/resources.json.
    downloaded_games = 0
    game_url_prefix = 'https://www.vgchartz.com/game/'
    for page in range(start_page, end_page + 1):
        page_url = "{}{}{}".format(base_url, page, remaining_url)
        current_page = get_page(url=page_url)
        soup = BeautifulSoup(current_page, features="html.parser")
        logging.info("Downloaded page {}".format(page))
        # Games are located through the <a> tags pointing at game URLs.
        # Anchors without an href are skipped rather than raising KeyError.
        # The first 10 matches are discarded because those links are in
        # the navigation bar.
        game_tags = [
            anchor for anchor in soup.find_all("a")
            if anchor.attrs.get('href', '').startswith(game_url_prefix)
        ][10:]
        for tag in game_tags:
            # get_text() also works when the anchor wraps nested markup,
            # where .string would be None.
            current_game_name = " ".join(tag.get_text().split())
            # Walk up the DOM to the row and collect its cells.
            data = tag.parent.parent.find_all("td")
            logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_game_name))
            # The genre requires another HTTP request, so it is optional.
            current_game_genre = ""
            if include_genre:
                current_game_genre = get_genre(game_url=tag.attrs['href'])
            add_current_game_data(
                current_rank=np.int32(data[0].string),
                current_game_name=current_game_name,
                current_game_genre=current_game_genre,
                current_platform=data[3].find('img').attrs['alt'],
                current_publisher=data[4].string,
                current_developer=data[5].string,
                current_vgchartz_score=parse_number(number_string=data[6].string),
                current_critic_score=parse_number(number_string=data[7].string),
                current_user_score=parse_number(number_string=data[8].string),
                current_total_shipped=parse_number(number_string=data[9].string),
                current_total_sales=parse_number(number_string=data[10].string),
                current_sales_na=parse_number(number_string=data[11].string),
                current_sales_pal=parse_number(number_string=data[12].string),
                current_sales_jp=parse_number(number_string=data[13].string),
                current_sales_ot=parse_number(number_string=data[14].string),
                current_release_date=parse_date(date_string=data[15].string),
                current_last_update=parse_date(date_string=data[16].string))
            downloaded_games += 1
    logging.info("Number of downloaded resources: {}".format(downloaded_games))
    logging.info("download_data <<<")
def save_games_data(*, filename, separator, enc):
    """
    Write the module-level column buffers to a delimited text file.

    :param filename: path of the output file
    :param separator: field delimiter passed to ``DataFrame.to_csv``
    :param enc: text encoding passed to ``DataFrame.to_csv``
    """
    logging.info("save_games_data >>>")
    # Output column order, paired below with the buffer feeding each one.
    column_order = [
        'Rank', 'Name', 'Genre', 'Platform', 'Publisher', 'Developer',
        'Vgchartz_Score', 'Critic_Score', 'User_Score', 'Total_Shipped',
        'Total_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales',
        'Release_Date', 'Last_Update',
    ]
    buffers = (
        rank, game_name, genre, platform, publisher, developer,
        vgchartz_score, critic_score, user_score, total_shipped,
        total_sales, sales_na, sales_pal, sales_jp, sales_ot,
        release_date, last_update,
    )
    df = pd.DataFrame(dict(zip(column_order, buffers)))
    logging.debug("Dataframe column name: {}".format(df.columns))
    # Explicit selection pins the column order of the written file.
    df = df[column_order]
    df.to_csv(filename, sep=separator, encoding=enc, index=False)
    logging.info("save_games_data <<<")
if __name__ == "__main__":
    # Column buffers: one list per output column, filled by
    # add_current_game_data and written out by save_games_data.
    rank = []
    game_name = []
    genre = []
    platform = []
    publisher, developer = [], []
    critic_score, user_score, vgchartz_score = [], [], []
    total_shipped = []
    total_sales, sales_na, sales_pal, sales_jp, sales_ot = [], [], [], [], []
    release_date, last_update = [], []

    # Crawler configuration, read by the functions above as a global.
    with open("cfg/resources.json") as file:
        properties = json.load(file)

    # Drop any pre-existing root handlers so basicConfig takes effect.
    logging.root.handlers = []
    logging.basicConfig(format='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
                        level=logging.DEBUG,
                        filename=properties["application_log_filename"])
    # set up logging to console
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    # set a format which is simpler for console use
    formatter = logging.Formatter(fmt='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
                                  datefmt="%d-%m-%Y %H:%M:%S")
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)

    try:
        logging.info('Application started')
        base_url = properties['base_page_url']
        remaining_url = generate_remaining_url(query_parameters=properties['query_parameters'])
        download_data(
            start_page=properties['start_page'],
            end_page=properties['end_page'],
            include_genre=properties['include_genre'])
        save_games_data(
            filename=properties['output_filename'],
            separator=properties['separator'],
            enc=properties['encoding'])
    except Exception:
        # Catch Exception only, so Ctrl-C (KeyboardInterrupt) and
        # SystemExit still stop the program. logging.exception records
        # the full traceback in both the log file and the console.
        logging.exception("Global exception")