-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_data.py
104 lines (91 loc) · 3.39 KB
/
scrape_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from bs4 import BeautifulSoup
from pymongo import MongoClient
import requests
import pickle
import re
import os
from pprint import pprint
from conf import BASE_URL, GCG_FOLDER_PATH, REST_ENDPOINT
import json
import traceback
from IPython import embed
LAST_OFFSET = 25302
DIFFERENCE = 100
def get_div_text(parent_object, class_name):
div_object = parent_object.find('div', {'class': class_name})
if div_object is None:
return None
return div_object.text
def get_players_details(players_links):
details = list()
for player_link in players_links:
player_url_suffix = player_link.get("href")
player_id = player_url_suffix.split("=", 1)[-1]
response = requests.get(REST_ENDPOINT + "player.php?player=" + player_id)
player_details = json.loads(response.text)
player_details = player_details['player']
player_details["wins"] = player_details.pop("w")
player_details["losses"] = player_details.pop("l")
player_details["ties"] = player_details.pop("t")
details.append(player_details)
return details
def get_game_details(game_link):
game_details = dict()
game_url_suffix = game_link.get("href")
game_details["annotation_id"] = game_url_suffix.split("=", 1)[-1]
annotated_game_url = BASE_URL + game_url_suffix
response = requests.get(annotated_game_url)
game_details_html = response.text
soup = BeautifulSoup(game_details_html, "html.parser")
game_details["gcg_file_path"] = os.path.join(GCG_FOLDER_PATH, str(game_details["annotation_id"]) + '.gcg')
if not os.path.exists(game_details["gcg_file_path"]):
print "Saving gcg file"
gcg_file_tag = soup.select_one("a[href*=annotated/selfgcg]")
gcg_file_link = BASE_URL + gcg_file_tag.get("href")
response = requests.get(gcg_file_link)
with open(game_details["gcg_file_path"], 'w+') as f:
f.write(response.text)
else:
print "GCG file already present"
return game_details
with open("current_offset", 'rb') as fp:
current_offset = pickle.load(fp)
if not os.path.exists(GCG_FOLDER_PATH):
os.mkdir(GCG_FOLDER_PATH)
client = MongoClient()
db = client.cross_tables
player = db.player
game = db.game
index_collection = db.index
offset = index_collection.find_one()
starting_offset = offset['offset']
row_index = offset["row_index"]
pagination_pages = range(starting_offset, LAST_OFFSET, DIFFERENCE)
for current_offset in pagination_pages:
offset["offset"] = current_offset
index_collection.save(offset)
print "Current offset - ", current_offset
response = requests.get('http://www.cross-tables.com/annolistself.php?offset=' + str(current_offset))
html = response.text
soup = BeautifulSoup(html, "html.parser")
table = soup.find('table', id="xtdatatable")
rows = table.findChildren('tr')
for index, row in enumerate(rows[row_index:], row_index):
offset["row_index"] = index
index_collection.save(offset)
if row.get('id') == "headerrow":
continue
print "Current Row - ", index
all_row_links = row.findAll('a')
try:
game_details = get_game_details(all_row_links[0])
player1_details, player2_details = get_players_details(all_row_links[1:])
except:
print traceback.format_exc()
continue
game_details["player1_id"] = player1_details[u'playerid']
game_details["player2_id"] = player2_details[u'playerid']
player.update(player1_details, player1_details, upsert=True)
player.update(player2_details, player2_details, upsert=True)
game.update(game_details, game_details, upsert=True)
print "\n##############################################\n"