visa_extract_parser.py
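"""
Parse Google Analytics hit data for the visa extract into per-session URL
sequences, then write:
  - visa_extract_ausgov_journeys.csv : repeated session journeys with counts
  - visa_extract_ausgov_fromto.csv   : weighted from/to URL edges
  - visa_extract_ausgov.gexf         : the same edges as a directed graph
"""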
DATA_DIR = '../data/'

import pandas as pd
import networkx as nx
from unidecode import unidecode

# https://stackoverflow.com/a/7734686
import sys
if sys.version_info.major == 3:
    from urllib.parse import urlencode, urlparse, urlunparse, parse_qs
else:
    from urllib import urlencode
    from urlparse import urlparse, urlunparse, parse_qs
def clean_url_queries(url):
    """Strip tracking and pagination query parameters from a URL."""
    # https://stackoverflow.com/a/7734686
    if '?' not in url:
        return url
    u = urlparse(url)
    query = parse_qs(u.query, keep_blank_values=True)
    for q in ['page', 'fbclid', 'from', 'isappinstalled', 'sfns', '_e_pi_', 'from_source', 'ifm', 'mt']:
        query.pop(q, None)
    u = u._replace(query=urlencode(query, True))
    return urlunparse(u)
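
# illustrative example (hypothetical URL):
#   clean_url_queries('https://example.gov.au/visa?page=2&q=work')
#   -> 'https://example.gov.au/visa?q=work'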

G = nx.DiGraph()
chunksize = 10000  # http://acepor.github.io/2017/08/03/using-chunksize/

# accumulators: per-session URL sequences, from/to edge counts,
# and deduplicated journey sequences keyed by hash
raw_sequences = {}
raw_from_to = {}
sequence_hashes = {}
sequence_counts = {}

for f in ['data_visa_extract_ausgov.csv']:  # , 'data_visa_extract_immi.csv']:
    lastSessionId = None
    # read CSV in chunks to reduce memory overload
    for chunk in pd.read_csv(DATA_DIR + f, chunksize=chunksize):
        for index, row in chunk.iterrows():
            # concatenate (not numerically add) the IDs so each visitor/visit pair is unique
            sessionId = str(row['fullVisitorId']) + str(row['visitId'])
            if sessionId not in raw_sequences:
                raw_sequences[sessionId] = []
            # look at URLs from page hit paths and outbound link click events
            for target in [row['pagePath'], row['outboundLinkURL']]:
                # ignore blank columns and google translate URLs
                if not pd.isnull(target) and not target.startswith('/translate'):
                    target = unidecode(target)
                    # normalise paths to start with australia.gov.au
                    if target[0] == '/':
                        target = 'https://www.australia.gov.au' + target
                    target = clean_url_queries(target)
                    # remove duplicate hits eg. page then click on same URL
                    if target not in raw_sequences[sessionId]:
                        # if there is a session history, start building from/to edges
                        if len(raw_sequences[sessionId]) > 0:
                            previousAusGovPage = raw_sequences[sessionId][-1]
                            # click events should come back to ausgov on from/to network
                            if row['hitType'] == 'EVENT':
                                for seq in reversed(raw_sequences[sessionId]):
                                    if 'australia.gov.au' in seq:
                                        previousAusGovPage = seq
                                        break  # stop at the most recent australia.gov.au page
                            if previousAusGovPage not in raw_from_to:
                                raw_from_to[previousAusGovPage] = {}
                            # increment count for edge weight
                            raw_from_to[previousAusGovPage][target] = \
                                raw_from_to[previousAusGovPage].get(target, 0) + 1
                        # add hit URL to session sequence of URLs
                        raw_sequences[sessionId].append(target)
            # start a new session data object when the session id changes
            if lastSessionId != sessionId:
                sequenceHash = hash(str(raw_sequences[sessionId]))
                if sequenceHash not in sequence_hashes:
                    sequence_hashes[sequenceHash] = raw_sequences[sessionId]
                sequence_counts[sequenceHash] = sequence_counts.get(sequenceHash, 0) + 1
            # print progress of CSV parsing
            if index % 10000 == 0:
                print(f, index)
            lastSessionId = sessionId
    print(f, 'done')

# write session journeys to CSV
with open('visa_extract_ausgov_journeys.csv', 'w') as csvfile:
    csvfile.write("%s,%s\n" % ('count', 'journey_path'))
    for shash, scount in sequence_counts.items():
        # write to CSV journeys that have happened more than once
        if scount > 1:
            csvfile.write('%s,"%s"\n' % (scount, ';'.join(sequence_hashes[shash])))
        # print frequent journeys
        if scount > 500:
            print(scount, sequence_hashes[shash])

# write from/to edges to GEXF and CSV
with open('visa_extract_ausgov_fromto.csv', 'w') as csvfile:
    csvfile.write("%s,%s,%s\n" % ('from_url', 'to_url', 'weight'))
    for from_url, urls in raw_from_to.items():
        for to_url, weight in urls.items():
            G.add_edge(from_url, to_url, weight=weight)
            csvfile.write("%s,%s,%s\n" % (from_url, to_url, weight))

nx.write_gexf(G, "visa_extract_ausgov.gexf")
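
# optional sanity check (a minimal sketch, assuming the GEXF file above was
# written successfully): reload the graph and confirm it has nodes and edges
#   import networkx as nx
#   G2 = nx.read_gexf("visa_extract_ausgov.gexf")
#   print(G2.number_of_nodes(), G2.number_of_edges())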