-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleaning.py
More file actions
83 lines (63 loc) · 1.89 KB
/
data_cleaning.py
File metadata and controls
83 lines (63 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
import copy
# Keys that add_missing_columns guarantees are present on every item.
labels = [
    'items_id',
    'items_title',
    'items_country',
    'items_dataProvider',
    'items_type',
    'items_edmTimespanLabelLangAware_def',
    'items_dcCreator',
    'items_edmPreview',
]
# new_labels = {
# 'items_dataProvider': 'provider_name',
# 'items_country': 'provider_country',
# 'items_dcCreator': 'creator',
# 'items_edmPreview': 'preview',
# 'items_type': 'type',
# 'items_title': 'title',
# 'items_id': 'id',
# 'items_edmTimespanLabelLangAware_def': 'creation_date'
# }
def add_missing_columns(items):
    '''
    Return a deep copy of items in which every entry has all expected keys.

    Any key listed in the module-level ``labels`` that an entry lacks is
    added to the copied entry with an empty string as its value. The
    input is left unmodified.
    '''
    completed = copy.deepcopy(items)
    for original, filled in zip(items, completed):
        # Keys are looked up on the original entry, values written to the copy.
        absent = set(labels).difference(set(original.keys()))
        for key in absent:
            filled[key] = ''
    return completed
def clean_list(value):
    '''
    Collapse a list of values into one space-separated string.

    Anything that is not a list is returned unchanged.
    '''
    return ' '.join(value) if isinstance(value, list) else value
def parse_date(items):
    '''
    Replace each item's timespan label with the integer date found in it.

    The longest run of digits in the
    'items_edmTimespanLabelLangAware_def' value is kept and converted to
    an int (the first such run wins on ties). When the value contains no
    digits, is missing, or is not a string, the field is set to -1.

    Returns a deep copy of items; the input is left unmodified.
    '''
    date_key = 'items_edmTimespanLabelLangAware_def'
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    digit_run = re.compile(r'\d+')
    new_items = copy.deepcopy(items)
    for new_item in new_items:
        raw = new_item.get(date_key, '')
        # Only strings can be scanned; anything else counts as "no date"
        # rather than raising from inside the regex engine.
        runs = digit_run.findall(raw) if isinstance(raw, str) else []
        try:
            date = int(max(runs, key=len)) if runs else -1
        except ValueError:
            # int() rejects digit strings beyond the interpreter's
            # configured length limit (Python 3.11+).
            date = -1
        new_item[date_key] = date
    return new_items
def parse_user_dates(usr_data):
    '''
    Parse the user-supplied 'from' and 'to' date bounds as integers.

    A bound that cannot be read from usr_data or converted with int()
    falls back to its default: 0 for 'from' and 3000 for 'to'.

    Returns a dict with the keys 'from' and 'to'.
    '''
    defaults = (('from', 0), ('to', 3000))
    parsed = {}
    for bound, fallback in defaults:
        try:
            parsed[bound] = int(usr_data[bound])
        except Exception:
            parsed[bound] = fallback
    return parsed