modelTrain-ov-win.py
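# Trains an XGBoost 'multi:softprob' classifier (12 classes) on pre-built feature
# CSVs, validates it on a random hold-out split using a custom NDCG@5 metric, and
# writes a submission CSV listing the top-5 predicted country destinations per
# test id.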
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# import matplotlib.pyplot as plt
import xgboost as xgb
import time
from datetime import datetime as dt
import operator
TRAIN_FINAL_CSV = 'training_features.csv'
TEST_FINAL_CSV = 'testing_features.csv'
LABEL_FINAL_CSV = 'labels.csv'
OUT_FOLDER = '../outputs/'
MODEL_FOLDER = '../models/'
MAX_DEPTH = 6
ETA = 0.01
NUM_ROUND = 2000
SUB_SAMPLE = 0.8
COL_SAMPLE = 0.8
EARLY_STOP = 100
COL_SAMPLE_SEED = 0
XGBOOST_SEED = 0
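# Column subsampling is done manually below (a COL_SAMPLE fraction of the feature
# columns is drawn with pandas for each training attempt), so the built-in
# colsample_bytree is left at 1.0 in the booster parameters.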
param = {'max_depth': MAX_DEPTH, 'eta': ETA, 'silent': 1, 'objective': 'multi:softprob', 'subsample': SUB_SAMPLE,
'colsample_bytree': 1.0, 'num_class': 12, 'seed': XGBOOST_SEED}
# NDCG@5: custom feval for xgb.train. Each row has exactly one relevant class, so
# the score is 1/log2(rank+1) if the true label appears in the top-5 predictions
# and 0 otherwise, averaged over all rows.
def ndcg5(preds, dtrain):
    k = 5
    y_true = dtrain.get_label()
    n = len(y_true)
    # Indices of the k highest-probability classes per row, best first
    index = np.argsort(preds, axis=1)
    top = index[:, -k:][:, ::-1]
    # 0/1 relevance: does position j hold the true label?
    rel = (np.reshape(y_true, (n, 1)) == top).astype(int)
    cal_dcg = lambda y: sum((2 ** y - 1) / np.log2(range(2, k + 2)))
    ndcg = np.mean(np.apply_along_axis(cal_dcg, 1, rel))
    return 'ndcg5', ndcg
def p_ndcg5(preds, dtrain):
    name, q = ndcg5(preds, dtrain)
    return 'p_ndcg5', q * 100
startTime = dt.now()
le = LabelEncoder()
print("start reading CSVs")
destination = pd.read_csv(LABEL_FINAL_CSV, header=0, index_col=0)
x = pd.read_csv(TRAIN_FINAL_CSV, header=0, index_col=0)
x_test = pd.read_csv(TEST_FINAL_CSV, header=0, index_col=0)
print("finished reading CSVs")
# Fit the label encoder once on the full label column so the same class mapping
# is reused for the hold-out subsets below.
y = le.fit_transform(destination['country_destination'].values)
idSave = x_test.index
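# 80/20 hold-out split: fit on x_in, early-stop and validate on x_out.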
HOLD_OUT_SEED = 0
x_flg = pd.concat([x, destination], axis=1)
x_in = x_flg.sample(frac=0.8, random_state=HOLD_OUT_SEED, axis=0)
x_out = x_flg.drop(x_in.index, axis=0)
# Reuse the encoder fitted above (transform, not fit_transform) so both splits
# share one consistent class mapping even if a split misses a class.
y_in = le.transform(x_in['country_destination'].values)
y_out = le.transform(x_out['country_destination'].values)
x_in.drop('country_destination', axis=1, inplace=True)
x_out.drop('country_destination', axis=1, inplace=True)
print(x_in.shape, x_out.shape, len(y_in), len(y_out))
num_round = NUM_ROUND
train_rounds = 0
col_sample_seed_now = COL_SAMPLE_SEED - 1
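# Column-subsampled training attempts: each pass draws a random COL_SAMPLE
# fraction of the feature columns (new seed per attempt) and trains with early
# stopping against the hold-out set. Attempts scoring below 0.83 NDCG@5 are
# discarded; the loop ends once an accepted attempt's best_ntree_limit (which
# becomes the new round count) reaches 30 or more.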
bst = None
while train_rounds < 30:
    col_sample_seed_now += 1
    x_in_sampled = x_in.sample(frac=COL_SAMPLE, random_state=col_sample_seed_now, axis=1)
    print(x_in_sampled.columns, x_in_sampled.shape)
    x_out_sampled = x_out[x_in_sampled.columns]
    dtrain_in = xgb.DMatrix(x_in_sampled, label=y_in, missing=float('NaN'))
    dtrain_out = xgb.DMatrix(x_out_sampled, label=y_out, missing=float('NaN'))
    evallist = [(dtrain_in, 'train_in'), (dtrain_out, 'train_out')]
    print("start hold-out validation!")
    bst = xgb.train(param, dtrain_in, num_round, evallist, feval=ndcg5, early_stopping_rounds=EARLY_STOP)
    print(col_sample_seed_now, bst.best_score, bst.best_iteration, bst.best_ntree_limit)
    if bst.best_score < 0.83:
        continue
    train_rounds = max(1, bst.best_ntree_limit)
    best_score = bst.best_score
    num_round = train_rounds
# Save the model and a text dump; encode the score and hyperparameters in the filename.
paramCheck = '(' + str(MAX_DEPTH) + ',' + str(ETA) + ',' + str(num_round) + ',' + str(SUB_SAMPLE) + ',' + str(
    COL_SAMPLE) + ')'
timeStr = str(time.strftime('%Y-%m-%d %H%M%S', time.localtime()))
bst.save_model(
    MODEL_FOLDER + 'model' + timeStr + str(best_score) + paramCheck + str(col_sample_seed_now) + '.model')
bst.dump_model(MODEL_FOLDER + timeStr + "dump.raw.txt")
fscore = bst.get_fscore()
sorted_fscore = sorted(fscore.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_fscore)
x_test_sampled = x_test[x_in_sampled.columns]
dtest = xgb.DMatrix(x_test_sampled, missing=float('NaN'))
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
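# ypred: one row per test user with 12 class probabilities ('multi:softprob').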
# print ypred[:5]
# Taking the 5 classes with highest probabilities
ids = [] # list of ids
cts = [] # list of countries
for i in range(len(ypred)):
    idx = idSave[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(ypred[i])[::-1])[:5].tolist()
# Generate submission
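# (five rows per test id, countries ordered by descending predicted probability)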
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
stTimeStr = '(' + str(startTime)[11:19] + 'start)'
timeSpend = (dt.now() - startTime)
print(timeSpend, stTimeStr)
sub.to_csv(OUT_FOLDER + 'sub' + timeStr + ' ' + str(best_score) + paramCheck + '.csv',
           index=False)