boostedforest.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 2 11:19:11 2018

@author: nithish k

Please refer to the report in the PDF committed to the GitHub repo.
Tried this for the best model; however, it gives a consistent accuracy of 68%.
"""

### AdaBoosted forest
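# Approach (comments added for clarity): all forests are trained up front on
# the unweighted training data; the boosting pass afterwards only assigns each
# forest a vote weight. A forest whose weighted training error is no better
# than random guessing (error > 1 - 1/K for K classes) is skipped and keeps
# weight 0; otherwise it gets the SAMME-style multiclass weight
# log((1 - error) / error) + log(K - 1), and the observation weights used for
# the next forest's error are updated by the inherited AdaBoost helpers.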

from forest import forest
from adaboost import AdaBoost
import numpy as np
import collections as col
import math

class boostedForest(AdaBoost):

    def __init__(self, **kwargs):
        # Forward the number of forests to the AdaBoost parent, which stores it
        # as self.nTrees (each "tree" of the parent is a whole forest here).
        self.kwargs = kwargs
        AdaBoost.__init__(self, kwargs.get('nForests', 5))
        self.verbose = kwargs.get('verbose', False)

    def verbosePrint(self, *args):
        if self.verbose:
            print(*args)

    def train(self, TrainXmatrix, TrainY):
        self.trainXMatrix = TrainXmatrix
        self.trainYLabels = TrainY

        uniqueYLabels = set(TrainY)
        numYLabels = len(uniqueYLabels)
        weightsForForests = col.defaultdict(int)  # forests skipped below keep weight 0

        numTrainObs = self.trainXMatrix.shape[0]
        obsWeights = np.array([1 / numTrainObs for i in range(numTrainObs)])  # initialise uniformly

        ## create forests and train them
        trainedForests = []  # multiple forests
        self.verbosePrint("\nNumber of forests to build: ", self.nTrees)

        for i in range(self.nTrees):  # do not confuse with trees, they are forests
            ForestObj = forest(**self.kwargs)
            ForestObj.trainForest(TrainXmatrix, TrainY)
            trainedForests.append(ForestObj)  # entire forest
            self.verbosePrint("\nNumber of forests built: ", i + 1)

        self.hypothesis = trainedForests  # ensemble of forests

        self.verbosePrint("\nCalculating weights ....")
        for forestNum, ForestObj in enumerate(self.hypothesis):
            trainPredictionList = ForestObj.predict(TrainXmatrix)
            forestError = self._calcError(obsWeights, trainPredictionList, self.trainYLabels)

            if forestError > 1 - (1 / numYLabels):  # not better than random guessing
                continue

            # guard against log(0) / division by zero when a forest fits the training data perfectly
            forestError = max(forestError, 1e-10)
            forestWeight = math.log((1 - forestError) / forestError) + math.log(numYLabels - 1)
            obsWeights = self._adjustNormWeights(obsWeights, forestWeight, trainPredictionList, self.trainYLabels)
            weightsForForests[forestNum] = forestWeight

        self._weightsForHypothesis = weightsForForests

    def predict(self, TestXmatrix):
        self.verbosePrint("\nPredicting.....")
        # For each test row, accumulate the total vote weight of each predicted label.
        dictOfLabelsCumWeights = col.defaultdict(lambda: col.defaultdict(int))

        for forestNum, ForestObj in enumerate(self.hypothesis):
            predictionList = ForestObj.predict(TestXmatrix)
            decisionWeight = self._weightsForHypothesis[forestNum]
            for i, label in enumerate(predictionList):
                dictOfLabelsCumWeights[i][label] += decisionWeight

        dictOfLabelsCumWeights = dict(dictOfLabelsCumWeights)
        # Pick, per test row, the label with the largest cumulative weight.
        finalWeightedPredictions = \
            [max(dictOfLabelsCumWeights[i], key=dictOfLabelsCumWeights[i].get)
             for i in range(TestXmatrix.shape[0])]
        return finalWeightedPredictions
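
# --- Illustrative sketch, not part of the original module --------------------
# _calcError and _adjustNormWeights are inherited from the AdaBoost class in
# adaboost.py, whose code is not shown here. The two reference functions below
# are an assumption about what those helpers compute, following the standard
# SAMME multiclass AdaBoost update; the real implementations may differ.

def _reference_weighted_error(obsWeights, predictions, trueLabels):
    """Weighted misclassification rate: share of total weight on misclassified rows."""
    mismatches = np.array(predictions) != np.array(trueLabels)
    return float(np.sum(obsWeights * mismatches) / np.sum(obsWeights))


def _reference_samme_weight_update(obsWeights, hypWeight, predictions, trueLabels):
    """Up-weight misclassified rows by exp(hypWeight), then renormalise to sum to 1."""
    mismatches = np.array(predictions) != np.array(trueLabels)
    newWeights = obsWeights * np.exp(hypWeight * mismatches)
    return newWeights / np.sum(newWeights)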

if __name__ == '__main__':
    myBoostedForest = boostedForest(nForests=10, numTrees=10, verbose=True)

    TrainX, TrainY, TrainXID = myBoostedForest.getDataFromFile('train-data.txt')
    myBoostedForest.train(TrainX, TrainY)

    Xtest, yTest, XtestID = myBoostedForest.getDataFromFile('test-data.txt')
    finalPredictions = myBoostedForest.predict(Xtest)
    myBoostedForest.writeToFile(XtestID, finalPredictions, 'output.txt')

    # Report test accuracy; cast to numpy arrays so the comparison is element-wise.
    print(np.mean(np.array(finalPredictions) == np.array(yTest)))