Showing 5 changed files with 28,187 additions and 1 deletion.
@@ -186,7 +186,7 @@ Apache License
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright {yyyy} {name of copyright owner}
+   Copyright 2014 George Danezis ([email protected])

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -0,0 +1,265 @@
## This is an educational random forest implementation

## References:
## * A. Criminisi, J. Shotton, and E. Konukoglu, Decision Forests:
##   A Unified Framework for Classification, Regression, Density Estimation,
##   Manifold Learning and Semi-Supervised Learning. Foundations and Trends in
##   Computer Graphics and Computer Vision. NOW Publishers. Vol. 7, No. 2-3,
##   pp. 81-227, 2012.
##
## * Jamie Shotton, Toby Sharp, Pushmeet Kohli, Sebastian Nowozin, John Winn,
##   and Antonio Criminisi, Decision Jungles: Compact and Rich Models for
##   Classification, in Proc. NIPS, 2013.

import random
from collections import Counter
import numpy as np
import copy

def split_data(data, label=0, length=50):
    'Take a large text and divide it into chunks'
    strings = [data[i:i+length] for i in range(0, len(data) - length, length)]
    random.shuffle(strings)
    strings = [(s, label) for s in strings]

    # Hold out 10% of the chunks as a test set (integer division keeps the
    # slice index an int).
    test = strings[:len(strings) * 10 // 100]
    training = strings[len(strings) * 10 // 100:]
    return test, training
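
# For a 1,000,000-character text and length=50 this yields about 20,000 chunks:
# roughly 2,000 for testing and 18,000 for training.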


def entropy(data):
    'Computes the entropy (in nats) of the label distribution of labelled data'
    v = Counter([b for _, b in data]).values()
    d = np.array(list(v)) / float(sum(v))
    return -sum(d * np.log(d))
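
# Example: a perfectly balanced labelled set such as [("ab", 0), ("cd", 1)]
# has entropy ln(2) ~= 0.693.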


def split(train, feat):
    'Split data according to an information gain criterion'
    ## first compute the entropy
    Hx = entropy(train)
    if Hx < 0.000001:
        raise Exception("Entropy very low")
    L1 = []
    L2 = []
    for t in train:
        if feat in t[0]:
            L1 += [t]
        else:
            L2 += [t]

    E1 = entropy(L1)
    E2 = entropy(L2)
    L = float(len(train))

    H = Hx - E1 * len(L1)/L - E2 * len(L2)/L
    return H, L1, L2, feat
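
# H is the information gain of testing for `feat`: the parent entropy minus the
# size-weighted entropies of the two children. A feature that splits a balanced
# set into two pure halves achieves the maximum possible gain of ln(2).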


## --------------------------
## - The random forest code -
## --------------------------

def build_tree(train, features, levels=5, numfeatures=100):
    'Train a decision tree based on labeled data and features'
    if levels == 0:
        C1 = Counter([b for _, b in train])
        Leaf = (None, C1)
        return Leaf
    else:
        try:
            X = (split(train, F) for F in random.sample(features, numfeatures))
            H, L1, L2, F = max(X)
            M1 = build_tree(L1, features, levels - 1, numfeatures)
            M2 = build_tree(L2, features, levels - 1, numfeatures)
            Branch = (F, M1, M2)
            return Branch
        except Exception:
            # If no valid split exists (near-zero entropy, or too few
            # features to sample), stop early and emit a leaf instead.
            return build_tree(train, features, levels=0)
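
# Sampling only `numfeatures` random candidate features at each node, rather
# than scanning all of them, de-correlates the individual trees; this is the
# "random" in random forest.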


def classify(tree, item):
    'Get a decision for an item using a tree'
    if len(tree) == 2:
        assert tree[0] is None
        return tree[1]
    else:
        fet, L1, L2 = tree
        if fet in item:
            return classify(L1, item)
        else:
            return classify(L2, item)
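
# Note that classify returns the leaf's label Counter rather than a single
# label; the caller sums these Counters across all trees and takes the
# majority vote.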


## ----------------------------
## - The decision jungle code -
## ----------------------------

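# A decision jungle caps the number of nodes per level (M below) and merges
# surplus child sets into existing ones, so the model is a rooted DAG rather
# than a tree (Shotton et al., NIPS 2013).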
def build_jungle(train, features, levels=10, numfeatures=100):
    DAG = {0: copy.copy(train)}
    Candidate_sets = [0]
    next_ID = 0
    M = 10   # maximum number of nodes retained per level

    for level in range(levels):
        result_sets = []
        for tdata_idx in Candidate_sets:
            tdata = DAG[tdata_idx]

            if entropy(tdata) == 0.0:
                # Pure set: route both outgoing edges to the same child,
                # using True as a sentinel feature that always matches.
                next_ID += 1
                idx1 = next_ID
                result_sets += [idx1]
                DAG[idx1] = tdata + []
                del DAG[tdata_idx][:]
                DAG[tdata_idx] += [True, idx1, idx1]
                continue

            # Pick the best of `numfeatures` random candidate splits.
            X = (split(tdata, F) for F in random.sample(features, numfeatures))
            H, L1, L2, F = max(X)

            # Branch = (F, M1, M2)
            next_ID += 1
            idx1 = next_ID
            DAG[idx1] = L1
            next_ID += 1
            idx2 = next_ID
            DAG[idx2] = L2

            result_sets += [idx1, idx2]
            del DAG[tdata_idx][:]
            DAG[tdata_idx] += [F, idx1, idx2]

        ## Now optimize the result sets: keep only M of them and merge each of
        ## the rest into whichever kept set increases weighted entropy least.
        random.shuffle(result_sets)

        basic = result_sets[:M]
        for r in result_sets[M:]:
            maxv = None
            maxi = None
            for b in basic:
                L = float(len(DAG[r] + DAG[b]))
                e1 = len(DAG[r]) * entropy(DAG[r])
                e2 = len(DAG[b]) * entropy(DAG[b])
                newe = L * entropy(DAG[r] + DAG[b])
                score = abs(e1 + e2 - newe)
                if maxv is None:
                    maxv = score
                    maxi = b
                    continue
                if score < maxv:
                    maxv = score
                    maxi = b
            DAG[maxi] += DAG[r]
            del DAG[r]
            # Alias r to the surviving set so branch nodes that point at r
            # now resolve to the merged node.
            DAG[r] = DAG[maxi]

        Candidate_sets = basic

    # After the last level, turn every surviving set into a leaf that stores
    # its label counts.
    for tdata_idx in Candidate_sets:
        tdata = DAG[tdata_idx]
        C1 = Counter([b for _, b in tdata])
        del DAG[tdata_idx][:]
        DAG[tdata_idx] += [None, C1]

    return DAG
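
# A minimal toy run (hypothetical data, for illustration only):
#   DAG = build_jungle([("hello", 0), ("salut", 1)] * 20,
#                      features=["ll", "sa"], numfeatures=2)
#   classify_jungle(DAG, "hello world")   # -> Counter({0: 20})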


def classify_jungle(DAG, item):
    'Get a decision for an item by walking the jungle DAG'
    branch = DAG[0]
    while branch[0] is not None:
        try:
            fet, L1, L2 = branch
            if fet is True or fet in item:
                branch = DAG[L1]
            else:
                branch = DAG[L2]
        except Exception:
            print(len(branch))
            raise
    return branch[1]
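
# Pure nodes carry the sentinel feature True, which matches every item, so both
# of their outgoing edges lead to the same child and the walk still terminates
# at a (None, Counter) leaf.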

## -------------------------
## - Sample classification -
## -------------------------

if __name__ == "__main__":
    # Load the two corpora: English labelled 0, French labelled 1.
    dataEN = open("../data/pg23428.txt").read()
    dataFR = open("../data/pg5711.txt").read()

    length = 50

    testEN, trainEN = split_data(dataEN, label=0, length=length)
    testFR, trainFR = split_data(dataFR, label=1, length=length)

    print("training: EN=%s FR=%s" % (len(trainEN), len(trainFR)))

    train = trainEN + trainFR
    random.shuffle(train)
    test = testEN + testFR
    random.shuffle(test)

    ## Now make a bunch of features.
    ## A feature is a fragment that occurs in at least 1% of a sample of
    ## training strings, but in at most 99% of them.

    sometrain = random.sample(train, 1000)
    features = set()
    while len(features) < 700:
        fragment, _ = random.choice(sometrain)
        l = int(round(random.expovariate(0.20)))
        b = random.randint(0, max(0, length - l))
        feat = fragment[b:b+l]

        ## Test how common the candidate fragment is
        C = 0
        for st, _ in sometrain:
            if feat in st:
                C += 1

        f = float(C) / 1000
        if f > 0.01 and f < 0.99 and feat not in features:
            features.add(feat)

    features = list(features)
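
    # Fragment lengths are drawn from random.expovariate(0.20), an exponential
    # distribution with mean 1/0.2 = 5 characters, so most features are short
    # substrings such as common character n-grams.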

    manytrees = []
    jungle = []
    for i in range(10):
        print("Build tree %s" % i)
        # Each model is trained on a random third of the data (sampled
        # without replacement), so ensemble members see different subsets.
        size = len(train) // 3
        training_sample = random.sample(train, size)

        tree = build_jungle(training_sample, features, numfeatures=100)
        jungle += [tree]

        tree = build_tree(training_sample, features, numfeatures=100)
        manytrees += [tree]

    # Tally (predicted, actual) pairs; the four counts printed below form the
    # 2x2 confusion matrix for the forest and for the jungle.
    testdata = test
    results_tree = Counter()
    results_jungle = Counter()
    for item, cat in testdata:
        # Trees
        c = Counter()
        for tree in manytrees:
            c += classify(tree, item)
        res = (max(c, key=lambda x: c[x]), cat)
        results_tree.update([res])

        # Jungle
        c = Counter()
        for tree in jungle:
            c += classify_jungle(tree, item)
        res = (max(c, key=lambda x: c[x]), cat)
        results_jungle.update([res])

    print("Results          Tree  Jungle")
    print("True positives:  %4d  %4d"
          % (results_tree[(1, 1)], results_jungle[(1, 1)]))
    print("True negatives:  %4d  %4d"
          % (results_tree[(0, 0)], results_jungle[(0, 0)]))
    print("False positives: %4d  %4d"
          % (results_tree[(1, 0)], results_jungle[(1, 0)]))
    print("False negatives: %4d  %4d"
          % (results_tree[(0, 1)], results_jungle[(0, 1)]))