-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
executable file
·89 lines (84 loc) · 3.38 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
"""SlipSapm.
Usage:
slipspam bench [-v] [--drop-col=<nb>...] [--executions=<nb>] [--test-size=<size>]
[--trainset=<file>] [--testset=<file>]
slipspam predict (<email-text> | --in-text=<file> | --in-feat=<file>) [-v] [-t] [--drop-col=<nb>...]
[--trainset=<file>]
slipspam parse <in-file> <out-file>
slipspam -h | --help
slipspam --version
Options:
-h --help Show this screen.
--version Show version.
-v Be verbose.
-d <nb>, --drop-col=<nb> Drop a column from the dataset (can be repeated) [default: 26 27].
-e <nb>, --executions=<nb> Number of executions [default: 5].
--test-size=<size> Proportion of the dataset to use for the tests [default: 0.2].
--trainset=<file> Path of the training dataset file (from data/) [default: spambase.csv].
--testset=<file> Path of the test dataset file (from data/) [default is trainset file].
-t Translated for human readability.
--in-text=<file> Path to a file containing the text of a mail to classify.
--in-feat=<file> Path to a file containing a csv of features compliant with spambase.
<in-file> Path to input file must be a csv with two columns: [text, spam]
"""
from src.algorithms import NaiveBayes, Svm, Knn, GradientBoosting, Mpl, Rfc
from src.dataset import Dataset
from src.benchmark import run_bench
from src.utils import text2features, vtext2features, vtrans_label, vint
import numpy as np
import pandas as pd
from tqdm import tqdm
from docopt import docopt
# Parse the command line against the usage text held in the module docstring.
args = docopt(__doc__, version='SlipSpam 1.0-beta.1')

# (classifier, short label) pairs handed to the benchmark.
# DecisionTreeClassifier and LinearDiscriminantAnalysis are currently disabled.
algos = list(zip(
    (NaiveBayes, Svm, Knn, GradientBoosting, Mpl, Rfc),
    ('NB', 'SVM', 'KNN', 'Gb', 'MLP', 'RFC'),
))

# Option values converted once here so every sub-command shares them.
repetition, test_size = int(args['--executions']), float(args['--test-size'])
trainset, testset = args['--trainset'], args['--testset']
drop_cols = vint(args['--drop-col'])
verbose = args['-v']
# Dispatch on the sub-command selected on the command line.
if args['bench']:
    # Benchmark every classifier listed in `algos` with the parsed settings.
    run_bench(algos,
              repetition=repetition,
              test_size=test_size,
              trainset=trainset,
              testset=testset,
              drop_cols=drop_cols)
elif args['predict']:
    # The usage pattern makes the three input sources mutually exclusive,
    # so exactly one of the branches below is expected to set `features`.
    if args['<email-text>']:
        # Mail text passed directly as a positional argument.
        text = args['<email-text>']
        features = [np.delete(text2features(text), drop_cols)]
    if args['--in-text']:
        # Mail text read from a file; the context manager closes the handle
        # even if reading raises.
        with open(args['--in-text'], "r") as mail_file:
            text = mail_file.read()
        features = [np.delete(text2features(text), drop_cols)]
    if args['--in-feat']:
        # Pre-computed features from a header-less CSV: the requested columns
        # are dropped, then the last remaining column is excluded.
        # NOTE(review): the last column is presumably the label - confirm.
        data_frame = pd.read_csv(args['--in-feat'], header=None).drop(columns=drop_cols)
        features = data_frame.iloc[:, :-1].values

    dataset = Dataset(test_size=test_size, trainset=trainset, drop_cols=drop_cols)
    if verbose:
        print(features)
    results = Rfc(dataset).predict('optimize', features)
    if args['-t']:
        # -t: print the labels translated for human readability.
        print(vtrans_label(results).tolist())
    else:
        print(results)
elif args['parse']:
    # Convert a CSV with `text` and `spam` columns into a header-less CSV of
    # extracted features followed by the `spam` column.
    in_file = args['<in-file>']
    tqdm.pandas()  # makes `progress_apply` available on pandas objects
    email_df = pd.read_csv(in_file)
    features = email_df.progress_apply(
        lambda row: text2features(row['text']),
        axis=1,
        result_type='expand',
    )
    features['spam'] = email_df['spam']
    features.to_csv(args['<out-file>'], header=False, index=False)