from ast import literal_eval
from csv import reader
from os import listdir, makedirs, path
from pickle import dump

import numpy as np

from args import get_parser


def load_and_save(category, filename, dataset, dataset_folder, output_folder):
    """Read one raw comma-separated file and pickle it as <dataset>_<category>.pkl."""
    temp = np.genfromtxt(
        path.join(dataset_folder, category, filename),
        dtype=np.float32,
        delimiter=",",
    )
    print(dataset, category, filename, temp.shape)
    with open(path.join(output_folder, dataset + "_" + category + ".pkl"), "wb") as file:
        dump(temp, file)

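
# Expected on-disk layout, inferred from the paths used below (adjust if your
# copies of the datasets are arranged differently):
#   datasets/ServerMachineDataset/{train,test,test_label}/<machine>.txt
#   datasets/data/labeled_anomalies.csv
#   datasets/data/{train,test}/<channel>.npy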
def load_data(dataset):
    """Method from OmniAnomaly (https://github.com/NetManAIOps/OmniAnomaly)"""
    if dataset == "SMD":
        dataset_folder = "datasets/ServerMachineDataset"
        output_folder = "datasets/ServerMachineDataset/processed"
        makedirs(output_folder, exist_ok=True)
        file_list = listdir(path.join(dataset_folder, "train"))
        for filename in file_list:
            if filename.endswith(".txt"):
                # Drop the ".txt" extension to get the machine name;
                # str.split is used because str.strip(".txt") removes the
                # characters '.', 't', 'x' from both ends, not the suffix.
                machine_name = filename.split(".")[0]
                for category in ["train", "test_label", "test"]:
                    load_and_save(category, filename, machine_name, dataset_folder, output_folder)
    elif dataset in ("SMAP", "MSL"):
        dataset_folder = "datasets/data"
        output_folder = "datasets/data/processed"
        makedirs(output_folder, exist_ok=True)
        with open(path.join(dataset_folder, "labeled_anomalies.csv"), "r") as file:
            csv_reader = reader(file, delimiter=",")
            res = [row for row in csv_reader][1:]  # skip the header row
        res = sorted(res, key=lambda k: k[0])
        # Keep only the channels belonging to this spacecraft; P-2 is excluded.
        data_info = [row for row in res if row[1] == dataset and row[0] != "P-2"]

        # Expand each channel's anomaly intervals into a boolean label vector,
        # then concatenate all channels into one test-label array.
        labels = []
        for row in data_info:
            anomalies = literal_eval(row[2])  # anomaly intervals, e.g. [[1710, 2716], ...]
            length = int(row[-1])
            label = np.zeros([length], dtype=np.bool_)
            for anomaly in anomalies:
                label[anomaly[0] : anomaly[1] + 1] = True  # intervals are inclusive
            labels.extend(label)
        labels = np.asarray(labels)
        print(dataset, "test_label", labels.shape)
        with open(path.join(output_folder, dataset + "_" + "test_label" + ".pkl"), "wb") as file:
            dump(labels, file)
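
        # Worked example of the loop above: a row like
        #   ("A-1", "SMAP", "[[10, 12]]", ..., "20")
        # yields a length-20 boolean vector with indices 10..12 set to True
        # (the slice end is anomaly[1] + 1 because the intervals are inclusive).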

        def concatenate_and_save(category):
            """Concatenate all channel .npy files (in data_info order) and pickle the result."""
            data = []
            for row in data_info:
                filename = row[0]
                temp = np.load(path.join(dataset_folder, category, filename + ".npy"))
                data.extend(temp)
            data = np.asarray(data)
            print(dataset, category, data.shape)
            with open(path.join(output_folder, dataset + "_" + category + ".pkl"), "wb") as file:
                dump(data, file)

        for c in ["train", "test"]:
            concatenate_and_save(c)


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    ds = args.dataset.upper()
    load_data(ds)
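
# Usage sketch: the CLI flags come from args.get_parser(), which isn't shown in
# this file; assuming it exposes a --dataset argument, preprocessing would be
# run as:
#   python preprocess.py --dataset SMD
#   python preprocess.py --dataset smap    # case-insensitive: upper()'d above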