-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPerturbation.py
202 lines (188 loc) · 7.02 KB
/
Perturbation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import csv
import numpy as np
import pandas as pd
from os import path
from sys import argv
from sys import exit
def readColumns(path):
    """Return the column names stored in the first row of a CSV file.

    Only the header row is parsed. Its first field is discarded, which
    matches the ``record[i + 1]`` offset the perturbation builders in this
    file apply when reading dataset rows.

    Args:
        path: Location of the CSV file whose first row holds the names.

    Returns:
        list[str]: The header fields from the second one onwards.

    Raises:
        ValueError: If the file contains no rows at all.
    """
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, 'r', newline='') as f:
        # Pull just the header instead of materialising every row.
        header = next(csv.reader(f), None)
    if header is None:
        raise ValueError('{} is empty: no header row to read columns from'.format(path))
    return header[1:]
def readTiers(columns, cat_attributes=()):
    """Assign a tier index and a one-hot/categorical flag to every column.

    A column named ``prefix=value`` is treated as one member of a one-hot
    group: all columns sharing the same ``prefix`` receive the same tier
    index and are flagged with 1. A column without ``=`` gets a tier of its
    own and is flagged with 1 only when its name is in ``cat_attributes``.
    Tier indices start at 1 and are handed out in order of first appearance.

    Args:
        columns: Iterable of column names.
        cat_attributes: Names of plain (non ``prefix=value``) columns that
            should be flagged with 1. Defaults to none.

    Returns:
        list[int]: The tier index of every column followed by the flag of
        every column, i.e. a single list of length ``2 * len(columns)``.
    """
    tier = []
    isOH = []
    lookup = {}
    next_index = 1
    for c in columns:
        # Split on the first '=' only: a value that itself contains '='
        # must not produce a third piece and break the prefix/value pair.
        prefix, separator, _value = c.partition('=')
        if not separator:
            tier.append(next_index)
            next_index += 1
            isOH.append(1 if prefix in cat_attributes else 0)
            continue
        isOH.append(1)
        if prefix not in lookup:
            # First member of this one-hot group: open a new tier for it.
            lookup[prefix] = next_index
            next_index += 1
        tier.append(lookup[prefix])
    return tier + isOH
def readTiers_Health(columns, cat_attributes=()):
    """Variant of the tier assignment with four prefixes that are never grouped.

    Columns named ``prefix=value`` normally share one tier per ``prefix``
    and are flagged with 1. Columns without ``=``, and columns whose prefix
    is ``PrimaryConditionGroup``, ``Specialty``, ``ProcedureGroup`` or
    ``PlaceSvc``, each get a tier of their own and are flagged with 1 only
    when their name (the part before ``=``) is in ``cat_attributes``.

    Args:
        columns: Iterable of column names.
        cat_attributes: Attribute names to flag with 1 among the columns
            that receive their own tier. Defaults to none.

    Returns:
        list[int]: The tier index of every column followed by the flag of
        every column, i.e. a single list of length ``2 * len(columns)``.
    """
    own_tier_prefixes = ("PrimaryConditionGroup", "Specialty", "ProcedureGroup", "PlaceSvc")
    tier = []
    isOH = []
    lookup = {}
    next_index = 1
    for c in columns:
        # Split on the first '=' only so values containing '=' stay intact.
        prefix, separator, _value = c.partition('=')
        if not separator or prefix in own_tier_prefixes:
            tier.append(next_index)
            next_index += 1
            # Test this column's own name; the previous code tested the
            # whole ``columns`` list here, which could never match.
            isOH.append(1 if prefix in cat_attributes else 0)
            continue
        isOH.append(1)
        if prefix not in lookup:
            lookup[prefix] = next_index
            next_index += 1
        tier.append(lookup[prefix])
    return tier + isOH
def category(dataset, columns, attributes):
    """Build one interval row per record, widening selected columns to [0;1].

    Every column is rendered as ``[l;u]``. A column is widened to ``[0;1]``
    when its full name, or the part of its name before the first ``=``, is
    listed in ``attributes``; every other column keeps its recorded value
    as both bounds.

    Args:
        dataset: pandas DataFrame whose column labels are the integers
            0..n (as produced by ``read_csv(..., header=None)``); the value
            for ``columns[i]`` is read from label ``i + 1``.
        columns: List of column names, aligned with dataset labels 1..n.
        attributes: Attribute names to widen.

    Returns:
        list[str]: One space-separated string of intervals per record.
    """
    # Whether a column is widened depends only on its name, so decide once
    # instead of once per record. partition() splits on the first '=' only,
    # so one-hot values that themselves contain '=' are still recognised.
    widened = [c in attributes or c.partition('=')[0] in attributes for c in columns]
    rows = []
    for _, record in dataset.iterrows():
        cells = []
        for i, is_widened in enumerate(widened):
            if is_widened:
                l, u = 0, 1
            else:
                l = u = record[i + 1]
            cells.append('[{};{}]'.format(l, u))
        rows.append(' '.join(cells))
    return rows
def noise(dataset, columns, attributes, epsilon):
    """Build one interval row per record, adding +/- epsilon to selected columns.

    Every column is rendered as ``[l;u]``. A column becomes
    ``[value - epsilon; value + epsilon]`` when its full name, or the part
    of its name before the first ``=``, is listed in ``attributes``; every
    other column keeps its recorded value as both bounds.

    Args:
        dataset: pandas DataFrame whose column labels are the integers
            0..n (as produced by ``read_csv(..., header=None)``); the value
            for ``columns[i]`` is read from label ``i + 1``.
        columns: List of column names, aligned with dataset labels 1..n.
        attributes: Attribute names to perturb.
        epsilon: Half-width of the interval around each perturbed value.

    Returns:
        list[str]: One space-separated string of intervals per record.
    """
    # The choice depends only on the column name, so make it once rather
    # than for every record. partition() splits on the first '=' only, so
    # one-hot values that themselves contain '=' are still recognised.
    perturbed = [c in attributes or c.partition('=')[0] in attributes for c in columns]
    rows = []
    for _, record in dataset.iterrows():
        cells = []
        for i, is_perturbed in enumerate(perturbed):
            value = record[i + 1]
            if is_perturbed:
                l, u = value - epsilon, value + epsilon
            else:
                l = u = value
            cells.append('[{};{}]'.format(l, u))
        rows.append(' '.join(cells))
    return rows
def noiseCat(dataset, columns, noise_attributes, epsilon, cat_attributes):
    """Build one interval row per record, mixing noise and [0;1] widening.

    Every column is rendered as ``[l;u]``. A column is matched by its full
    name or by the part of its name before the first ``=``:

    * listed in ``noise_attributes``: ``[value - epsilon; value + epsilon]``;
    * otherwise listed in ``cat_attributes``: ``[0;1]``;
    * otherwise: the recorded value as both bounds.

    Args:
        dataset: pandas DataFrame whose column labels are the integers
            0..n (as produced by ``read_csv(..., header=None)``); the value
            for ``columns[i]`` is read from label ``i + 1``.
        columns: List of column names, aligned with dataset labels 1..n.
        noise_attributes: Attribute names to perturb by ``epsilon``.
        epsilon: Half-width of the interval around each perturbed value.
        cat_attributes: Attribute names to widen to ``[0;1]``.

    Returns:
        list[str]: One space-separated string of intervals per record.
    """
    NOISE, WIDEN, KEEP = 0, 1, 2

    # Classify each column once; the outcome does not depend on the record.
    # partition() splits on the first '=' only, so one-hot values that
    # themselves contain '=' are still recognised.
    treatment = []
    for c in columns:
        prefix = c.partition('=')[0]
        if c in noise_attributes or prefix in noise_attributes:
            treatment.append(NOISE)  # noise takes precedence over widening
        elif c in cat_attributes or prefix in cat_attributes:
            treatment.append(WIDEN)
        else:
            treatment.append(KEEP)

    rows = []
    for _, record in dataset.iterrows():
        cells = []
        for i, how in enumerate(treatment):
            if how == NOISE:
                l = record[i + 1] - epsilon
                u = record[i + 1] + epsilon
            elif how == WIDEN:
                l, u = 0, 1
            else:
                l = u = record[i + 1]
            cells.append('[{};{}]'.format(l, u))
        rows.append(' '.join(cells))
    return rows
def top(columns):
    """Return a single interval row in which every column spans ``[0;1]``.

    Args:
        columns: Sized collection of column names; only its length is used.

    Returns:
        list[str]: A one-element list holding ``len(columns)`` copies of
        ``[0;1]`` separated by single spaces.
    """
    lower, upper = 0, 1
    unit_interval = '[{};{}]'.format(lower, upper)
    return [' '.join([unit_interval] * len(columns))]
def conditionalAttribute(dataset, columns, condition_attribute, threshold, attributes, epsilon_1, epsilon_2):
    """Build interval rows whose noise level depends on a condition column.

    For each record the value of ``condition_attribute`` is compared with
    ``threshold``. Records strictly below it use ``epsilon_1`` and have the
    condition column rendered as ``[-999999.9; threshold]``; all other
    records use ``epsilon_2`` and ``[threshold; 999999.9]``. Columns matched
    by ``attributes`` (full name, or the part before the first ``=``) become
    ``[value - epsilon; value + epsilon]``; remaining columns keep their
    recorded value as both bounds. If the condition column is itself listed
    in ``attributes`` the noise interval wins.

    Args:
        dataset: pandas DataFrame whose column labels are the integers
            0..n (as produced by ``read_csv(..., header=None)``); the value
            for ``columns[i]`` is read from label ``i + 1``.
        columns: List of column names, aligned with dataset labels 1..n.
        condition_attribute: Name of the column compared with ``threshold``.
        threshold: Split point for the condition column.
        attributes: Attribute names to perturb.
        epsilon_1: Noise half-width for records below ``threshold``.
        epsilon_2: Noise half-width for all other records.

    Returns:
        list[str]: One space-separated string of intervals per record.

    Raises:
        ValueError: If ``condition_attribute`` is not in ``columns``. This
            is now raised up front, even when ``dataset`` has no rows.
    """
    # Stand-ins for an unbounded lower/upper end of the condition column.
    lowest, highest = -999999.9, 999999.9

    # Everything below depends only on the column names: resolve it once
    # instead of three list searches per record.
    condition_label = 1 + columns.index(condition_attribute)
    # partition() splits on the first '=' only, so one-hot values that
    # themselves contain '=' are still recognised.
    perturbed = [c in attributes or c.partition('=')[0] in attributes for c in columns]

    rows = []
    for _, record in dataset.iterrows():
        if record[condition_label] < threshold:
            epsilon, group_l, group_u = epsilon_1, lowest, threshold
        else:
            epsilon, group_l, group_u = epsilon_2, threshold, highest
        cells = []
        for i, c in enumerate(columns):
            if perturbed[i]:
                l = record[i + 1] - epsilon
                u = record[i + 1] + epsilon
            elif c == condition_attribute:
                l, u = group_l, group_u
            else:
                l = u = record[i + 1]
            cells.append('[{};{}]'.format(l, u))
        rows.append(' '.join(cells))
    return rows
def savePerturbation(perturbation, output):
    """Write each perturbation row to ``output`` on a line of its own.

    Args:
        perturbation: Iterable of strings, one per output line.
        output: Path of the file to create or overwrite.
    """
    with open(output, 'w') as sink:
        sink.writelines(line + '\n' for line in perturbation)
def saveTiers(tiers, output):
    """Write the tier values to ``output`` on one line, each followed by a space.

    No newline is written, and the last value is also followed by a space.

    Args:
        tiers: Iterable of values; each is converted with ``str``.
        output: Path of the file to create or overwrite.
    """
    with open(output, 'w') as sink:
        sink.writelines(str(entry) + ' ' for entry in tiers)
if __name__ == '__main__':
    # Command-line driver: load the dataset and its column names, then run
    # the requested command and, for perturbation commands, save the rows.

    # Number of parameters each command consumes after <command>.
    required_parameters = {
        'show-columns': 0,
        'show-tiers': 0,
        'cat': 1,
        'noise': 2,
        'noise-cat': 3,
        'conditional-attribute': 5,
    }

    if len(argv) < 5:
        print('Usage: python3 {} <dataset> <columns> <output> <command> [parameters]'.format(argv[0]))
        print('Commands:')
        print('\tshow-columns')
        print('\tshow-tiers')
        print('\tcat column_1,column_2,...,column_n')
        print('\tnoise column_1,column_2,...,column_n epsilon')
        print('\tnoise-cat noise_column_1,noise_column_2,...,noise_column_n epsilon cat_column_1,cat_column_2,...,cat_column_n')
        print('\tconditional-attribute condition_attribute threshold column_1,column_2,...,column_n epsilon_1 epsilon_2')
        # A usage error is a failure: report it through the exit status.
        exit(1)

    output = argv[3]
    command = argv[4]
    parameters = argv[5:]

    # Reject bad invocations before doing any file I/O, instead of silently
    # doing nothing (unknown command) or failing with an IndexError later.
    if command not in required_parameters:
        print('Unknown command: {}'.format(command))
        exit(1)
    if len(parameters) < required_parameters[command]:
        print('Command {} expects {} parameter(s), got {}'.format(
            command, required_parameters[command], len(parameters)))
        exit(1)

    # The first row of the dataset is skipped and no header is inferred, so
    # columns are labelled 0..n; the perturbation builders read label i + 1.
    dataset = pd.read_csv(argv[1], header=None, skiprows=1)
    columns = readColumns(argv[2])

    if command == 'show-columns':
        print('{}: {}'.format(len(columns), ' '.join(map(str, columns))))
    elif command == 'show-tiers':
        # readTiers requires the list of categorical attributes; the command
        # line offers no way to supply one, so none are marked. Omitting the
        # argument made every invocation of this script fail with TypeError.
        tiers = readTiers(columns, [])
        print('{}: {}'.format(len(tiers), ' '.join(map(str, tiers))))
    elif command == 'cat':
        perturbation = category(dataset, columns, parameters[0].split(','))
        savePerturbation(perturbation, output)
    elif command == 'noise':
        perturbation = noise(dataset, columns, parameters[0].split(','), float(parameters[1]))
        savePerturbation(perturbation, output)
    elif command == 'noise-cat':
        perturbation = noiseCat(dataset, columns, parameters[0].split(','), float(parameters[1]), parameters[2].split(','))
        savePerturbation(perturbation, output)
    elif command == 'conditional-attribute':
        perturbation = conditionalAttribute(
            dataset, columns, parameters[0], float(parameters[1]),
            parameters[2].split(','), float(parameters[3]), float(parameters[4]))
        savePerturbation(perturbation, output)
        # NOTE(review): the original indentation of this print was lost; it
        # is kept in this branch, the only placement where `perturbation`
        # is always defined. Confirm whether it was meant for every command.
        print(perturbation)