-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata.py
135 lines (106 loc) · 5.07 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from __future__ import print_function
import os
import yaml
import argparse
import numpy as np
from keras.utils import np_utils
from keras.datasets import mnist
class DataLoader(object):
    """Loads MNIST, optionally reorders it for triplet-style losses,
    splits it into train/val/test, and rescales pixels to [0, 1].

    Args:
        config: parsed YAML configuration dict; reads the "data" section
            (imsize, imchannel, num_classes, val_split, samples_per_id)
            and the "train" section (loss, k_batch).
        one_hot: when True, labels are converted to one-hot vectors.
    """
    def __init__(self, config, one_hot = False):
        self.config = config
        self.one_hot = one_hot
    def load(self):
        """Fetch MNIST, reshape to (N, imsize, imsize, imchannel),
        optionally order for triplet losses, split, and preprocess."""
        (X_data, self.y_data), (X_test, self.y_test) = mnist.load_data()
        self.input_shape = (-1, self.config["data"]["imsize"], self.config["data"]["imsize"], self.config["data"]["imchannel"])
        self.X_data = np.reshape(X_data, self.input_shape)
        self.X_test = np.reshape(X_test, self.input_shape)
        if self.one_hot:
            self.y_data = np_utils.to_categorical(self.y_data, self.config["data"]["num_classes"])
            self.y_test = np_utils.to_categorical(self.y_test, self.config["data"]["num_classes"])
        val_split = self.config["data"]["val_split"]
        self.num_train = int(self.y_data.shape[0] * (1 - val_split))
        self.num_val = int(self.y_data.shape[0] * val_split)
        self.num_test = self.y_test.shape[0]
        if self.config["train"]["loss"] in ["intra-enhanced-triplet-loss", "semi-hard-triplet-loss"]:
            print("[INFO] Ordering Data")
            self.order_data_triplet_loss()
        self.split_data()
        # Mean/std are computed but currently unused by preprocess()
        # (it only rescales); kept for experimentation.
        self.mean = np.mean(self.X_train, axis=0)
        self.std = np.std(self.X_train, axis=0)
        self.std = (self.std == 0) * 1e-16 + self.std  # avoid division by zero
        self.X_train = self.preprocess(self.X_train)
        self.X_val = self.preprocess(self.X_val)
        self.X_test = self.preprocess(self.X_test)
    def preprocess(self, data):
        """Cast to float32 and rescale raw pixel values to [0, 1]."""
        data = data.astype('float32')
        # Mean/std standardization intentionally disabled:
        # data = data - self.mean
        # data = data / self.std
        return data / 255.
    def order_data_triplet_loss(self):
        """Reorder (X_data, y_data) so every block of num_classes * k_batch
        consecutive samples contains exactly k_batch samples per class, as
        required for batch-wise triplet mining."""
        data = {}
        samples_per_id = self.config["data"]["samples_per_id"]
        for label in range(self.config["data"]["num_classes"]):
            mask = self.y_data == label
            data[label] = [i for i, x in enumerate(mask) if x]
            if len(data[label]) < samples_per_id:
                # BUGFIX: oversample with replacement. The deficit can exceed
                # the class population, in which case replace=False raises
                # "Cannot take a larger sample than population" (ValueError).
                deficit = samples_per_id - len(data[label])
                data[label].extend(np.random.choice(data[label], deficit, replace=True))
            data[label] = data[label][:samples_per_id]
        k_batch = self.config["train"]["k_batch"]
        X_data, y_data = [], []
        for i in range(samples_per_id // k_batch):
            for label in data:
                X_data.extend(self.X_data[data[label][i*k_batch:(i+1)*k_batch]])
                y_data += [label] * k_batch
        self.X_data = np.array(X_data)
        self.y_data = np.array(y_data)
    def split_data(self):
        """Split (X_data, y_data) into train/val partitions and free the originals."""
        self.X_train = self.X_data[:self.num_train]
        self.y_train = self.y_data[:self.num_train]
        self.X_val = self.X_data[self.num_train:]
        self.y_val = self.y_data[self.num_train:]
        del self.X_data, self.y_data
    def get_random_batch(self, k = 100):
        """Return up to k random test samples per class.

        Falls back to all available samples for a class when k is negative
        or exceeds that class's population.
        NOTE(review): assumes y_test holds integer labels, i.e. the loader
        was built with one_hot=False — confirm against callers.
        """
        X_batch, y_batch = [], []
        for label in range(self.config["data"]["num_classes"]):
            X_label = self.X_test[self.y_test == label]
            if 0 <= k <= len(X_label):
                X_batch.extend(X_label[np.random.choice(len(X_label), k, replace=False)])
                y_batch += [label] * k
            else:
                X_batch.extend(X_label)
                y_batch += [label] * len(X_label)
        X_batch = np.reshape(X_batch, self.input_shape)
        return X_batch, np.array(y_batch)
class DataGenerator(object):
    """Infinite mini-batch generator over (X, y) for Keras-style training."""
    def __init__(self, config):
        train_cfg = config["train"]
        self.shuffle = train_cfg["shuffle"]
        self.batch_size = train_cfg["batch-size"]
        self.loss = train_cfg["loss"]
        self.num_classes = config["data"]["num_classes"]
    def generate(self, X, y):
        """Yield (inputs, targets) batches forever.

        For the "triplet-softmax" loss, targets are a pair: the raw integer
        labels plus their one-hot encoding; otherwise just the raw labels.
        """
        while True:
            # Per-epoch ordering of sample indices.
            order = self.__get_exploration_order(len(y))
            batch_ids = np.arange(len(order) // self.batch_size)
            # When sample-level shuffling is off (ordered data, e.g. for
            # triplet mining), randomize the batch order instead.
            if not self.shuffle:
                np.random.shuffle(batch_ids)
            for b in batch_ids:
                start = b * self.batch_size
                sel = order[start:start + self.batch_size]
                if self.loss == "triplet-softmax":
                    labels = y[sel]
                    one_hot = np_utils.to_categorical(labels, self.num_classes)
                    yield X[sel], [labels, one_hot]
                else:
                    yield X[sel], y[sel]
    def __get_exploration_order(self, data_size):
        """Return sample indices 0..data_size-1, shuffled when enabled."""
        order = np.arange(data_size)
        if self.shuffle == True:
            np.random.shuffle(order)
        return order
if __name__ == "__main__":
    # Smoke-test entry point: parse the config path, load the YAML
    # configuration, and run the data pipeline once.
    parser = argparse.ArgumentParser(description='Model Parameters')
    parser.add_argument('-c', '--config', type=str, default="config.yaml", help='path of config file')
    args = parser.parse_args()
    with open(args.config, 'r') as file:
        # safe_load: yaml.load without an explicit Loader is unsafe on
        # untrusted input and a TypeError on PyYAML >= 6.
        config = yaml.safe_load(file)
    dataloader = DataLoader(config)
    dataloader.load()