import numpy as np
class CriticNet(object):
    """
    A four-layer fully-connected neural network for the critic.
    - The net takes a state input of dimension (N, S), with S being the
      dimensionality of the state space.
    - The net also takes an action input of dimension (N, A), where A is the
      dimensionality of the action space.
    - There are three hidden layers, with dimensions H1, H2, and H3,
      respectively.
    - The output is the Q-value (dimension output_size, typically 1).
    - The state input connects to the first hidden layer.
    - The action input bypasses the first hidden layer and connects directly
      to the second hidden layer.
    - The state and action paths are summed at the second hidden layer.
    - The network uses a ReLU nonlinearity for the first and third hidden
      layers (the second is a plain sum) and a linear activation for the
      final layer.
    """
    def __init__(self, input_size_S, input_size_A, hidden_size1, hidden_size2,
                 hidden_size3, output_size, weight=None):
        """
        Initialize the model. Weights are initialized to small random values
        and biases are initialized to zero. Weights and biases are stored in
        the variable self.params, which is a dictionary with the following
        keys:
        W1: First layer weights; has shape (S, H1)
        b1: First layer biases; has shape (H1,)
        W2_S: Second layer weights for the state path; has shape (H1, H2)
        b2_S: Second layer biases for the state path; has shape (H2,)
        W2_A: Second layer weights for the action path; has shape (A, H2)
        b2_A: Second layer biases for the action path; has shape (H2,)
        W3: Third layer weights; has shape (H2, H3)
        b3: Third layer biases; has shape (H3,)
        W4: Fourth (output) layer weights; has shape (H3, output_size)
        b4: Fourth (output) layer biases; has shape (output_size,)
        We also have the weights for a target network (same architecture but
        different weights), stored under the same keys with a _tgt suffix:
        W1_tgt, b1_tgt, W2_S_tgt, b2_S_tgt, W2_A_tgt, b2_A_tgt,
        W3_tgt, b3_tgt, W4_tgt, b4_tgt.
        Inputs:
        - input_size_S: the dimension S of the state input.
        - input_size_A: the dimension A of the action input.
        - hidden_size1/2/3: the numbers of neurons H1, H2, H3 in the hidden
          layers.
        - output_size: the dimension of the Q-value output (typically 1).
        - weight: optional dictionary of pre-trained weights to load instead
          of random initialization.
        """
print("A critic network is created.")
self.params = {}
        if weight is None:
self.params['W1'] = self._uniform_init(input_size_S, hidden_size1)
self.params['b1'] = np.zeros(hidden_size1)
self.params['W2_S'] = self._uniform_init(hidden_size1, hidden_size2)
self.params['b2_S'] = np.zeros(hidden_size2)
self.params['W2_A'] = self._uniform_init(input_size_A, hidden_size2)
self.params['b2_A'] = np.zeros(hidden_size2)
self.params['W3'] = self._uniform_init(hidden_size2, hidden_size3)
self.params['b3'] = np.zeros(hidden_size3)
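            # The output layer is initialized from U(-3e-3, 3e-3) so that the
            # initial Q estimates stay near zero, following the DDPG paper.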
self.params['W4'] = np.random.uniform(-3e-3, 3e-3, (hidden_size3, output_size))
self.params['b4'] = np.zeros(output_size)
else:
self.params['W1'] = weight['W1']
self.params['b1'] = weight['b1']
self.params['W2_S'] = weight['W2_S']
self.params['b2_S'] = weight['b2_S']
self.params['W2_A'] = weight['W2_A']
self.params['b2_A'] = weight['b2_A']
self.params['W3'] = weight['W3']
self.params['b3'] = weight['b3']
self.params['W4'] = weight['W4']
self.params['b4'] = weight['b4']
        # Alternative initialization from "Continuous control with deep
        # reinforcement learning" (DDPG): start the target network as an
        # exact copy of the learned network.
# self.params['W1_tgt'] = self.params['W1']
# self.params['b1_tgt'] = self.params['b1']
# self.params['W2_S_tgt'] = self.params['W2_S']
# self.params['b2_S_tgt'] = self.params['b2_S']
# self.params['W2_A_tgt'] = self.params['W2_A']
# self.params['b2_A_tgt'] = self.params['b2_A']
# self.params['W3_tgt'] = self.params['W3']
# self.params['b3_tgt'] = self.params['b3']
# self.params['W4_tgt'] = self.params['W4']
# self.params['b4_tgt'] = self.params['b4']
self.params['W1_tgt'] = self._uniform_init(input_size_S, hidden_size1)
self.params['b1_tgt'] = np.zeros(hidden_size1)
self.params['W2_S_tgt'] = self._uniform_init(hidden_size1, hidden_size2)
self.params['b2_S_tgt'] = np.zeros(hidden_size2)
self.params['W2_A_tgt'] = self._uniform_init(input_size_A, hidden_size2)
self.params['b2_A_tgt'] = np.zeros(hidden_size2)
self.params['W3_tgt'] = self._uniform_init(hidden_size2, hidden_size3)
self.params['b3_tgt'] = np.zeros(hidden_size3)
self.params['W4_tgt'] = np.random.uniform(-3e-3, 3e-3, (hidden_size3, output_size))
self.params['b4_tgt'] = np.zeros(output_size)
        # Initialize the per-parameter optimizer configuration (Adam state)
        # for the different layers
self.optm_cfg ={}
self.optm_cfg['W1'] = None
self.optm_cfg['b1'] = None
self.optm_cfg['W2_S'] = None
self.optm_cfg['b2_S'] = None
self.optm_cfg['W2_A'] = None
self.optm_cfg['b2_A'] = None
self.optm_cfg['W3'] = None
self.optm_cfg['b3'] = None
self.optm_cfg['W4'] = None
self.optm_cfg['b4'] = None
    def evaluate_gradient(self, X_S, X_A, Y_tgt, use_target=False):
        """
        Compute the loss and gradients for the network based on the inputs
        X_S, X_A, and Y_tgt.
        Inputs:
        - X_S: Input for state, shape (N, S), N is the batch size
        - X_A: Input for action, shape (N, A), N is the batch size
        - Y_tgt: Target value for the Q-value, used to update the weights
          (via regression), shape (N, 1)
        - use_target: use default weights if False; otherwise use target weights.
        Returns:
        A tuple of:
        - grads: Dictionary mapping parameter names to gradients of those
          parameters; has the same keys as self.params (without _tgt).
        - loss: the scalar mean-squared-error loss over the batch.
        """
# Unpack variables from the params dictionary
if not use_target:
W1, b1 = self.params['W1'], self.params['b1']
W2_S, b2_S = self.params['W2_S'], self.params['b2_S']
W2_A, b2_A = self.params['W2_A'], self.params['b2_A']
W3, b3 = self.params['W3'], self.params['b3']
W4, b4 = self.params['W4'], self.params['b4']
else:
W1, b1 = self.params['W1_tgt'], self.params['b1_tgt']
W2_S, b2_S = self.params['W2_S_tgt'], self.params['b2_S_tgt']
W2_A, b2_A = self.params['W2_A_tgt'], self.params['b2_A_tgt']
W3, b3 = self.params['W3_tgt'], self.params['b3_tgt']
W4, b4 = self.params['W4_tgt'], self.params['b4_tgt']
        # Compute the forward pass
        # The first hidden layer (state input only)
        z1 = np.dot(X_S, W1) + b1
        H1 = np.maximum(0, z1)
        # The second hidden layer: the action input bypasses the first
        # hidden layer and connects directly here
        z2_S = np.dot(H1, W2_S) + b2_S
        z2_A = np.dot(X_A, W2_A) + b2_A
        H2 = z2_S + z2_A  # no ReLU here
        # The third hidden layer
        z3 = np.dot(H2, W3) + b3
        H3 = np.maximum(0, z3)  # ReLU here
        # The output layer; due to the linear activation, the Q-values are
        # the raw scores
        output = np.dot(H3, W4) + b4
        Q_values = output
        batch_size = np.shape(X_S)[0]
        # Mean-squared-error loss against the regression target
        loss = np.sum((Q_values - Y_tgt) ** 2, axis=0) / (1.0 * batch_size)
        loss = loss[0]  # reduce to a scalar (output_size is assumed to be 1)
        # Error term driving backpropagation
        error = Q_values - Y_tgt
        # Backward propagation
        grads = {}
        # d(loss)/d(output); the 1/batch_size factor from the mean is already
        # included here, so it is not applied again to the gradients below
        grad_output = error * 2 / batch_size
        # Backpropagate to the third hidden layer
        # (as a sanity check, out1 and z3 should have the same shape)
        out1 = grad_output.dot(W4.T)
        # derivative of the max() gate
        out1[z3 <= 0] = 0
        # Backpropagate to the second hidden layer (no ReLU there)
        out2 = out1.dot(W3.T)
        # Backpropagate to the first hidden layer, through the max() gate again
        out3 = out2.dot(W2_S.T)
        out3[z1 <= 0] = 0
        # Parameter gradients
        grads['W4'] = np.dot(H3.T, grad_output)
        grads['W3'] = np.dot(H2.T, out1)
        grads['W2_S'] = np.dot(H1.T, out2)
        grads['W2_A'] = np.dot(X_A.T, out2)
        grads['W1'] = np.dot(X_S.T, out3)
        grads['b4'] = np.sum(grad_output, axis=0)
        grads['b3'] = np.sum(out1, axis=0)
        grads['b2_S'] = np.sum(out2, axis=0)
        grads['b2_A'] = np.sum(out2, axis=0)  # same as b2_S: H2 = z2_S + z2_A
        grads['b1'] = np.sum(out3, axis=0)
        return grads, loss
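    # A finite-difference check of the analytic gradients above could look
    # like the following sketch (illustration only; `net` is a hypothetical
    # CriticNet instance, not part of the module):
    #   eps = 1e-5
    #   num = np.zeros_like(net.params['W1'])
    #   for idx in np.ndindex(*num.shape):
    #       net.params['W1'][idx] += eps
    #       _, lp = net.evaluate_gradient(X_S, X_A, Y_tgt)
    #       net.params['W1'][idx] -= 2 * eps
    #       _, lm = net.evaluate_gradient(X_S, X_A, Y_tgt)
    #       net.params['W1'][idx] += eps
    #       num[idx] = (lp - lm) / (2 * eps)
    #   # num should then closely match evaluate_gradient's grads['W1']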
    def evaluate_action_gradient(self, X_S, X_A, use_target=False):
        """
        Compute the gradient of the Q-values with respect to the action input.
        Inputs:
        - X_S: Input for state, shape (N, S), N is the batch size
        - X_A: Input for action, shape (N, A), N is the batch size
        - use_target: use default weights if False; otherwise use target weights.
        Returns:
        - grads_action: gradient of the Q-values with respect to X_A,
          shape (N, A).
        """
# Unpack variables from the params dictionary
if not use_target:
W1, b1 = self.params['W1'], self.params['b1']
W2_S, b2_S = self.params['W2_S'], self.params['b2_S']
W2_A, b2_A = self.params['W2_A'], self.params['b2_A']
W3, b3 = self.params['W3'], self.params['b3']
W4, b4 = self.params['W4'], self.params['b4']
else:
W1, b1 = self.params['W1_tgt'], self.params['b1_tgt']
W2_S, b2_S = self.params['W2_S_tgt'], self.params['b2_S_tgt']
W2_A, b2_A = self.params['W2_A_tgt'], self.params['b2_A_tgt']
W3, b3 = self.params['W3_tgt'], self.params['b3_tgt']
W4, b4 = self.params['W4_tgt'], self.params['b4_tgt']
        # Compute the forward pass (same as in evaluate_gradient)
        # The first hidden layer
        z1 = np.dot(X_S, W1) + b1
        H1 = np.maximum(0, z1)
        # The second hidden layer; the action input bypasses the first
        # hidden layer and connects directly here
        z2_S = np.dot(H1, W2_S) + b2_S
        z2_A = np.dot(X_A, W2_A) + b2_A
        H2 = z2_S + z2_A  # no ReLU here
        # The third hidden layer
        z3 = np.dot(H2, W3) + b3
        H3 = np.maximum(0, z3)
        # The output layer
        output = np.dot(H3, W4) + b4
        # Backward propagation; seed with ones because we differentiate the
        # Q output itself rather than a loss
        grad_output = np.ones_like(output)
        # Backpropagate to the third hidden layer
        # (as a sanity check, out1 and z3 should have the same shape)
        out1 = grad_output.dot(W4.T)
        # derivative of the max() gate
        out1[z3 <= 0] = 0
        # Backpropagate to the second hidden layer
        out2 = out1.dot(W3.T)
        # The action path enters at the second hidden layer, so the chain
        # stops at W2_A
        grads_action = np.dot(out2, W2_A.T)
        return grads_action
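    # In DDPG this action gradient drives the actor update: for actions
    # a = actor(X_S), the actor's parameters are moved along dQ/da. A hedged
    # sketch, assuming a companion actor network with a matching API:
    #   a = actor.predict(X_S)
    #   dq_da = critic.evaluate_action_gradient(X_S, a)
    #   actor.train(X_S, dq_da)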
    def train(self, X_S, X_A, Y_tgt):
        """
        Run one training step of this neural network using the Adam optimizer.
        Inputs:
        - X_S: state batch, a numpy array of shape (N, S)
        - X_A: action batch, a numpy array of shape (N, A)
        - Y_tgt: Q-value regression targets, a numpy array of shape (N, 1)
        Returns the scalar loss for this minibatch.
        """
        # Compute loss and gradients using the current minibatch
        grads, loss = self.evaluate_gradient(X_S, X_A, Y_tgt, use_target=False)
        # Update the weights with Adam. A single _adam call per parameter
        # both applies the update and advances that parameter's optimizer
        # state (m, v, t) for the next iteration.
        for key in ('W1', 'b1', 'W2_S', 'b2_S', 'W2_A', 'b2_A',
                    'W3', 'b3', 'W4', 'b4'):
            self.params[key], self.optm_cfg[key] = self._adam(
                self.params[key], grads[key], config=self.optm_cfg[key])
        return loss
    def train_target(self, tau):
        """
        Soft-update the weights of the target network:
            theta_tgt <- tau * theta + (1 - tau) * theta_tgt
        - tau: tracking coefficient for following the learned network.
        """
self.params['W4_tgt'] = tau*self.params['W4']+(1-tau)*self.params['W4_tgt']
self.params['W3_tgt'] = tau*self.params['W3']+(1-tau)*self.params['W3_tgt']
self.params['W2_S_tgt'] = tau*self.params['W2_S']+(1-tau)*self.params['W2_S_tgt']
self.params['W2_A_tgt'] = tau*self.params['W2_A']+(1-tau)*self.params['W2_A_tgt']
self.params['W1_tgt'] = tau*self.params['W1']+(1-tau)*self.params['W1_tgt']
self.params['b4_tgt'] = tau*self.params['b4']+(1-tau)*self.params['b4_tgt']
self.params['b3_tgt'] = tau*self.params['b3']+(1-tau)*self.params['b3_tgt']
self.params['b2_S_tgt'] = tau*self.params['b2_S']+(1-tau)*self.params['b2_S_tgt']
self.params['b2_A_tgt'] = tau*self.params['b2_A']+(1-tau)*self.params['b2_A_tgt']
self.params['b1_tgt'] = tau*self.params['b1']+(1-tau)*self.params['b1_tgt']
    def predict(self, X_S, X_A, target=False):
        """
        Use the trained weights of this network to predict the Q-values for
        a batch of state-action pairs.
        Inputs:
        - X_S: numpy array of shape (N, S)
        - X_A: numpy array of shape (N, A)
        - target: if False, use the learned weights; otherwise use the
          target-network weights.
        Returns:
        - y_pred: a numpy array of shape (N, output_size)
        """
if not target:
W1, b1 = self.params['W1'], self.params['b1']
W2_S, b2_S = self.params['W2_S'], self.params['b2_S']
W2_A, b2_A = self.params['W2_A'], self.params['b2_A']
W3, b3 = self.params['W3'], self.params['b3']
W4, b4 = self.params['W4'], self.params['b4']
else:
W1, b1 = self.params['W1_tgt'], self.params['b1_tgt']
W2_S, b2_S = self.params['W2_S_tgt'], self.params['b2_S_tgt']
W2_A, b2_A = self.params['W2_A_tgt'], self.params['b2_A_tgt']
W3, b3 = self.params['W3_tgt'], self.params['b3_tgt']
W4, b4 = self.params['W4_tgt'], self.params['b4_tgt']
        H1 = np.maximum(0, X_S.dot(W1) + b1)
        H2 = np.dot(H1, W2_S) + b2_S + np.dot(X_A, W2_A) + b2_A
        H3 = np.maximum(0, H2.dot(W3) + b3)
        y_pred = np.dot(H3, W4) + b4
        return y_pred
def _adam(self, x, dx, config=None):
"""
Uses the Adam update rule, which incorporates moving averages of both the
gradient and its square and a bias correction term.
config format:
- learning_rate: scalar learning rate.
- beta1: decay rate for moving average of first moment of gradient.
- beta2: decay rate for moving average of second moment of gradient.
- epsilon: small scalar used for smoothing to avoid dividing by zero.
- m: moving average of gradient.
- v: moving average of squared gradient.
- t: iteration number (time step)
"""
if config is None: config = {}
config.setdefault('learning_rate', 1e-3)
config.setdefault('beta1', 0.9)
config.setdefault('beta2', 0.999)
config.setdefault('epsilon', 1e-8)
config.setdefault('m', np.zeros_like(x))
config.setdefault('v', np.zeros_like(x))
config.setdefault('t', 0)
        # Adam update
        config['t'] += 1
        config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dx
        config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * (dx ** 2)
        # Bias-corrected first and second moment estimates
        mb = config['m'] / (1 - config['beta1'] ** config['t'])
        vb = config['v'] / (1 - config['beta2'] ** config['t'])
        next_x = x - config['learning_rate'] * mb / (np.sqrt(vb) + config['epsilon'])
return next_x, config
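    # Glorot/Xavier-style uniform initialization:
    # U(-u, u) with u = sqrt(6 / (fan_in + fan_out)).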
def _uniform_init(self, input_size, output_size):
u = np.sqrt(6./(input_size+output_size))
return np.random.uniform(-u, u, (input_size, output_size))
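

# A minimal smoke-test sketch of the API above. The dimensions, batch size,
# and tau below are hypothetical illustration values, not settings from the
# original project.
if __name__ == "__main__":
    np.random.seed(0)
    S, A = 3, 1                     # assumed state/action dimensions
    H1, H2, H3 = 64, 64, 32         # assumed hidden-layer sizes
    net = CriticNet(S, A, H1, H2, H3, output_size=1)
    X_S = np.random.randn(16, S)    # random state batch
    X_A = np.random.randn(16, A)    # random action batch
    Y_tgt = np.random.randn(16, 1)  # random Q-value regression targets
    loss = net.train(X_S, X_A, Y_tgt)
    net.train_target(tau=0.001)     # soft-update the target network
    q = net.predict(X_S, X_A, target=True)
    dq_da = net.evaluate_action_gradient(X_S, X_A)
    print("loss:", loss, "Q shape:", q.shape, "dQ/dA shape:", dq_da.shape)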