HW3 #32

477 changes: 235 additions & 242 deletions Lab3-policy-gradient.ipynb


5 changes: 5 additions & 0 deletions policy_gradient/policy.py
@@ -29,6 +29,9 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):

Sample solution is about 2~4 lines.
"""
hidden_layer1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
hidden_layer2 = tf.contrib.layers.fully_connected(hidden_layer1, out_dim, activation_fn=None)
probs = tf.nn.softmax(hidden_layer2)
# YOUR CODE HERE >>>>>>
# <<<<<<<<

@@ -74,6 +77,8 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
# YOUR CODE HERE >>>>>>
# <<<<<<<<

surr_loss = -tf.reduce_mean(tf.multiply(log_prob, self._advantages))

grads_and_vars = self._opt.compute_gradients(surr_loss)
train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

3 changes: 3 additions & 0 deletions policy_gradient/util.py
@@ -32,6 +32,9 @@ def discount_bootstrap(x, discount_rate, b):
Sample code should be about 3 lines
"""
# YOUR CODE >>>>>>>>>>>>>>>>>>>
b_next = np.append(b[1:], 0)
y = x + discount_rate * b_next
return y
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<

def plot_curve(data, key, filename=None):
49 changes: 49 additions & 0 deletions report.md
@@ -1,3 +1,52 @@
# Homework3-Policy-Gradient report

TA: try to elaborate on the algorithms that you implemented and any details worth mentioning.
### Problem 1
Build a two-layer NN that maps observations to action probabilities.
+ Code
```python
hidden_layer1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
hidden_layer2 = tf.contrib.layers.fully_connected(hidden_layer1, out_dim, activation_fn=None)
probs = tf.nn.softmax(hidden_layer2)
```
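The `tf.contrib.layers` API above is TensorFlow 1.x. As a framework-free illustration of the same forward pass (tanh hidden layer, linear output layer, softmax over actions), here is a minimal NumPy sketch; the dimensions and random weights are hypothetical:

```python
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)  # subtract max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

# hypothetical sizes: 4 observation features, 8 hidden units, 2 actions
rng = np.random.default_rng(0)
W1 = rng.normal(size=(4, 8))
W2 = rng.normal(size=(8, 2))

obs = rng.normal(size=(1, 4))   # one observation
h = np.tanh(obs @ W1)           # hidden layer with tanh activation
probs = softmax(h @ W2)         # action probabilities, rows sum to 1
```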

### Problem 2
+ Code
```python
surr_loss = -tf.reduce_mean(tf.multiply(log_prob, self._advantages))
```
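The surrogate loss is the negative mean of log-probability times advantage, so minimizing it ascends the policy gradient. A toy NumPy check with made-up values:

```python
import numpy as np

log_prob = np.array([-0.5, -1.2, -0.3])   # log pi(a_t | s_t) of sampled actions (toy)
advantages = np.array([1.0, -0.5, 2.0])   # advantage estimates (toy)

# negative sign: the optimizer minimizes, but we want gradient *ascent*
surr_loss = -np.mean(log_prob * advantages)  # -mean([-0.5, 0.6, -0.6]) = 0.5/3
```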
### Problem 3
Given the baseline `b` and the accumulated reward `r`, the advantage is simply `a = r - b`.
+ Code
```python
a = r - b
```
![](https://i.imgur.com/iVfUkwI.png) ![](https://i.imgur.com/z9AnuHS.png)
This run finished at 78 iterations; however, 4 out of 5 of my training runs exceeded 80 iterations, and some even exceeded 100.
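On toy numbers, the advantage computation is just an elementwise subtraction; positive entries mean the action did better than the baseline predicted (values below are illustrative):

```python
import numpy as np

r = np.array([10.0, 8.0, 5.0])   # accumulated (discounted) rewards, toy values
b = np.array([7.0, 7.0, 7.0])    # baseline predictions per timestep, toy values
a = r - b                        # advantages: [3.0, 1.0, -2.0]
```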

### Problem 4
+ Code
```python
baseline = None
```
![](https://i.imgur.com/6EESZ5u.png) ![](https://i.imgur.com/TGzCt8K.png)

Removing the baseline wouldn't introduce bias, since the baseline does not depend on the action: its expected contribution to the policy gradient is zero.

### Problem 5
+ Code
```python
b_next = np.append(b[1:], 0)
y = x + discount_rate * b_next
```
![](https://i.imgur.com/jXQSQGi.png)
![](https://i.imgur.com/javH0dL.png)
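A toy check of the one-step bootstrapped target y_t = x_t + γ·b_{t+1} (taking the baseline as 0 past the final step); the reward and baseline values below are made up:

```python
import numpy as np

def discount_bootstrap(x, discount_rate, b):
    b_next = np.append(b[1:], 0)      # shift baseline left one step; 0 after terminal
    return x + discount_rate * b_next

x = np.array([1.0, 1.0, 1.0])         # per-step rewards (toy)
b = np.array([5.0, 4.0, 3.0])         # baseline values V(s_t) (toy)
y = discount_bootstrap(x, 0.9, b)     # [1 + 0.9*4, 1 + 0.9*3, 1 + 0.9*0]
```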

### Problem 6
+ Code
```python
a = util.discount(a, self.discount_rate * LAMBDA)
```
![](https://i.imgur.com/yzH4rfr.png)
![](https://i.imgur.com/2F9VqJz.png)
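Problem 6 reuses the discounting helper on the per-step TD residuals δ_t with rate γλ to form generalized advantage estimates. A self-contained sketch; the `discount` function below is my assumption of what `util.discount` computes (a reversed cumulative discounted sum), and the residual values are toy numbers:

```python
import numpy as np

def discount(x, rate):
    # reversed cumulative discounted sum: out[t] = sum_k rate**k * x[t + k]
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + rate * running
        out[t] = running
    return out

gamma, LAMBDA = 0.99, 0.98
deltas = np.array([0.5, -0.2, 1.0])        # TD residuals delta_t (toy values)
advantages = discount(deltas, gamma * LAMBDA)
```

With λ = 1 this recovers the plain discounted-return advantage; with λ = 0 it keeps only the one-step residual, trading variance for bias.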