847 changes: 649 additions & 198 deletions Lab3-policy-gradient.ipynb

Large diffs are not rendered by default.

Binary file added images/actor_critic_avg_return.png
Binary file added images/actor_critic_loss.png
Binary file added images/baseline_bias.png
Binary file added images/gae_avg_return.png
Binary file added images/gae_loss.png
Binary file added images/nn.jpg
Binary file added images/surrogate_loss_1.png
Binary file added images/surrogate_loss_2.png
Binary file added images/with_baseline_avg_return.png
Binary file added images/with_baseline_loss.png
Binary file added images/without_baseline_avg_return.png
Binary file added images/without_baseline_loss.png
5 changes: 5 additions & 0 deletions policy_gradient/policy.py
@@ -30,6 +30,9 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 2~4 lines.
"""
# YOUR CODE HERE >>>>>>
# Two dense layers: tanh hidden layer, then logits -> softmax action probabilities
hidden = tf.layers.dense(self._observations, hidden_dim, activation=tf.nn.tanh)
output = tf.layers.dense(hidden, out_dim)
probs = tf.nn.softmax(output)
# <<<<<<<<

# --------------------------------------------------
@@ -72,6 +75,8 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
# Negate so that minimizing the loss maximizes E[log pi(a|s) * advantage]
surr_loss = -tf.reduce_mean(log_prob * self._advantages)
# <<<<<<<<

grads_and_vars = self._opt.compute_gradients(surr_loss)
2 changes: 2 additions & 0 deletions policy_gradient/util.py
@@ -32,6 +32,8 @@ def discount_bootstrap(x, discount_rate, b):
Sample code should be about 3 lines
"""
# YOUR CODE >>>>>>>>>>>>>>>>>>>
# One-step bootstrapped target: y_t = x_t + gamma * b_{t+1}, with b_T taken as 0
y = x + discount_rate * np.append(b, 0)[1:]
return y
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<

def plot_curve(data, key, filename=None):
81 changes: 80 additions & 1 deletion report.md
@@ -1,3 +1,82 @@
# Homework3-Policy-Gradient report

TA: try to elaborate on the algorithms you implemented and any details worth mentioning.
## Problem 1: Construct a neural network to represent policy

In this homework, we use a simple two-layer neural network to represent the policy.

![](images/nn.jpg)

```python
hidden = tf.layers.dense(self._observations, hidden_dim, activation=tf.nn.tanh)
output = tf.layers.dense(hidden, out_dim)
probs = tf.nn.softmax(output)
```
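
In equation form, with $W_1, b_1, W_2, b_2$ the weights and biases of the two dense layers, the network defines a categorical policy over actions:

$$
\pi_\theta(a \mid s) = \operatorname{softmax}\big(W_2 \tanh(W_1 s + b_1) + b_2\big)_a
$$

The tanh keeps the hidden activations bounded, and the softmax turns the logits into a valid probability distribution over actions.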

## Problem 2: Compute the surrogate loss

![](images/surrogate_loss_1.png)

![](images/surrogate_loss_2.png)

```python
surr_loss = -tf.reduce_mean(log_prob * self._advantages)
```
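
In equation form, this is the negated REINFORCE surrogate, averaged over the $N$ state-action pairs in the sampled batch:

$$
L(\theta) = -\frac{1}{N} \sum_{i=1}^{N} \log \pi_\theta(a_i \mid s_i)\, A_i
$$

Minimizing $L(\theta)$ with the optimizer therefore performs gradient ascent on the expected return.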

## Problem 3: Baseline Method

We subtract a baseline $b$ (an estimate of the expected return from the current state) from the return $r$ to reduce the variance of our gradient estimate:

```python
a = r - b
```
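
A minimal numpy sketch of this step, with illustrative numbers (the names `returns` and `predicted_values` are ours, not the lab's actual identifiers):

```python
import numpy as np

# Discounted returns r_t along one trajectory, and the baseline's
# value predictions b_t for the same states (illustrative numbers).
returns = np.array([5.0, 4.2, 3.1, 1.0])
predicted_values = np.array([4.8, 4.0, 3.5, 0.9])

# Advantage: how much better the observed return was than predicted.
advantages = returns - predicted_values
```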

## Problem 4: Without Baseline

Why doesn't subtracting a baseline introduce bias?

![](images/baseline_bias.png)
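
In short: for a baseline $b(s)$ that depends only on the state, not on the action, the term it contributes to the gradient estimator vanishes in expectation:

$$
\mathbb{E}_{a \sim \pi_\theta}\big[\nabla_\theta \log \pi_\theta(a \mid s)\, b(s)\big]
= b(s) \sum_{a} \pi_\theta(a \mid s)\, \nabla_\theta \log \pi_\theta(a \mid s)
= b(s)\, \nabla_\theta \sum_{a} \pi_\theta(a \mid s)
= b(s)\, \nabla_\theta 1 = 0
$$

So subtracting a baseline changes only the variance of the estimator, never its mean.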

##### Compare with and without baseline

<table>
<tr>
<td> With Baseline </td>
<td> <img src="images/with_baseline_loss.png"/> </td>
<td> <img src="images/with_baseline_avg_return.png"/> </td>
</tr>
<tr>
<td> Without Baseline </td>
<td> <img src="images/without_baseline_loss.png"/> </td>
<td> <img src="images/without_baseline_avg_return.png"/> </td>
</tr>
</table>

> With a baseline, the variance of the gradient estimate is reduced, so training is more stable than without one.

## Problem 5: Actor-Critic algorithm with bootstrapping

<table>
<tr>
<td> <img src="images/actor_critic_loss.png"/> </td>
<td> <img src="images/actor_critic_avg_return.png"/> </td>
</tr>
</table>

```python
y = x + discount_rate * np.append(b, 0)[1:]
return y
```
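
A quick self-contained check of this one-step bootstrap, $y_t = r_t + \gamma\, V(s_{t+1})$ with the terminal value taken as $0$ (numbers are illustrative):

```python
import numpy as np

def discount_bootstrap(x, discount_rate, b):
    # y_t = x_t + gamma * b_{t+1}; the value after the last step is taken as 0.
    return x + discount_rate * np.append(b, 0)[1:]

rewards = np.array([1.0, 1.0, 1.0])   # r_0, r_1, r_2
values = np.array([2.5, 1.8, 0.9])    # V(s_0), V(s_1), V(s_2)
targets = discount_bootstrap(rewards, 0.99, values)
# targets == [1 + 0.99*1.8, 1 + 0.99*0.9, 1 + 0.99*0] == [2.782, 1.891, 1.0]
```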

## Problem 6: Generalized Advantage Estimation

<table>
<tr>
<td> <img src="images/gae_loss.png"/> </td>
<td> <img src="images/gae_avg_return.png"/> </td>
</tr>
</table>

```python
a = util.discount(a, self.discount_rate * LAMBDA)
```
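
For reference, a self-contained numpy sketch of the full GAE computation under the usual definitions (the `discount` helper below is a stand-in for what `util.discount` presumably implements; `LAMBDA` is the GAE parameter $\lambda$):

```python
import numpy as np

def discount(x, rate):
    # Discounted cumulative sum from the end: out_t = sum_k rate^k * x_{t+k}
    out = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + rate * running
        out[t] = running
    return out

gamma, LAMBDA = 0.99, 0.98
rewards = np.array([1.0, 1.0, 1.0])
values = np.array([2.5, 1.8, 0.9])

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), with V(s_T) = 0
deltas = rewards + gamma * np.append(values, 0)[1:] - values

# GAE: A_t = sum_k (gamma * lambda)^k * delta_{t+k}
advantages = discount(deltas, gamma * LAMBDA)
```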