830 changes: 634 additions & 196 deletions Lab3-policy-gradient.ipynb

Large diffs are not rendered by default.

Binary file added p3_1.png
Binary file added p3_2.png
Binary file added p4_1.png
Binary file added p4_2.png
Binary file added p5_1.png
Binary file added p5_2.png
Binary file added p6_1.png
Binary file added p6_2.png
4 changes: 4 additions & 0 deletions policy_gradient/policy.py
@@ -31,6 +31,9 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
"""
# YOUR CODE HERE >>>>>>
# <<<<<<<<
layer1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
layer2 = tf.contrib.layers.fully_connected(layer1, out_dim, activation_fn=tf.nn.softmax)
probs = layer2

# --------------------------------------------------
# This operation (variable) is used when choosing action during data sampling phase
@@ -72,6 +75,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
surr_loss = tf.reduce_mean(-log_prob * self._advantages)
# <<<<<<<<

grads_and_vars = self._opt.compute_gradients(surr_loss)
2 changes: 2 additions & 0 deletions policy_gradient/util.py
@@ -32,6 +32,8 @@ def discount_bootstrap(x, discount_rate, b):
Sample code should be about 3 lines
"""
# YOUR CODE >>>>>>>>>>>>>>>>>>>
b = np.append(b[1:], 0)  # shift so index t holds V(s_{t+1}); the last step bootstraps with 0
return x + discount_rate * b
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<

def plot_curve(data, key, filename=None):
49 changes: 48 additions & 1 deletion report.md
@@ -1,3 +1,50 @@
# Homework3-Policy-Gradient report
## Problem 1
Following the instructions: use `tanh` as the activation function of the first hidden layer, and append a softmax layer after the output layer to get the probability of each possible action.
```python
layer1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
layer2 = tf.contrib.layers.fully_connected(layer1, out_dim, activation_fn=tf.nn.softmax)
probs = layer2
```
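For context, a small sketch of how this probability output could feed the action-sampling op that the skeleton sets up next in `policy.py` (the actual op is provided by the lab; `tf.multinomial` here is only an illustrative choice):
```python
# hypothetical: draw one action index per observation from the softmax output
act_op = tf.multinomial(tf.log(probs), num_samples=1)
```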
## Problem 2
Compute the surrogate loss and assign it to the variable `surr_loss`. The optimizer minimizes the loss, but we want to maximize the expected return, so the objective is negated.
```python
surr_loss = tf.reduce_mean(-log_prob * self._advantages)
```
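For reference, one plausible way the skeleton could form `log_prob` from `probs` and the sampled actions (the placeholder name `self._actions` is an assumption; only the general pattern matters):
```python
# hypothetical sketch: log-probability of the action actually taken at each step
log_prob = tf.log(tf.reduce_sum(probs * tf.one_hot(self._actions, out_dim), axis=1))
```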

## Problem 3
Subtract the baseline prediction `b` from the accumulated discounted return `r` and assign the result to the variable `a`. This reduces the variance of the policy-gradient estimate without biasing it.
```python
a = r - b
```
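A standalone toy check (plain NumPy, unrelated to the lab's data) of why subtracting a baseline helps: the estimator keeps the same mean but its variance drops sharply.
```python
import numpy as np

rng = np.random.default_rng(0)
returns = rng.normal(loc=10.0, scale=2.0, size=10_000)  # toy returns
score = rng.choice([-1.0, 1.0], size=10_000)            # toy grad-log-prob values (mean 0)
baseline = returns.mean()

g_raw = score * returns                 # REINFORCE term without baseline
g_base = score * (returns - baseline)   # with baseline subtracted

print(g_raw.mean(), g_base.mean())      # both close to 0: still unbiased
print(g_raw.var(), g_base.var())        # variance shrinks from ~104 to ~4
```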
![](https://i.imgur.com/78ARcKp.png)
![](https://i.imgur.com/TfxMgpZ.png)

## Problem 4
Learning curves without baseline subtraction.
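In terms of Problem 3's variables, removing the baseline just means the predicted values are zero, so the advantage degenerates to the raw discounted return (a sketch; the actual switch lives in the notebook's training setup):
```python
# hypothetical: baseline disabled, so the "advantage" is the return itself
b = np.zeros_like(r)
a = r - b   # i.e. a = r
```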

![](https://i.imgur.com/hAIowgF.png)
![](https://i.imgur.com/jxAnLjv.png)

## Problem 5
Following the one-step bootstrapped target `y_t = r(s_t, a_t, s_{t+1}) + \gamma * V(s_{t+1})`, where the value after the final step is taken to be 0:
```python
b = np.append(b[1:], 0)       # shift so index t holds V(s_{t+1}); the last step bootstraps with 0
y = x + discount_rate * b
```
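A tiny numeric check of the bootstrapped target (standalone NumPy; `0.99` is just an example discount rate):
```python
import numpy as np

x = np.array([1.0, 1.0, 1.0])        # immediate rewards r_t along one path
b = np.array([5.0, 4.0, 3.0])        # predicted values V(s_t)

# shift so index t holds V(s_{t+1}); the terminal step bootstraps with 0
y = x + 0.99 * np.append(b[1:], 0)   # -> [4.96, 3.97, 1.0]
```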

![](https://i.imgur.com/fonXELu.png)
![](https://i.imgur.com/20uIuUT.png)


## Problem 6
Generalized Advantage Estimation: the one-step TD residuals are summed with exponentially decaying weights, `A_t = sum_l (\gamma\lambda)^l * \delta_{t+l}`, which is implemented by reusing `util.discount` with rate `\gamma * LAMBDA`.
```python
a = util.discount(a, self.discount_rate * LAMBDA)
```
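Putting the pieces together, a sketch of how the per-path GAE advantage could be assembled from the earlier helpers (`p["rewards"]` and `b` holding the per-step value predictions are assumed names; `LAMBDA` is the extra discount factor):
```python
# one-step targets and TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
y = util.discount_bootstrap(p["rewards"], self.discount_rate, b)
delta = y - b
# GAE: exponentially weighted sum of residuals, A_t = sum_l (gamma * lambda)^l * delta_{t+l}
a = util.discount(delta, self.discount_rate * LAMBDA)
```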

![](https://i.imgur.com/HnAgfXD.png)
![](https://i.imgur.com/8vN4nNC.png)



TA: try to elaborate on the algorithms that you implemented and any details worth mentioning.