1,066 changes: 854 additions & 212 deletions Lab3-policy-gradient.ipynb


4 changes: 4 additions & 0 deletions policy_gradient/policy.py
@@ -1,4 +1,5 @@
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

# In this Lab, we just use a categorical policy, which is used for MDPs with a discrete action space
@@ -30,6 +31,8 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 2~4 lines.
"""
# YOUR CODE HERE >>>>>>
x = slim.fully_connected(self._observations, hidden_dim, activation_fn=tf.nn.tanh, scope='policy/fc_1')
probs = slim.fully_connected(x, out_dim, activation_fn=tf.nn.softmax, scope='policy/fc_2')
# <<<<<<<<

# --------------------------------------------------
@@ -72,6 +75,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
surr_loss = - tf.reduce_mean(log_prob * self._advantages)
# <<<<<<<<

grads_and_vars = self._opt.compute_gradients(surr_loss)
18 changes: 17 additions & 1 deletion policy_gradient/util.py
@@ -27,11 +27,14 @@ def discount_bootstrap(x, discount_rate, b):
x: the immediate reward for each timestep. e.g. [1, 1, 0]
discount_rate: the \gamma in standard reinforcement learning
b: the prediction of the baseline. e.g. [1.3, 0.4, 0.2]
Returns: a numpy array y = r(s_t,a,s_{t+1}) + \gamma*V_t
(the shape of it should be the same as the `x` and `b`)
Sample code should be about 3 lines
"""
# YOUR CODE >>>>>>>>>>>>>>>>>>>
# shift the baseline one step left so that b[t+1] lines up with x[t]; pad the end with 0
left_shift_arr = lambda arr, shift: np.hstack([arr[shift:], [0]*shift])
return x + discount_rate * left_shift_arr(b, 1)
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<

def plot_curve(data, key, filename=None):
@@ -45,6 +48,19 @@ def plot_curve(data, key, filename=None):
plt.show()
plt.close()

def plot_curves(data_dict, key, filename=None):
# plot one curve per label in data_dict for the given key
for data_label in data_dict:
x = np.arange(len(data_dict[data_label]))
plt.plot(x, data_dict[data_label])
plt.text(x[-1], data_dict[data_label][-1], data_label)
plt.xlabel("iterations")
plt.ylabel(key)
if filename is not None:
plt.savefig(filename)
plt.show()
plt.close()

def discount(x, discount_factor):
return scipy.signal.lfilter([1.0], [1.0, -discount_factor], x[::-1])[::-1]

Binary file added prob3.png
Binary file added prob4.png
Binary file added prob4_bias.png
Binary file added prob5.png
Binary file added prob6.png
48 changes: 47 additions & 1 deletion report.md
@@ -1,3 +1,49 @@
# Homework3-Policy-Gradient report

TA: try to elaborate the algorithms that you implemented and any details worth mentioning.
## Problem 1

Basically, just use TensorFlow to construct two fully connected layers, with tanh as the hidden activation and softmax as the output activation. I use slim here for simplicity.
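
A minimal self-contained sketch of this construction (the placeholder input and the sizes 4/32/2 are illustrative; in `policy.py` the input tensor is `self._observations` and the sizes come from `in_dim`, `hidden_dim`, and `out_dim`):

```python
import tensorflow as tf
import tensorflow.contrib.slim as slim

# Two fully connected layers: a tanh hidden layer and a softmax output layer
# that produces a categorical distribution over the discrete actions.
observations = tf.placeholder(tf.float32, shape=[None, 4])      # in_dim
hidden = slim.fully_connected(observations, 32,                 # hidden_dim
                              activation_fn=tf.nn.tanh, scope='policy/fc_1')
probs = slim.fully_connected(hidden, 2,                         # out_dim
                             activation_fn=tf.nn.softmax, scope='policy/fc_2')
```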

## Problem 2

Simply multiply the calculated log probability by the advantage and use the result to compute the loss.
The only important thing here is that we want to maximize the surrogate objective, so we add a negative sign to the loss tensor so that the optimizer's minimization achieves the maximization.
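
A minimal sketch of the surrogate loss (the placeholders stand in for the `log_prob` and `self._advantages` tensors built in `policy.py`):

```python
import tensorflow as tf

log_prob = tf.placeholder(tf.float32, shape=[None])     # log pi(a_t | s_t) per timestep
advantages = tf.placeholder(tf.float32, shape=[None])   # advantage estimate per timestep
# Negative sign: the optimizer minimizes, so minimizing -E[log_prob * advantage]
# maximizes the policy-gradient objective.
surr_loss = -tf.reduce_mean(log_prob * advantages)
```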

## Problem 3

Simply calculate the advantages by subtracting the predicted baseline from the rewards.
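
A minimal sketch of the computation, assuming `r` holds the discounted returns of one episode and `b` the baseline's predictions for the same timesteps (the names and numbers are illustrative):

```python
import numpy as np

r = np.array([2.0, 1.5, 1.0])   # discounted returns
b = np.array([1.3, 0.4, 0.2])   # baseline predictions
a = r - b                       # advantage = return minus predicted baseline
```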

<img src='./prob3.png'></img>

## Problem 4

Compared with the plot in Problem 3, the variance in the loss plot becomes relatively large.

<img src='./prob4.png'></img>

About the proof of unbiasedness:
<img src='./prob4_bias.png'></img>

Basically, this follows from the expectation of the gradient of the log-probability: the gradient can be moved in front of the summation over actions, and since the probabilities sum to 1, the gradient of that constant is zero, so the baseline introduces no bias.
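
In symbols, for a state-dependent baseline $b(s_t)$ the extra term contributed by the baseline vanishes:

$$
\mathbb{E}_{a_t \sim \pi_\theta}\big[\nabla_\theta \log \pi_\theta(a_t|s_t)\, b(s_t)\big]
= b(s_t) \sum_{a_t} \pi_\theta(a_t|s_t)\, \nabla_\theta \log \pi_\theta(a_t|s_t)
= b(s_t) \sum_{a_t} \nabla_\theta \pi_\theta(a_t|s_t)
= b(s_t)\, \nabla_\theta \sum_{a_t} \pi_\theta(a_t|s_t)
= b(s_t)\, \nabla_\theta 1
= 0.
$$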

## Problem 5

Here we implement the actor-critic strategy to replace the original one.

However, the loss here does not converge very well and training tends to have a long tail; possibly the current implementation is still too naive.
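
A sketch of how the actor-critic advantage is formed here: the one-step bootstrapped target comes from `discount_bootstrap` in `util.py`, and subtracting the baseline prediction gives the TD error (the function and argument names below are illustrative, not taken from the notebook):

```python
import numpy as np

def one_step_advantage(rewards, values, gamma):
    next_values = np.append(values[1:], 0.0)    # V(s_{t+1}), zero after the last step
    target = rewards + gamma * next_values      # y_t = r_t + gamma * V(s_{t+1})
    return target - values                      # TD error used as the advantage
```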

<img src='./prob5.png'></img>

## Problem 6

We use GAE in this example; it is a kind of compromise between our original strategy and actor-critic. The result is just as we expected: a smooth blend between the two approaches.
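
A sketch of the GAE computation under the usual formulation (the names, and the use of `scipy.signal.lfilter` in the same way as `util.discount`, are assumptions for illustration):

```python
import numpy as np
import scipy.signal

def gae_advantages(rewards, values, gamma, lam):
    next_values = np.append(values[1:], 0.0)
    deltas = rewards + gamma * next_values - values   # one-step TD errors
    # discounted suffix sums of the TD errors with factor gamma * lam
    return scipy.signal.lfilter([1.0], [1.0, -gamma * lam], deltas[::-1])[::-1]
```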

<img src='./prob6.png'></img>

## Plot everything together

This plot makes it clearer to compare Problems 3/4 and Problems 3/5/6.

<img src='./total.png'></img>


Binary file added total.png