Binary file added .DS_Store
1,173 changes: 899 additions & 274 deletions Lab3-policy-gradient.ipynb


Binary file added fig/p3_loss.png
Binary file added fig/p3_return.png
Binary file added fig/p4_loss.png
Binary file added fig/p4_return.png
Binary file added fig/p5_loss.png
Binary file added fig/p5_return.png
Binary file added fig/p6_loss.png
Binary file added fig/p6_return.png
31 changes: 30 additions & 1 deletion policy_gradient/policy.py
@@ -30,6 +30,20 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 2~4 lines.
"""
# YOUR CODE HERE >>>>>>
with tf.variable_scope("fc1"):
weights = tf.Variable(tf.truncated_normal(shape=[in_dim, hidden_dim], seed=0))
biases = tf.Variable(tf.truncated_normal(shape=[hidden_dim], seed=0))
logit = tf.nn.xw_plus_b(self._observations, weights, biases)
act = tf.tanh(logit)
self.weights = [weights, biases]
with tf.variable_scope("fc2"):
weights = tf.Variable(tf.truncated_normal(shape=[hidden_dim, out_dim], seed=0))
biases = tf.Variable(tf.truncated_normal(shape=[out_dim], seed=0))
logit = tf.nn.xw_plus_b(act, weights, biases)
softmax = tf.nn.softmax(logit)
self.weights.append(weights)
self.weights.append(biases)
probs = softmax
# <<<<<<<<

# --------------------------------------------------
@@ -50,6 +64,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
# 2. Add index of the action chosen at each timestep
# e.g., if index of the action chosen at timestep t = 0 is 1, and index of the action
# chosen at timestep = 1 is 0, then `action_idxs_flattened` == [0, 2] + [1, 0] = [1, 2]

action_idxs_flattened += self._actions

# 3. Gather the probability of action at each timestep
@@ -72,6 +87,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
surr_loss = -tf.reduce_mean(log_prob*self._advantages)
# <<<<<<<<

grads_and_vars = self._opt.compute_gradients(surr_loss)
@@ -90,9 +106,22 @@ def act(self, observation):
# expect observation to be of shape [1, observation_space]
assert observation.shape[0] == 1
action_probs = self._sess.run(self._act_op, feed_dict={self._observations: observation})

#print(observation)
# `action_probs` is an array that has shape [1, action_space], it contains the probability of each action
# sample an action according to `action_probs`
"""
when
action_probs = [0.01, 0.01, 0.97, 0.01]
then
cs = [0.01, 0.02, 0.99, 1.]
cs < np.random.rand() could return [True, True, False, False] with high probability
idx = 2
when
action_probs = [0.25, 0.25, 0.25, 0.25]
then
idx is drawn uniformly from {0, 1, 2, 3}
"""
#print('weight:', self._sess.run(self.weights))
#print('action prob:', action_probs)
cs = np.cumsum(action_probs)
idx = sum(cs < np.random.rand())
return idx
4 changes: 4 additions & 0 deletions policy_gradient/util.py
@@ -32,6 +32,9 @@ def discount_bootstrap(x, discount_rate, b):
Sample code should be about 3 lines
"""
# YOUR CODE >>>>>>>>>>>>>>>>>>>
# append V = 0 for the state after the final step, then shift by one so that
# y[t] = x[t] + discount_rate * b[t+1] (the one-step bootstrapped target)
b = np.concatenate([b,[0]])
y = x + discount_rate*b[1:]
return y
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<

def plot_curve(data, key, filename=None):
@@ -46,5 +49,6 @@ def plot_curve(data, key, filename=None):
plt.close()

def discount(x, discount_factor):
"""y[n] = x[n]+y[n-1]*discount_factor"""
return scipy.signal.lfilter([1.0], [1.0, -discount_factor], x[::-1])[::-1]

57 changes: 56 additions & 1 deletion report.md
@@ -1,3 +1,58 @@
# Homework3-Policy-Gradient report
student name/ID: 翁正欣/106062577
## Problem 1: construct a neural network to represent policy
```
with tf.variable_scope("fc1"):
weights = tf.Variable(tf.truncated_normal(shape=[in_dim, hidden_dim], seed=0))
biases = tf.Variable(tf.truncated_normal(shape=[hidden_dim], seed=0))
logit = tf.nn.xw_plus_b(self._observations, weights, biases)
act = tf.tanh(logit)

with tf.variable_scope("fc2"):
weights = tf.Variable(tf.truncated_normal(shape=[hidden_dim, out_dim], seed=0))
biases = tf.Variable(tf.truncated_normal(shape=[out_dim], seed=0))
logit = tf.nn.xw_plus_b(act, weights, biases)
softmax = tf.nn.softmax(logit)
probs = softmax
```
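Written as a formula (my own notation, with W1, b1 and W2, b2 denoting the fc1/fc2 weights and biases above), this network represents the policy
```
\pi_\theta(a \mid s) = \mathrm{softmax}\!\left(W_2 \tanh(W_1 s + b_1) + b_2\right)_a
```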
## Problem 2: compute the surrogate loss
```
surr_loss = -tf.reduce_mean(log_prob*self._advantages)
```
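In standard notation (my paraphrase, with A_t the advantage fed in through `self._advantages` and T the total number of timesteps in the batch), this line minimizes
```
L_{\text{surr}}(\theta) = -\frac{1}{T}\sum_{t=1}^{T} \log \pi_\theta(a_t \mid s_t)\, A_t
```
so running gradient descent on `surr_loss` is the same as ascending the policy-gradient estimate of the expected return.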
## Problem 3: Use baseline to reduce the variance of our gradient estimate
In this problem, I subtract the **baseline** from the **returns** to form the advantage.
```
a = r-b
```
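A tiny numeric sketch of this step (the numbers are made up, not taken from the experiments; `b` would come from the fitted baseline / value predictor):
```
import numpy as np

r = np.array([10.0, 8.5, 6.0])   # discounted returns for one trajectory (made up)
b = np.array([9.0, 8.0, 7.0])    # baseline prediction for each state (made up)
a = r - b                        # advantages: positive means better than expected
print(a)                         # [ 1.   0.5 -1. ]
```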
## Problem 4: train without baseline
In this problem, I do not use a baseline and use the raw returns directly as the advantage.
## Problem 5: Actor-Critic algorithm (with bootstrapping)
In this problem, I implement the Actor-Critic algorithm with bootstrapping: the return at each timestep is replaced by the one-step bootstrapped target.
```
def discount_bootstrap(x, discount_rate, b):
b = np.concatenate([b,[0]])
y = x + discount_rate*b[1:]
return y
```
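As a quick sanity check (the reward and value arrays below are made up, not from the lab's runs), the function produces the one-step target `y[t] = x[t] + discount_rate * b[t+1]`, treating the value past the final step as 0:
```
import numpy as np

def discount_bootstrap(x, discount_rate, b):
    # same logic as above: shift b by one step and pad a terminal value of 0
    b = np.concatenate([b, [0]])
    return x + discount_rate * b[1:]

x = np.array([1.0, 1.0, 1.0])          # per-timestep rewards r_t (made up)
b = np.array([0.9, 0.8, 0.7])          # critic predictions V(s_t) (made up)
print(discount_bootstrap(x, 0.99, b))  # [1.792 1.693 1.   ]
```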
## Problem 6: Generalized Advantage Estimation
In this problem, I compute the advantage with Generalized Advantage Estimation (GAE), discounting by the product `discount_rate*LAMBDA`.
```
"""
y[0] = x[0] + discount(x[1],1) + discount(x[2],2) + ... + discount(x[len(x)-1], len(x)-1)
y[1] = x[1] + discount(x[2],1) + discount(x[3],2) + ... + discount(x[len(x)-1], len(x)-2)
...
y[n] = x[n] + discount(x[n], 1) + ... + discount(x[len(x)-1], len(x)-n+1)
= x[n] + discount(y[n+1],1)
"""
a = util.discount(a, discount_rate*LAMBDA)
```
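Putting the pieces together, the sketch below is my own reading of the whole GAE computation, under the assumption that the input to `util.discount` is the one-step TD residual; `gamma`, `LAMBDA`, and the reward/value arrays are hypothetical, not the lab's actual settings:
```
import numpy as np
import scipy.signal

def discount(x, discount_factor):
    # y[n] = x[n] + discount_factor * y[n+1], as in util.discount
    return scipy.signal.lfilter([1.0], [1.0, -discount_factor], x[::-1])[::-1]

gamma, LAMBDA = 0.99, 0.98
r = np.array([1.0, 1.0, 1.0])        # rewards (made up)
v = np.array([0.9, 0.8, 0.7])        # critic predictions V(s_t) (made up)
v_next = np.append(v[1:], 0.0)       # V(s_{t+1}), zero after the terminal step
delta = r + gamma * v_next - v       # one-step TD residuals
a = discount(delta, gamma * LAMBDA)  # GAE advantage: sum_l (gamma*LAMBDA)^l * delta_{t+l}
print(a)
```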
## Experiments
All experiments use the same random seed for the environment and for the network's initial weights.

| |Problem 3|Problem 4|Problem 5|Problem 6|
|---|---|---|---|---|
|loss|![](https://i.imgur.com/F1hzpO2.png)|![](https://i.imgur.com/DDDN8qn.png)|![](https://i.imgur.com/Qw1bNwv.png)|![](https://i.imgur.com/TcYEveS.png)|
|average return|![](https://i.imgur.com/CIgBDGx.png)|![](https://i.imgur.com/aJuHqco.png)|![](https://i.imgur.com/SY5MnRO.png)|![](https://i.imgur.com/v02OZ37.png)|

## Conclusion
The Problem 4 setting (no baseline) converges the fastest in this task. That is unexpected, so I may have implemented something incorrectly...