487 changes: 282 additions & 205 deletions Lab3-policy-gradient.ipynb

Large diffs are not rendered by default.

Binary file added photo/Q3.png
Binary file added photo/Q4.png
Binary file added photo/Q6.png
3 changes: 3 additions & 0 deletions policy_gradient/policy.py
@@ -30,6 +30,8 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 2~4 lines.
"""
# YOUR CODE HERE >>>>>>
h1 = tf.layers.dense(self._observations, units=hidden_dim, activation=tf.tanh)
probs = tf.layers.dense(h1, units=out_dim, activation=tf.nn.softmax)
# <<<<<<<<

# --------------------------------------------------
@@ -72,6 +74,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
surr_loss = -tf.reduce_mean(log_prob * self._advantages)
# <<<<<<<<

grads_and_vars = self._opt.compute_gradients(surr_loss)
4 changes: 3 additions & 1 deletion policy_gradient/util.py
@@ -32,8 +32,10 @@ def discount_bootstrap(x, discount_rate, b):
Sample code should be about 3 lines
"""
# YOUR CODE >>>>>>>>>>>>>>>>>>>
bb = np.append(b[1:],0.0)
return x + discount_rate*bb
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<

def plot_curve(data, key, filename=None):
# plot the surrogate loss curve
x = np.arange(len(data))
49 changes: 48 additions & 1 deletion report.md
@@ -1,3 +1,50 @@


# Homework3-Policy-Gradient report

TA: try to elaborate on the algorithms you implemented and any details worth mentioning.
## Problem 1: Construct a neural network to represent policy
```Python
h1 = tf.layers.dense(self._observations, units=hidden_dim, activation=tf.tanh)
probs = tf.layers.dense(h1, units=out_dim, activation=tf.nn.softmax)
```
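For context, a minimal sketch of how an action could be sampled from `probs` during a rollout (TF 1.x API; the starter code's actual sampling op is not shown in this diff, so the lines below are an assumption):

```Python
import tensorflow as tf  # TF 1.x, matching the tf.layers calls above

# Hypothetical sketch: draw one action index per observation from the
# categorical distribution defined by `probs`. tf.multinomial expects logits,
# and log-probabilities differ from logits only by a per-row constant.
act_op = tf.squeeze(tf.multinomial(tf.log(probs), num_samples=1), axis=1)
# At rollout time, something like:
#   action = session.run(act_op, feed_dict={self._observations: ob[None, :]})[0]
```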

## Problem 2: Compute the surrogate loss
```Python
surr_loss = -tf.reduce_mean(log_prob * self._advantages)
```
The first two problems build, respectively, the network architecture and the loss function for Policy Gradient.
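In equation form this is the standard REINFORCE surrogate (matching the one-liner above), so minimizing it with the optimizer performs gradient ascent on expected return:

```latex
L(\theta) = -\frac{1}{N}\sum_{i=1}^{N}\sum_{t=0}^{T_i-1} \log \pi_\theta\!\left(a_{i,t}\mid s_{i,t}\right) A_{i,t}
```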

## Problem 3: Use baseline to reduce the variance of our gradient estimate
```Python
a = r - b
```
The first three problems mainly build the Policy Gradient structure. The neural network computes the probability of each direction and trajectories are credited accordingly: the further up (the steeper), the higher the score. A baseline then has to be set so that the scores of the worse paths are pushed down.
![Q3](https://github.com/w95wayne10/homework3-policy-gradient/blob/master/photo/Q3.png)
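A minimal NumPy sketch of the per-path computation around `a = r - b` (a straightforward version of the `discount_cumsum` helper used later; names such as `baseline.predict` and `path["rewards"]` are assumptions about the surrounding starter code, which this diff does not show):

```Python
import numpy as np

def discount_cumsum(x, discount_rate):
    # out[t] = x[t] + discount_rate * x[t+1] + discount_rate**2 * x[t+2] + ...
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount_rate * running
        out[t] = running
    return out

# Hypothetical per-path usage:
#   r = discount_cumsum(path["rewards"], discount_rate)  # Monte-Carlo returns
#   b = baseline.predict(path)                           # fitted value estimates V(s_t)
#   a = r - b                                            # advantages with the baseline subtracted
```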

## Problem 4: Remove the baseline
Here we test what happens when the baseline is removed: the convergence speed drops.
![Q4](https://github.com/w95wayne10/homework3-policy-gradient/blob/master/photo/Q4.png)

## Problem 5: Actor-Critic algorithm (with bootstrapping)
```Python
bb = np.append(b[1:],0.0)
return x + discount_rate*bb
```
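A tiny worked example (hypothetical numbers) of what `discount_bootstrap` returns, assuming `b` holds the critic's value estimates V(s_t) for the same path:

```Python
import numpy as np

rewards = np.array([1.0, 1.0, 1.0])   # r_0, r_1, r_2 for a 3-step path
values  = np.array([0.5, 0.4, 0.3])   # b[t] = V(s_t) predicted by the critic
gamma   = 0.99

shifted = np.append(values[1:], 0.0)  # align b[t+1] with r[t]; the value after the last step is 0
targets = rewards + gamma * shifted   # one-step targets r_t + gamma * V(s_{t+1})
print(targets)                        # [1.396, 1.297, 1.0]
```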

## Problem 6: Generalized Advantage Estimation
```Python
r = util.discount_bootstrap(p["rewards"], self.discount_rate, b)
target_v = util.discount_cumsum(p["rewards"], self.discount_rate)
a = r - b
a = util.discount(a, self.discount_rate * LAMBDA)
```
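Mapping these lines to the usual GAE definition: `discount_bootstrap` yields the one-step target r_t + γ·b_{t+1}, so `a = r - b` is the TD residual δ_t, and `util.discount` with rate γλ (assuming it is a discounted cumulative sum like `discount_cumsum`) accumulates the λ-weighted advantage; `target_v` is presumably the regression target used to refit the baseline:

```latex
\delta_t = r_t + \gamma\, b_{t+1} - b_t,
\qquad
\hat{A}^{\mathrm{GAE}(\gamma,\lambda)}_t = \sum_{l=0}^{T-t-1} (\gamma\lambda)^{l}\, \delta_{t+l}
```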
Optimizations are made to the sampling scheme (Problem 5) and to how the baseline is taken (the expected value for each direction, Problem 6), so that training finishes faster. The figure shows the convergence speed rising noticeably. (Since this combines the results of Problems 3 and 5, a mistake in Problem 5 would also show up here.)
![Q6](https://github.com/w95wayne10/homework3-policy-gradient/blob/master/photo/Q6.png)