Commit 9845ded

Update

1 parent 30cff1d

5 files changed: +37 -23 lines


README.md

+17 -4

@@ -1,5 +1,5 @@
 # Robot Exploration with Deep Reinforcement Learning
-This repository contains code for robot exploration training with Deep Reinforcement Learning (DRL). The agent utilize the local structure of the environment to predict robot’s optimal sensing action. A demonstration can be found here -> www.youtube.com/watch?v=2gNF6efv12s
+This repository contains code for robot exploration training with Deep Reinforcement Learning (DRL). The agent utilizes the local structure of the environment to predict the robot’s optimal sensing action. A demonstration video can be found here -> www.youtube.com/watch?v=2gNF6efv12s
 
 <p align='center'>
 <img src="/doc/exploration.png" alt="drawing" width="1000"/>
@@ -12,7 +12,13 @@ This repository contains code for robot exploration training with Deep Reinforce
 ## Dependency
 - Python 3
 - [scikit-image](https://scikit-image.org/)
+```
+pip3 install scikit-image
+```
 - [tensorboardX](https://github.com/lanpa/tensorboardX)
+```
+pip3 install tensorboardX
+```
 - [TensorFlow](https://www.tensorflow.org/install) (code is written under TF1.x but it is modified to be compatible with TF2)
 - [pybind11](https://github.com/pybind/pybind11) (pybind11 — Seamless operability between C++11 and Python)
 ```
@@ -51,8 +57,14 @@ make
 TRAIN = True
 PLOT = False
 ```
-Set ``TRAIN=False`` to run saved policy. You can train your own policy by set ``TRAIN=True``. Set `` PLOT=True `` to turn on visualization plots.
-
+Set ``TRAIN=False`` to run the saved policy. You can train your own policy by setting ``TRAIN=True``. Set ``PLOT=True`` to turn on visualization plots.
+
+- To show the average reward during training:
+```
+cd DRL_robot_exploration
+tensorboard --logdir=log
+```
+
 ## Cite
 
 Please cite [our paper](https://www.researchgate.net/profile/Fanfei_Chen/publication/330200308_Self-Learning_Exploration_and_Mapping_for_Mobile_Robots_via_Deep_Reinforcement_Learning/links/5d6e7ad4a6fdccf93d381d2e/Self-Learning-Exploration-and-Mapping-for-Mobile-Robots-via-Deep-Reinforcement-Learning.pdf) if you use any of this code:
@@ -68,4 +80,5 @@ Please cite [our paper](https://www.researchgate.net/profile/Fanfei_Chen/publica
 
 ## Reference
 - [DeepRL-Agents](https://github.com/awjuliani/DeepRL-Agents)
-- [DeepLearningFlappyBird](https://github.com/yenchenlin/DeepLearningFlappyBird)
+- [DeepLearningFlappyBird](https://github.com/yenchenlin/DeepLearningFlappyBird)
+- [Random Dungeon Generator](http://perplexingtech.weebly.com/random-dungeon-demo.html)

scripts/robot_simulation.py

+2 -1

@@ -23,7 +23,8 @@ def __init__(self, index_map, train, plot):
         self.map_dir = '../DungeonMaps/test'
         self.map_list = os.listdir(self.map_dir)
         self.map_number = np.size(self.map_list)
-        shuffle(self.map_list)
+        if self.mode:
+            shuffle(self.map_list)
         self.li_map = index_map
         self.global_map, self.robot_position = self.map_setup(self.map_dir + '/' + self.map_list[self.li_map])
         self.op_map = np.ones(self.global_map.shape) * 127
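
The change above makes the map order deterministic outside of training: the dungeon map list is shuffled only when `self.mode` (which appears to hold the `train` flag passed to `__init__`) is true, so evaluation visits the test maps in a fixed order. A minimal standalone sketch of the same pattern; the `MapLoader` class name and the sorted baseline order are illustrative, not taken from the repository:

```python
import os
from random import shuffle

class MapLoader:
    """Sketch: randomize map order for training, keep a reproducible order for evaluation."""

    def __init__(self, map_dir, train):
        self.mode = train                             # True -> training, False -> evaluation
        self.map_list = sorted(os.listdir(map_dir))   # deterministic baseline ordering
        if self.mode:
            shuffle(self.map_list)                    # shuffle in place only during training

# e.g. MapLoader('../DungeonMaps/test', train=False) visits maps in the same order every run
```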

scripts/tf_networks.py

+6 -6

@@ -39,13 +39,13 @@ def create_CNN(num_action):
     h_conv3_flat = tf.compat.v1.layers.flatten(h_conv3)
 
     h_fc1 = tf.compat.v1.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)
-    keep_per = tf.compat.v1.placeholder(shape=None, dtype=tf.float32)
-    hidden = tf.compat.v1.nn.dropout(h_fc1, keep_per)
+    keep_rate = tf.compat.v1.placeholder(shape=None, dtype=tf.float32)
+    hidden = tf.compat.v1.nn.dropout(h_fc1, keep_rate)
 
     # readout layer
     readout = tf.matmul(hidden, W_fc2) + b_fc2
 
-    return s, readout, keep_per
+    return s, readout, keep_rate
 
 
 def create_LSTM(num_action, num_cell, scope):
@@ -84,10 +84,10 @@ def create_LSTM(num_action, num_cell, scope):
         inputs=convFlat, cell=rnn_cell, dtype=tf.float32, initial_state=state_in, scope=scope)
     rnn = tf.reshape(rnn, shape=[-1, num_cell])
 
-    keep_per = tf.compat.v1.placeholder(shape=None, dtype=tf.float32)
-    hidden = tf.compat.v1.nn.dropout(rnn, keep_per)
+    keep_rate = tf.compat.v1.placeholder(shape=None, dtype=tf.float32)
+    hidden = tf.compat.v1.nn.dropout(rnn, keep_rate)
 
     # readout layer
     readout = tf.matmul(hidden, W_fc2) + b_fc2
 
-    return s, readout, keep_per, trainLength, batch_size, state_in, rnn_state
+    return s, readout, keep_rate, trainLength, batch_size, state_in, rnn_state
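
Both network builders expose the renamed `keep_rate` placeholder so the caller can set the dropout keep probability per `sess.run`: 1.0 for deterministic evaluation, a smaller value for the stochastic forward passes used to choose actions "by uncertainty". A minimal sketch of that mechanism, assuming the scripts run under TF2 with the v1 compatibility layer and eager execution disabled; the 4-unit input is illustrative:

```python
import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()   # the repository's tf.compat.v1 graph code expects graph mode under TF2

# one scalar placeholder controls dropout for the whole graph and is set per call via feed_dict
x = tf.compat.v1.placeholder(tf.float32, shape=[None, 4])
keep_rate = tf.compat.v1.placeholder(dtype=tf.float32, shape=None)
hidden = tf.compat.v1.nn.dropout(x, keep_rate)        # keep-probability style argument, as in create_CNN/create_LSTM

with tf.compat.v1.Session() as sess:
    batch = np.ones((2, 4), dtype=np.float32)
    # keep_rate = 1.0: dropout disabled, deterministic output (used when running the saved policy)
    print(sess.run(hidden, feed_dict={x: batch, keep_rate: 1.0}))
    # keep_rate < 1.0: random units are zeroed on each pass (used for uncertainty-driven action selection)
    print(sess.run(hidden, feed_dict={x: batch, keep_rate: 0.5}))
```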

scripts/tf_policy_cnn.py

+6 -6

@@ -44,8 +44,8 @@ def start():
     config = tf.compat.v1.ConfigProto()
     config.gpu_options.allow_growth = True
     sess = tf.compat.v1.InteractiveSession(config=config)
-    s, readout, keep_per = create_CNN(ACTIONS)
-    s_target, readout_target, keep_per_target = create_CNN(ACTIONS)
+    s, readout, keep_rate = create_CNN(ACTIONS)
+    s_target, readout_target, keep_rate_target = create_CNN(ACTIONS)
 
     # define the cost function
     a = tf.compat.v1.placeholder("float", [None, ACTIONS])
@@ -93,7 +93,7 @@ def start():
         drop_rate -= (INITIAL_RATE - FINAL_RATE) / EXPLORE
 
         # choose an action by uncertainty
-        readout_t = readout.eval(feed_dict={s: s_t, keep_per: 1-drop_rate})[0]
+        readout_t = readout.eval(feed_dict={s: s_t, keep_rate: 1-drop_rate})[0]
         readout_t[a_t_coll] = None
         a_t = np.zeros([ACTIONS])
         action_index = np.nanargmax(readout_t)
@@ -125,7 +125,7 @@ def start():
             r_batch = np.vstack(minibatch[:, 2]).flatten()
             s_j1_batch = np.vstack(minibatch[:, 3])
 
-            readout_j1_batch = readout_target.eval(feed_dict={s_target: s_j1_batch, keep_per_target: 0.2})
+            readout_j1_batch = readout_target.eval(feed_dict={s_target: s_j1_batch, keep_rate_target: 0.2})
             end_multiplier = -(np.vstack(minibatch[:, 4]).flatten() - 1)
             y_batch = r_batch + GAMMA * np.max(readout_j1_batch) * end_multiplier
 
@@ -134,7 +134,7 @@ def start():
                 y: y_batch,
                 a: a_batch,
                 s: s_j_batch,
-                keep_per: 0.2}
+                keep_rate: 0.2}
             )
             new_average_reward = np.average(total_reward[len(total_reward) - 10000:])
             writer.add_scalar('average reward', new_average_reward, step_t)
@@ -170,7 +170,7 @@ def start():
 
     while not TRAIN and not finish_all_map:
         # choose an action by policy
-        readout_t = readout.eval(feed_dict={s: s_t, keep_per: 1})[0]
+        readout_t = readout.eval(feed_dict={s: s_t, keep_rate: 1})[0]
        readout_t[a_t_coll] = None
        a_t = np.zeros([ACTIONS])
        action_index = np.nanargmax(readout_t)
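
For context, the `y_batch` computed just above the renamed feed entries is the standard DQN bootstrap target: the target network's value of the next state is discounted by `GAMMA` and zeroed for terminal transitions via `end_multiplier` (the flag in column 4 of each replay transition is assumed to be 1 when the episode ended, as the `-(flag - 1)` trick suggests). A small NumPy sketch with illustrative numbers; it takes the per-row maximum with `axis=1`, the textbook form, whereas the repository calls `np.max` over the whole batch:

```python
import numpy as np

GAMMA = 0.99                                  # illustrative discount factor

# one transition per row: reward, terminal flag, target-network Q-values for the next state
r_batch  = np.array([0.5, -1.0, 0.2])
terminal = np.array([0.0, 1.0, 0.0])          # 1 = episode ended on this transition
readout_j1_batch = np.array([[0.1, 0.7],
                             [0.3, 0.2],
                             [0.9, 0.4]])

end_multiplier = -(terminal - 1)              # 1 for non-terminal rows, 0 for terminal rows
q_next = np.max(readout_j1_batch, axis=1)     # greedy value of the next state, per row
y_batch = r_batch + GAMMA * q_next * end_multiplier

print(y_batch)                                # [1.193, -1.0, 1.091]; the terminal row keeps only its reward
```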

scripts/tf_policy_rnn.py

+6 -6

@@ -76,8 +76,8 @@ def start():
     config = tf.compat.v1.ConfigProto()
     config.gpu_options.allow_growth = True
     sess = tf.compat.v1.InteractiveSession(config=config)
-    s, readout, keep_per, tl, bs, si, rnn_state = create_LSTM(ACTIONS, h_size, 'policy')
-    s_target, readout_target, keep_per_target, \
+    s, readout, keep_rate, tl, bs, si, rnn_state = create_LSTM(ACTIONS, h_size, 'policy')
+    s_target, readout_target, keep_rate_target, \
         tl_target, bs_target, si_target, rnn_state_target = create_LSTM(ACTIONS, h_size, 'target')
 
     # define the cost function
@@ -130,7 +130,7 @@ def start():
 
         # choose an action by uncertainty
         readout_t, state1 = sess.run([readout, rnn_state],
-                                     feed_dict={s: s_t, keep_per: 1 - drop_rate, tl: 1, bs: 1, si: state})
+                                     feed_dict={s: s_t, keep_rate: 1 - drop_rate, tl: 1, bs: 1, si: state})
         readout_t = readout_t[0]
         readout_t[a_t_coll] = None
         a_t = np.zeros([ACTIONS])
@@ -165,7 +165,7 @@ def start():
             r_batch = np.vstack(trainBatch[:, 2]).flatten()
             s_j1_batch = np.vstack(trainBatch[:, 3])
 
-            readout_j1_batch = readout_target.eval(feed_dict={s_target: s_j1_batch, keep_per_target: 0.2,
+            readout_j1_batch = readout_target.eval(feed_dict={s_target: s_j1_batch, keep_rate_target: 0.2,
                                                               tl_target: trace_length, bs_target: BATCH,
                                                               si_target: state_train})[0]
             end_multiplier = -(np.vstack(trainBatch[:, 4]).flatten() - 1)
@@ -176,7 +176,7 @@ def start():
                 y: y_batch,
                 a: a_batch,
                 s: s_j_batch,
-                keep_per: 0.2,
+                keep_rate: 0.2,
                 tl: trace_length,
                 bs: BATCH,
                 si: state_train}
@@ -222,7 +222,7 @@ def start():
     while not TRAIN and not finish_all_map:
         # choose an action by uncertainty
         readout_t, state1 = sess.run([readout, rnn_state],
-                                     feed_dict={s: s_t, keep_per: 1, tl: 1, bs: 1, si: state})
+                                     feed_dict={s: s_t, keep_rate: 1, tl: 1, bs: 1, si: state})
         readout_t = readout_t[0]
         readout_t[a_t_coll] = None
         a_t = np.zeros([ACTIONS])
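
Both policy scripts mask previously colliding actions before the greedy pick: assigning `None` into the float Q-value array stores NaN, and `np.nanargmax` then ignores those entries. A small standalone sketch of that trick; the Q-values and collision indices are hypothetical:

```python
import numpy as np

readout_t = np.array([0.42, 0.91, 0.10, 0.77])   # Q-value estimates for the available actions
a_t_coll = [1, 3]                                # actions that led to collisions (hypothetical indices)

readout_t[a_t_coll] = None                       # None becomes NaN in a float array, masking those actions
action_index = np.nanargmax(readout_t)           # index of the best remaining action -> 0

a_t = np.zeros(len(readout_t))
a_t[action_index] = 1                            # one-hot action vector, as in the policy scripts
print(action_index, a_t)                         # 0 [1. 0. 0. 0.]
```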
