#!/usr/bin/env python
# Copyright (c) 2018, The SenseAct Authors.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import time
import copy
import numpy as np
import baselines.common.tf_util as U
import senseact.devices.create2.create2_config as create2_config
from multiprocessing import Process, Value, Manager
from baselines.trpo_mpi.trpo_mpi import learn
from baselines.ppo1.mlp_policy import MlpPolicy
from senseact.envs.create2.create2_docker_env import Create2DockerEnv
from senseact.utils import tf_set_seeds, NormalizedEnv
from helper import create_callback


def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    env = Create2DockerEnv(30, port='/dev/ttyUSB0', ir_window=20, ir_history=1,
                           obs_history=1, dt=0.045, random_state=rand_state)
    env = NormalizedEnv(env)
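    # note: the positional 30 above is assumed to be the episode length in
    # seconds, and dt=0.045 the sensorimotor cycle time; check the
    # Create2DockerEnv signature if adapting this script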

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)
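
    # policy_fn is handed to baselines' learn(), which calls it to build the
    # policy and value networks (here a 2-layer, 32-unit MLP)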

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    # Spawn plotting process
    pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    pp.start()
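
    # the Manager dict is shared with the plotting process; `write_lock` is a
    # simple flag (not a real lock) that the plotter checks before reading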

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()


def plot_create2_docker(env, batch_size, shared_returns, plot_running):
    """Helper process for visualizing the learning curve and observations.

    Args:
        env: An instance of Create2DockerEnv
        batch_size: An int representing timesteps_per_batch provided to the TRPO learn function
        shared_returns: A manager dictionary object containing `episodic_returns` and `episodic_lengths`
        plot_running: A multiprocessing Value object containing 0/1.
            1: Continue plotting, 0: Terminate plotting loop
    """
    print("Started plotting routine")
    import matplotlib.pyplot as plt
    from matplotlib import gridspec
    plt.ion()
    time.sleep(5.0)
    fig = plt.figure(figsize=(20, 7))
    gs = gridspec.GridSpec(2, 3)
    ax1 = plt.subplot(gs[0, :-1])
    ax2 = plt.subplot(gs[0, 2])
    ax3 = plt.subplot(gs[1, :])
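    # ax1: current sensors & actions, ax2: learning curve, ax3: current sensation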

    # setup main plot showing the current state
    sensor_array_type = env._sensor_comms[env._comm_name].sensor_buffer.np_array_type
    sensor_array_dim = len(sensor_array_type)
    action_array_dim = env._action_buffer.array_len
    ax1_bars = ax1.bar(list(range(sensor_array_dim + action_array_dim)),
                       [0] * (sensor_array_dim + action_array_dim),
                       tick_label=list(sensor_array_type.names) +
                                  ['requested left wheel speed', 'requested right wheel speed'])
    ax1_min_y = [create2_config.PACKET_INFO[create2_config.PACKET_NAME_TO_ID[packet]]['range'][0]
                 for packet in sensor_array_type.names]
    ax1_min_y.extend([r[0] for r in
                      create2_config.OPCODE_INFO[create2_config.OPCODE_NAME_TO_CODE[env._main_op]]['params'].values()])
    ax1_max_y = [create2_config.PACKET_INFO[create2_config.PACKET_NAME_TO_ID[packet]]['range'][1]
                 for packet in sensor_array_type.names]
    ax1_max_y.extend([r[1] for r in
                      create2_config.OPCODE_INFO[create2_config.OPCODE_NAME_TO_CODE[env._main_op]]['params'].values()])
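    # ax1_min_y / ax1_max_y collect each channel's raw range (sensor packets
    # plus the parameter ranges of env._main_op), used below to scale bar
    # heights into [-1, 1]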
    ax1.set_ylim([-1.25, 1.25])
    ax1.tick_params(axis='x', labelrotation=10, labelsize=7)
    ax1.set_title("Current Sensor & Action")
    ax1_texts = []
    for b in ax1_bars:
        ax1_texts.append(ax1.text(b.get_x() + b.get_width() / 2., 1.05 * b.get_height(), '0', ha='center', va='bottom'))

    hl11, = ax2.plot([], [])

    sensation_array_dim = env._sensation_buffer.array_len - 2
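    # the last two sensation entries are excluded from the bars; given the
    # reward read at index -2 below, they presumably hold reward and a done flag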
    ax3_bars = ax3.bar(list(range(sensation_array_dim)), [0] * sensation_array_dim)
    ax3_min_y = env.observation_space.low
    ax3_max_y = env.observation_space.high
    ax3_texts = []
    for b in ax3_bars:
        ax3_texts.append(ax3.text(b.get_x() + b.get_width() / 2., 1.05 * b.get_height(), '0', ha='center', va='bottom'))
    ax3.set_ylim([-1.25, 1.25])
    ax3.set_title("Current Sensation")

    count = 0
    old_size = len(shared_returns['episodic_returns'])
    while plot_running.value:
        sensor_buffer = env._sensor_comms[env._comm_name].sensor_buffer.read()
        action_buffer = env._action_buffer.read()
        sensation_buffer = env._sensation_buffer.read()
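        # the reads above grab the latest snapshots from the shared buffers
        # written by the SenseAct communicator and environment processes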

        for i, b in enumerate(ax1_bars):
            ax1_texts[i].remove()
            if i >= len(sensor_buffer[0][0][0]):
                raw_val = action_buffer[0][0][-1 * (len(ax1_bars) - i)]
            else:
                raw_val = sensor_buffer[0][0][0][i]
            if raw_val < 0:
                height = raw_val / abs(ax1_min_y[i])
            else:
                height = raw_val / ax1_max_y[i]
            b.set_height(max(-1.0, min(1.0, height)))
            ax1_texts[i] = ax1.text(b.get_x() + b.get_width() / 2., 1.05 * b.get_height(), "{:.2f}".format(raw_val),
                                    ha='center', va='bottom')

        for i, b in enumerate(ax3_bars):
            ax3_texts[i].remove()
            raw_val = sensation_buffer[0][0][i]
            if raw_val < 0:
                height = raw_val / abs(ax3_min_y[i])
            else:
                height = raw_val / ax3_max_y[i]
            b.set_height(max(-1.0, min(1.0, height)))
            ax3_texts[i] = ax3.text(b.get_x() + b.get_width() / 2., 1.05 * b.get_height(), "{:.2f}".format(raw_val),
                                    ha='center', va='bottom')
        ax3.set_title("Current Sensation (Reward: {:.2f})".format(sensation_buffer[0][0][-2]))

        # make a copy of the whole dict to avoid episodic_returns and
        # episodic_lengths getting out of sync while plotting
        copied_returns = copy.deepcopy(shared_returns)
        if not copied_returns['write_lock'] and len(copied_returns['episodic_returns']) > old_size:
            # plot learning curve
            returns = np.array(copied_returns['episodic_returns'])
            old_size = len(copied_returns['episodic_returns'])
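
            # every x_tick steps, show the mean return over the episodes that
            # finished within the trailing window_size_steps steps, giving a
            # simple smoothed learning curve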
            window_size_steps = 5000
            x_tick = 1000
            if copied_returns['episodic_lengths']:
                ep_lens = np.array(copied_returns['episodic_lengths'])
            else:
                ep_lens = batch_size * np.arange(len(returns))
            cum_episode_lengths = np.cumsum(ep_lens)

            if cum_episode_lengths[-1] >= x_tick:
                steps_show = np.arange(x_tick, cum_episode_lengths[-1] + 1, x_tick)
                rets = []

                for i in range(len(steps_show)):
                    rets_in_window = returns[(cum_episode_lengths > max(0, x_tick * (i + 1) - window_size_steps)) *
                                             (cum_episode_lengths < x_tick * (i + 1))]
                    if rets_in_window.any():
                        rets.append(np.mean(rets_in_window))

                hl11.set_xdata(np.arange(1, len(rets) + 1) * x_tick)
                ax2.set_xlim([x_tick, len(rets) * x_tick])
                hl11.set_ydata(rets)
                ax2.set_ylim([np.min(rets), np.max(rets) + 1])

        time.sleep(0.01)
        fig.canvas.draw()
        fig.canvas.flush_events()
        plt.tight_layout(rect=[0, 0.01, 1, 0.99])
        count += 1


if __name__ == '__main__':
    main()