Commit ba5b116

Updated the Actor-critic example (#2048)
* Updated example to recent version
* resolved review comments
1 parent 9f5c555 commit ba5b116

3 files changed: 21 additions and 21 deletions


examples/rl/actor_critic_cartpole.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -34,7 +34,8 @@
 
 ### References
 
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 """
 """
@@ -55,8 +56,9 @@
 seed = 42
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
+# Adding `render_mode='human'` will show the attempts of the agent
 env = gym.make("CartPole-v0")  # Create the environment
-env.seed(seed)
+env.reset(seed=seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
 """
@@ -96,12 +98,10 @@
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -116,7 +116,7 @@
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, *_ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward
 
```
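For context on the `*_` unpacking above: newer Gym/Gymnasium releases return five values from `env.step()` (`observation, reward, terminated, truncated, info`) instead of the old four, and `env.reset()` returns `(observation, info)`. A minimal sketch of one interaction step under that assumption (gym >= 0.26 or gymnasium; not part of the committed example):

```python
import gym  # assumes gym >= 0.26; `import gymnasium as gym` behaves the same way

env = gym.make("CartPole-v0")

# Newer API: reset() takes the seed and returns (observation, info),
# which is why the updated example indexes the result with [0].
state, info = env.reset(seed=42)

action = env.action_space.sample()  # stand-in for the action sampled from the policy

# Newer API: step() returns (observation, reward, terminated, truncated, info).
# The example's `state, reward, done, *_ = env.step(action)` keeps only `terminated`
# in `done` and gathers the remaining values into a throwaway list.
state, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated
print(state, reward, done)
```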

examples/rl/ipynb/actor_critic_cartpole.ipynb

Lines changed: 9 additions & 9 deletions
```diff
@@ -45,7 +45,8 @@
 "\n",
 "### References\n",
 "\n",
-"- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
+"- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)\n",
+"- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
 "- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)\n"
 ]
 },
@@ -79,9 +80,10 @@
 "seed = 42\n",
 "gamma = 0.99  # Discount factor for past rewards\n",
 "max_steps_per_episode = 10000\n",
+"# Adding `render_mode='human'` will show the attempts of the agent\n",
 "env = gym.make(\"CartPole-v0\")  # Create the environment\n",
-"env.seed(seed)\n",
-"eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0\n"
+"env.reset(seed=seed)\n",
+"eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0"
 ]
 },
 {
@@ -148,13 +150,11 @@
 "episode_count = 0\n",
 "\n",
 "while True:  # Run until solved\n",
-"    state = env.reset()\n",
+"    state = env.reset()[0]\n",
 "    episode_reward = 0\n",
 "    with tf.GradientTape() as tape:\n",
 "        for timestep in range(1, max_steps_per_episode):\n",
-"            # env.render(); Adding this line would show the attempts\n",
-"            # of the agent in a pop up window.\n",
-"\n",
+" \n",
 "            state = ops.convert_to_tensor(state)\n",
 "            state = ops.expand_dims(state, 0)\n",
 "\n",
@@ -168,7 +168,7 @@
 "            action_probs_history.append(ops.log(action_probs[0, action]))\n",
 "\n",
 "            # Apply the sampled action in our environment\n",
-"            state, reward, done, _ = env.step(action)\n",
+"            state, reward, done, *_ = env.step(action)\n",
 "            rewards_history.append(reward)\n",
 "            episode_reward += reward\n",
 "\n",
@@ -272,7 +272,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.0"
+"version": "3.12.8"
 }
 },
 "nbformat": 4,
```

examples/rl/md/actor_critic_cartpole.md

Lines changed: 6 additions & 6 deletions
````diff
@@ -36,7 +36,8 @@ remains upright. The agent, therefore, must learn to keep the pole from falling
 
 ### References
 
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 
 
@@ -59,8 +60,9 @@ import tensorflow as tf
 seed = 42
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
+# Adding `render_mode='human'` will show the attempts of the agent
 env = gym.make("CartPole-v0")  # Create the environment
-env.seed(seed)
+env.reset(seed=seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
 ```
@@ -108,12 +110,10 @@ running_reward = 0
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -128,7 +128,7 @@ while True:  # Run until solved
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, *_ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward
 
````
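The new `render_mode='human'` comment reflects the other rendering change in recent Gym/Gymnasium: rendering is requested when the environment is constructed rather than by calling `env.render()` inside the loop, as the removed comment used to suggest. A minimal sketch under that assumption (requires a display; not part of the committed example):

```python
import gym  # assumes gym >= 0.26 or gymnasium

# Passing render_mode="human" at construction time opens a window showing each step.
env = gym.make("CartPole-v0", render_mode="human")

state, _ = env.reset(seed=42)
done = False
while not done:
    action = env.action_space.sample()  # random policy, only to drive the rendering
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
env.close()  # close the render window
```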
