Updated the Actor-critic example #2048


Merged
12 changes: 6 additions & 6 deletions examples/rl/actor_critic_cartpole.py
@@ -34,7 +34,8 @@
 ### References
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 """
 """
@@ -55,8 +56,9 @@
 seed = 42
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
+# Adding `render_mode='human'` will show the attempts of the agent
 env = gym.make("CartPole-v0")  # Create the environment
-env.seed(seed)
+env.reset(seed=seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
 """
@@ -96,12 +98,10 @@
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -116,7 +116,7 @@
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, *_ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward
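For readers following the diff: the switch from `env.seed(seed)` to `env.reset(seed=seed)`, and from `env.reset()` to `env.reset()[0]`, tracks the newer Gym/Gymnasium API, in which seeding happens through `reset()` and `reset()` returns an `(observation, info)` pair. A minimal sketch of that pattern (not part of the PR; it assumes a recent Gymnasium or `gym >= 0.26` install):

```python
import gymnasium as gym  # assumption: Gymnasium installed; gym >= 0.26 exposes the same reset API
import numpy as np

seed = 42
env = gym.make("CartPole-v0")

# Seeding is done through reset(); the old env.seed() method no longer exists.
state, info = env.reset(seed=seed)  # reset() returns (observation, info)

print(state.shape)  # (4,): cart position, cart velocity, pole angle, pole angular velocity
print(state.dtype)  # float32
eps = np.finfo(np.float32).eps.item()  # same numerical-stability constant used in the example
```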

18 changes: 9 additions & 9 deletions examples/rl/ipynb/actor_critic_cartpole.ipynb
@@ -45,7 +45,8 @@
 "\n",
 "### References\n",
 "\n",
-"- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
+"- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)\n",
+"- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
 "- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)\n"
 ]
 },
@@ -79,9 +80,10 @@
 "seed = 42\n",
 "gamma = 0.99  # Discount factor for past rewards\n",
 "max_steps_per_episode = 10000\n",
+"# Adding `render_mode='human'` will show the attempts of the agent\n",
 "env = gym.make(\"CartPole-v0\")  # Create the environment\n",
-"env.seed(seed)\n",
-"eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0\n"
+"env.reset(seed=seed)\n",
+"eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0"
 ]
 },
 {
@@ -148,13 +150,11 @@
 "episode_count = 0\n",
 "\n",
 "while True:  # Run until solved\n",
-"    state = env.reset()\n",
+"    state = env.reset()[0]\n",
 "    episode_reward = 0\n",
 "    with tf.GradientTape() as tape:\n",
 "        for timestep in range(1, max_steps_per_episode):\n",
-"            # env.render(); Adding this line would show the attempts\n",
-"            # of the agent in a pop up window.\n",
-"\n",
+"            \n",
 "            state = ops.convert_to_tensor(state)\n",
 "            state = ops.expand_dims(state, 0)\n",
 "\n",
@@ -168,7 +168,7 @@
 "            action_probs_history.append(ops.log(action_probs[0, action]))\n",
 "\n",
 "            # Apply the sampled action in our environment\n",
-"            state, reward, done, _ = env.step(action)\n",
+"            state, reward, done, *_ = env.step(action)\n",
 "            rewards_history.append(reward)\n",
 "            episode_reward += reward\n",
 "\n",
@@ -272,7 +272,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.0"
+"version": "3.12.8"
 }
 },
 "nbformat": 4,
12 changes: 6 additions & 6 deletions examples/rl/md/actor_critic_cartpole.md
@@ -36,7 +36,8 @@ remains upright. The agent, therefore, must learn to keep the pole from falling
 
 ### References
 
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 
 
@@ -59,8 +60,9 @@ import tensorflow as tf
 seed = 42
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
+# Adding `render_mode='human'` will show the attempts of the agent
 env = gym.make("CartPole-v0")  # Create the environment
-env.seed(seed)
+env.reset(seed=seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
 ```
@@ -108,12 +110,10 @@ running_reward = 0
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -128,7 +128,7 @@ while True:  # Run until solved
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, *_ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward
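Finally, the comment added in each file mentions `render_mode='human'`. In the newer API, rendering is requested when the environment is constructed rather than by calling `env.render()` inside the loop, which is why the old in-loop render comment is dropped. A minimal illustrative sketch (again assuming Gymnasium; not part of the PR):

```python
import gymnasium as gym  # assumption: Gymnasium / gym >= 0.26 rendering API

# With render_mode="human", a window opens and frames are drawn automatically
# on each step; no explicit env.render() call is needed inside the loop.
env = gym.make("CartPole-v0", render_mode="human")
state, info = env.reset(seed=42)

for _ in range(200):
    state, reward, terminated, truncated, info = env.step(env.action_space.sample())
    if terminated or truncated:
        state, info = env.reset()

env.close()
```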