12 changes: 6 additions & 6 deletions examples/rl/actor_critic_cartpole.py
@@ -34,7 +34,8 @@

### References

- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
"""
"""
@@ -55,8 +56,9 @@
seed = 42
gamma = 0.99 # Discount factor for past rewards
max_steps_per_episode = 10000
# Adding `render_mode='human'` will show the attempts of the agent
env = gym.make("CartPole-v0") # Create the environment
env.seed(seed)
env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0

"""
@@ -96,12 +98,10 @@
episode_count = 0

while True: # Run until solved
state = env.reset()
state = env.reset()[0]
episode_reward = 0
with tf.GradientTape() as tape:
for timestep in range(1, max_steps_per_episode):
# env.render(); Adding this line would show the attempts
# of the agent in a pop up window.

state = ops.convert_to_tensor(state)
state = ops.expand_dims(state, 0)
@@ -116,7 +116,7 @@
action_probs_history.append(ops.log(action_probs[0, action]))

# Apply the sampled action in our environment
state, reward, done, _ = env.step(action)
state, reward, done, *_ = env.step(action)
rewards_history.append(reward)
episode_reward += reward

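Taken together, these edits migrate the example from the legacy Gym interface to the newer Gym/Gymnasium API: `env.seed()` is gone (seeding now happens through `reset`), `reset()` returns an `(observation, info)` tuple, `step()` returns five values, and rendering is requested via `render_mode` at construction time. A minimal sketch of the new interface, assuming a recent `gym` or `gymnasium` release is installed:

```python
import gym  # or: import gymnasium as gym

# Pass render_mode="human" to gym.make to watch the agent's attempts.
env = gym.make("CartPole-v0")

# reset() now takes the seed and returns (observation, info),
# replacing the removed env.seed(seed) call.
state, info = env.reset(seed=42)

# step() now returns five values: the old `done` flag is split into
# `terminated` and `truncated`.
state, reward, terminated, truncated, info = env.step(env.action_space.sample())
done = terminated or truncated
```

The diff's `state, reward, done, *_ = env.step(action)` keeps the original variable names by folding `truncated` and `info` into `*_`, so only `terminated` ends an episode.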
18 changes: 9 additions & 9 deletions examples/rl/ipynb/actor_critic_cartpole.ipynb
@@ -45,7 +45,8 @@
"\n",
"### References\n",
"\n",
"- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
"- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)\n",
"- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
"- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)\n"
]
},
@@ -79,9 +80,10 @@
"seed = 42\n",
"gamma = 0.99 # Discount factor for past rewards\n",
"max_steps_per_episode = 10000\n",
"# Adding `render_mode='human'` will show the attempts of the agent\n",
"env = gym.make(\"CartPole-v0\") # Create the environment\n",
"env.seed(seed)\n",
"eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0\n"
"env.reset(seed=seed)\n",
"eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0"
]
},
{
@@ -148,13 +150,11 @@
"episode_count = 0\n",
"\n",
"while True: # Run until solved\n",
" state = env.reset()\n",
" state = env.reset()[0]\n",
" episode_reward = 0\n",
" with tf.GradientTape() as tape:\n",
" for timestep in range(1, max_steps_per_episode):\n",
" # env.render(); Adding this line would show the attempts\n",
" # of the agent in a pop up window.\n",
"\n",
" \n",
" state = ops.convert_to_tensor(state)\n",
" state = ops.expand_dims(state, 0)\n",
"\n",
@@ -168,7 +168,7 @@
" action_probs_history.append(ops.log(action_probs[0, action]))\n",
"\n",
" # Apply the sampled action in our environment\n",
" state, reward, done, _ = env.step(action)\n",
" state, reward, done, *_ = env.step(action)\n",
" rewards_history.append(reward)\n",
" episode_reward += reward\n",
"\n",
@@ -272,7 +272,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.12.8"
}
},
"nbformat": 4,
12 changes: 6 additions & 6 deletions examples/rl/md/actor_critic_cartpole.md
@@ -36,7 +36,8 @@ remains upright. The agent, therefore, must learn to keep the pole from falling

### References

- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)


@@ -59,8 +60,9 @@ import tensorflow as tf
seed = 42
gamma = 0.99 # Discount factor for past rewards
max_steps_per_episode = 10000
# Adding `render_mode='human'` will show the attempts of the agent
env = gym.make("CartPole-v0") # Create the environment
env.seed(seed)
env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0

```
@@ -108,12 +110,10 @@ running_reward = 0
episode_count = 0

while True: # Run until solved
state = env.reset()
state = env.reset()[0]
episode_reward = 0
with tf.GradientTape() as tape:
for timestep in range(1, max_steps_per_episode):
# env.render(); Adding this line would show the attempts
# of the agent in a pop up window.

state = ops.convert_to_tensor(state)
state = ops.expand_dims(state, 0)
@@ -128,7 +128,7 @@ while True:  # Run until solved
action_probs_history.append(ops.log(action_probs[0, action]))

# Apply the sampled action in our environment
state, reward, done, _ = env.step(action)
state, reward, done, *_ = env.step(action)
rewards_history.append(reward)
episode_reward += reward
