Commit ba5b116

Updated the Actor-critic example (#2048)
* Updated example to recent version
* resolved review comments
1 parent 9f5c555 commit ba5b116

3 files changed: 21 additions and 21 deletions


examples/rl/actor_critic_cartpole.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -34,7 +34,8 @@
 
 ### References
 
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 """
 """
@@ -55,8 +56,9 @@
 seed = 42
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
+# Adding `render_mode='human'` will show the attempts of the agent
 env = gym.make("CartPole-v0")  # Create the environment
-env.seed(seed)
+env.reset(seed=seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
 """
@@ -96,12 +98,10 @@
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -116,7 +116,7 @@
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, *_ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward
 
```
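For context on the `*_` unpacking above: newer Gym/Gymnasium releases return five values from `env.step()` (`observation, reward, terminated, truncated, info`) instead of the old four, and `env.reset()` returns `(observation, info)`. A minimal sketch of one interaction step under that assumption (gym >= 0.26 or gymnasium; not part of the committed example):

```python
import gym  # assumes gym >= 0.26; `import gymnasium as gym` behaves the same way

env = gym.make("CartPole-v0")

# Newer API: reset() takes the seed and returns (observation, info),
# which is why the updated example indexes the result with [0].
state, info = env.reset(seed=42)

action = env.action_space.sample()  # stand-in for the action sampled from the policy

# Newer API: step() returns (observation, reward, terminated, truncated, info).
# The example's `state, reward, done, *_ = env.step(action)` keeps only `terminated`
# in `done` and gathers the remaining values into a throwaway list.
state, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated
print(state, reward, done)
```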

examples/rl/ipynb/actor_critic_cartpole.ipynb

Lines changed: 9 additions & 9 deletions
```diff
@@ -45,7 +45,8 @@
 "\n",
 "### References\n",
 "\n",
-"- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
+"- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)\n",
+"- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
 "- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)\n"
 ]
 },
@@ -79,9 +80,10 @@
 "seed = 42\n",
 "gamma = 0.99  # Discount factor for past rewards\n",
 "max_steps_per_episode = 10000\n",
+"# Adding `render_mode='human'` will show the attempts of the agent\n",
 "env = gym.make(\"CartPole-v0\")  # Create the environment\n",
-"env.seed(seed)\n",
-"eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0\n"
+"env.reset(seed=seed)\n",
+"eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0"
 ]
 },
 {
@@ -148,13 +150,11 @@
 "episode_count = 0\n",
 "\n",
 "while True:  # Run until solved\n",
-"    state = env.reset()\n",
+"    state = env.reset()[0]\n",
 "    episode_reward = 0\n",
 "    with tf.GradientTape() as tape:\n",
 "        for timestep in range(1, max_steps_per_episode):\n",
-"            # env.render(); Adding this line would show the attempts\n",
-"            # of the agent in a pop up window.\n",
-"\n",
+" \n",
 "            state = ops.convert_to_tensor(state)\n",
 "            state = ops.expand_dims(state, 0)\n",
 "\n",
@@ -168,7 +168,7 @@
 "            action_probs_history.append(ops.log(action_probs[0, action]))\n",
 "\n",
 "            # Apply the sampled action in our environment\n",
-"            state, reward, done, _ = env.step(action)\n",
+"            state, reward, done, *_ = env.step(action)\n",
 "            rewards_history.append(reward)\n",
 "            episode_reward += reward\n",
 "\n",
@@ -272,7 +272,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.0"
+"version": "3.12.8"
 }
 },
 "nbformat": 4,
```

examples/rl/md/actor_critic_cartpole.md

Lines changed: 6 additions & 6 deletions
````diff
@@ -36,7 +36,8 @@ remains upright. The agent, therefore, must learn to keep the pole from falling
 
 ### References
 
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 
 
@@ -59,8 +60,9 @@ import tensorflow as tf
 seed = 42
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
+# Adding `render_mode='human'` will show the attempts of the agent
 env = gym.make("CartPole-v0")  # Create the environment
-env.seed(seed)
+env.reset(seed=seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
 ```
@@ -108,12 +110,10 @@ running_reward = 0
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -128,7 +128,7 @@ while True:  # Run until solved
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, *_ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward
 
````
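The new `render_mode='human'` comment reflects the other rendering change in recent Gym/Gymnasium: rendering is requested when the environment is constructed rather than by calling `env.render()` inside the loop, as the removed comment used to suggest. A minimal sketch under that assumption (requires a display; not part of the committed example):

```python
import gym  # assumes gym >= 0.26 or gymnasium

# Passing render_mode="human" at construction time opens a window showing each step.
env = gym.make("CartPole-v0", render_mode="human")

state, _ = env.reset(seed=42)
done = False
while not done:
    action = env.action_space.sample()  # random policy, only to drive the rendering
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
env.close()  # close the render window
```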
