From 16e2288f10d1d5a36479bcf87ae02d64282618a6 Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Wed, 5 Jul 2023 17:05:32 -0400 Subject: [PATCH 01/15] Began updating repo to use gymnasium --- .../environment_runner_batch.py | 15 +- .../environment_runners/parallel_env.py | 10 +- continual_rl/experiments/tasks/image_task.py | 2 +- .../experiments/tasks/make_atari_task.py | 2 +- .../experiments/tasks/make_minihack_task.py | 2 +- .../experiments/tasks/make_procgen_task.py | 2 +- .../experiments/tasks/minigrid_task.py | 2 +- continual_rl/utils/env_wrappers.py | 162 ++++++++---------- continual_rl/utils/utils.py | 2 +- setup.py | 13 +- 10 files changed, 104 insertions(+), 108 deletions(-) diff --git a/continual_rl/experiments/environment_runners/environment_runner_batch.py b/continual_rl/experiments/environment_runners/environment_runner_batch.py index 177445f2..f722c34f 100644 --- a/continual_rl/experiments/environment_runners/environment_runner_batch.py +++ b/continual_rl/experiments/environment_runners/environment_runner_batch.py @@ -25,7 +25,8 @@ def __init__(self, policy, num_parallel_envs, timesteps_per_collection, render_c self._parallel_env = None self._last_observations = None # To allow returning mid-episode self._last_timestep_data = None # Always stores the last thing seen, even across "dones" - self._cumulative_rewards = np.array([0 for _ in range(num_parallel_envs)], dtype=np.float) + # NOTE: np.float is deprecated in numpy 1.24.4 + self._cumulative_rewards = np.array([0 for _ in range(num_parallel_envs)], dtype=float) # Used to determine what to save off to logs and when self._observations_to_render = [] @@ -41,7 +42,8 @@ def _initialize_envs(self, env_spec, preprocessor): self._parallel_env = ParallelEnv(env_specs, self._output_dir) # Initialize the observation time-batch with n of the first observation. 
- raw_observations = self._parallel_env.reset() + results = self._parallel_env.reset() + raw_observations, infos = list(results) processed_observations = self._preprocess_raw_observations(preprocessor, raw_observations) return processed_observations @@ -50,11 +52,11 @@ def _reset_env(self, env_id): ParallelEnv doesn't readily expose manually resetting an environment, so doing that here. """ if env_id == 0: - observation = self._parallel_env.envs[0].reset() + observation, _ = self._parallel_env.envs[0].reset() else: local = self._parallel_env.locals[env_id-1] local.send(("reset", None)) - observation = local.recv() + observation, _ = local.recv() return observation @@ -119,8 +121,9 @@ def collect_data(self, task_spec): # ParallelEnv automatically resets the env and returns the new observation when a "done" occurs result = self._parallel_env.step(actions) - raw_observations, rewards, dones, infos = list(result) - + raw_observations, rewards, terminated, truncated, infos = list(result) + dones = np.logical_or(terminated, truncated) + self._total_timesteps += self._num_parallel_envs self._last_timestep_data = timestep_data processed_observations = self._preprocess_raw_observations(preprocessor, raw_observations) diff --git a/continual_rl/experiments/environment_runners/parallel_env.py b/continual_rl/experiments/environment_runners/parallel_env.py index 03036da7..1a52529d 100644 --- a/continual_rl/experiments/environment_runners/parallel_env.py +++ b/continual_rl/experiments/environment_runners/parallel_env.py @@ -90,16 +90,16 @@ def __del__(self): def reset(self): for local in self.locals: local.send(("reset", None)) - results = [self._local_env.reset()] + [local.recv() for local in self.locals] + results = zip(*[self._local_env.reset()] + [local.recv() for local in self.locals]) return results def step(self, actions): for local, action in zip(self.locals, actions[1:]): local.send(("step", action)) - obs, reward, done, info = self._local_env.step(actions[0]) - if done: 
- obs = self._local_env.reset() - results = zip(*[(obs, reward, done, info)] + [local.recv() for local in self.locals]) + obs, reward, terminated, truncated, info = self._local_env.step(actions[0]) + if terminated or truncated: + obs, _ = self._local_env.reset() + results = zip(*[(obs, reward, terminated, truncated, info)] + [local.recv() for local in self.locals]) return results def render(self): diff --git a/continual_rl/experiments/tasks/image_task.py b/continual_rl/experiments/tasks/image_task.py index 03217a9f..6fd12a1f 100644 --- a/continual_rl/experiments/tasks/image_task.py +++ b/continual_rl/experiments/tasks/image_task.py @@ -1,6 +1,6 @@ import torch import torchvision -from gym.spaces.box import Box +from gymnasium.spaces.box import Box from continual_rl.experiments.tasks.task_base import TaskBase from continual_rl.experiments.tasks.preprocessor_base import PreprocessorBase from continual_rl.utils.utils import Utils diff --git a/continual_rl/experiments/tasks/make_atari_task.py b/continual_rl/experiments/tasks/make_atari_task.py index 0b50fe63..9e11cf8d 100644 --- a/continual_rl/experiments/tasks/make_atari_task.py +++ b/continual_rl/experiments/tasks/make_atari_task.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym from continual_rl.utils.env_wrappers import ( NoopResetEnv, diff --git a/continual_rl/experiments/tasks/make_minihack_task.py b/continual_rl/experiments/tasks/make_minihack_task.py index 6a7e1194..884174f0 100644 --- a/continual_rl/experiments/tasks/make_minihack_task.py +++ b/continual_rl/experiments/tasks/make_minihack_task.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import numpy as np import os diff --git a/continual_rl/experiments/tasks/make_procgen_task.py b/continual_rl/experiments/tasks/make_procgen_task.py index b4dc70c6..0b24f345 100644 --- a/continual_rl/experiments/tasks/make_procgen_task.py +++ b/continual_rl/experiments/tasks/make_procgen_task.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym from 
.image_task import ImageTask diff --git a/continual_rl/experiments/tasks/minigrid_task.py b/continual_rl/experiments/tasks/minigrid_task.py index c7f37fc1..6687363d 100644 --- a/continual_rl/experiments/tasks/minigrid_task.py +++ b/continual_rl/experiments/tasks/minigrid_task.py @@ -1,7 +1,7 @@ import torch import numpy as np import gym_minigrid # Needed for Utils.make_env -import gym +import gymnasium as gym from continual_rl.experiments.tasks.task_base import TaskBase from continual_rl.experiments.tasks.preprocessor_base import PreprocessorBase from continual_rl.utils.utils import Utils diff --git a/continual_rl/utils/env_wrappers.py b/continual_rl/utils/env_wrappers.py index 8548aabf..e0d59eb5 100644 --- a/continual_rl/utils/env_wrappers.py +++ b/continual_rl/utils/env_wrappers.py @@ -1,17 +1,17 @@ # The MIT License -# -# Copyright (c) 2017 OpenAI (http://openai.com) -# + +# Copyright (c) 2019 Antonin Raffin + # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# + # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -21,71 +21,62 @@ # THE SOFTWARE. 
# Taken from -# https://raw.githubusercontent.com/openai/baselines/9b68103b737ac46bc201dfb3121cfa5df2127e53/baselines/common/wrappers.py -# https://raw.githubusercontent.com/openai/baselines/7c520852d9cf4eaaad326a3d548efc915dc60c10/baselines/common/atari_wrappers.py -# https://github.com/facebookresearch/torchbeast/blob/542c607cfe4adbc1967c213e8c248f29b13b64b6/torchbeast/atari_wrappers.py +# https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html # and slightly modified. import numpy as np import os os.environ.setdefault('PATH', '') from collections import deque -import gym -from gym import spaces +import gymnasium as gym +from gymnasium import spaces import torch import cv2 cv2.ocl.setUseOpenCL(False) - class NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): """Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. """ - gym.Wrapper.__init__(self, env) + super().__init__(env) self.noop_max = noop_max self.override_num_noops = None self.noop_action = 0 assert env.unwrapped.get_action_meanings()[0] == 'NOOP' def reset(self, **kwargs): - """ Do no-op action for a number of steps in [1, noop_max].""" self.env.reset(**kwargs) if self.override_num_noops is not None: noops = self.override_num_noops else: noops = self.unwrapped.np_random.integers(1, self.noop_max + 1) assert noops > 0 - obs = None + obs = np.zeros(0) + info: Dict = {} for _ in range(noops): - obs, _, done, _ = self.env.step(self.noop_action) - if done: - obs = self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) + obs, _, terminated, truncated, info = self.env.step(self.noop_action) + if terminated or truncated: + obs, info = self.env.reset(**kwargs) + return obs, info class FireResetEnv(gym.Wrapper): def __init__(self, env): """Take action on reset for environments that are fixed until firing.""" - gym.Wrapper.__init__(self, env) - assert 
env.unwrapped.get_action_meanings()[1] == 'FIRE' + super().__init__(env) + assert env.unwrapped.get_action_meanings()[1] == "FIRE" assert len(env.unwrapped.get_action_meanings()) >= 3 def reset(self, **kwargs): self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: + obs, _, terminated, truncated, _ = self.env.step(1) + if terminated or truncated: self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: + obs, _, terminated, truncated, _ = self.env.step(2) + if terminated or truncated: self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) + return obs, {} class EpisodicLifeEnv(gym.Wrapper): @@ -95,38 +86,25 @@ def __init__(self, env): This wrapper should come before any reward-modifying wrappers, so the score is maintained. """ - gym.Wrapper.__init__(self, env) + super().__init__(env) self.lives = 0 self.was_real_done = True self.real_episode_return = 0 def step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - self.real_episode_return += reward - episode_return_to_report = None - + obs, reward, terminated, truncated, info = self.env.step(action) + self.was_real_done = terminated or truncated # check current lives, make loss of life terminal, # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: + lives = self.env.unwrapped.ale.lives() # type: ignore[attr-defined] + if 0 < lives < self.lives: # for Qbert sometimes we stay in lives == 0 condition for a few frames - # so it's important to keep lives > 0, so that we only reset once + # so its important to keep lives > 0, so that we only reset once # the environment advertises done. 
- done = True - - if self.was_real_done: - episode_return_to_report = self.real_episode_return - self.real_episode_return = 0 - - # Since the consumer of the env has no ability to tell a "real" done from a fake one, put the real return - # on the info object (or a dummy placeholder to tell the caller to wait for it), but ensure we're not - # overwriting anything. - assert "episode_return" not in info, "Attempting to overwrite an existing episode return." - info["episode_return"] = episode_return_to_report - + terminated = True self.lives = lives - return obs, reward, done, info + return obs, reward, terminated, truncated, info + def reset(self, **kwargs): """Reset only when lives are exhausted. @@ -134,50 +112,58 @@ def reset(self, **kwargs): and the learner need not know about any of this behind-the-scenes. """ if self.was_real_done: - obs = self.env.reset(**kwargs) + obs, info = self.env.reset(**kwargs) else: # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) + obs, _, terminated, truncated, info = self.env.step(0) + + # The no-op step can lead to a game over, so we need to check it again + # to see if we should reset the environment and avoid the + # monitor.py `RuntimeError: Tried to step environment that needs reset` + if terminated or truncated: + obs, info = self.env.reset(**kwargs) self.lives = self.env.unwrapped.ale.lives() - return obs + return obs, info class MaxAndSkipEnv(gym.Wrapper): def __init__(self, env, skip=4): """Return only every `skip`-th frame""" - gym.Wrapper.__init__(self, env) + super().__init__(env) # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) - self._skip = skip + assert env.observation_space.dtype is not None, "No dtype specified for the observation space" + assert env.observation_space.shape is not None, "No shape defined for the observation space" + self._obs_buffer = np.zeros((2, 
*env.observation_space.shape), dtype=env.observation_space.dtype) + self._skip = skip def step(self, action): """Repeat action, sum reward, and max over last observations.""" total_reward = 0.0 - done = None + terminated = truncated = False for i in range(self._skip): - obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs - total_reward += reward + obs, reward, terminated, truncated, info = self.env.step(action) + done = terminated or truncated + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += float(reward) if done: break # Note that the observation on the done=True frame # doesn't matter max_frame = self._obs_buffer.max(axis=0) - return max_frame, total_reward, done, info - - def reset(self, **kwargs): - return self.env.reset(**kwargs) + return max_frame, total_reward, terminated, truncated, info class ClipRewardEnv(gym.RewardWrapper): def __init__(self, env): - gym.RewardWrapper.__init__(self, env) + super().__init__(env) def reward(self, reward): """Bin reward to {+1, 0, -1} by its sign.""" - return np.sign(reward) + return np.sign(float(reward)) class WarpFrame(gym.ObservationWrapper): @@ -257,32 +243,32 @@ def __init__(self, env, k): See Also: LazyFrames """ - gym.Wrapper.__init__(self, env) + super().__init__(env) self.k = k self.frames = deque([], maxlen=k) shp = env.observation_space.shape self.observation_space = spaces.Box(low=env.observation_space.low.min(), high=env.observation_space.high.max(), shape=(k, *shp), dtype=env.observation_space.dtype) - def reset(self): - ob = self.env.reset() + def reset(self, **kwargs): + observation, info = self.env.reset(**kwargs) for _ in range(self.k): - self.frames.append(ob) - return self._get_ob() + self.frames.append(observation) + return self._get_obs(), info def step(self, action): - ob, reward, done, info = self.env.step(action) - 
self.frames.append(ob) - return self._get_ob(), reward, done, info + observation, reward, terminated, truncated, info = self.env.step(action) + self.frames.append(observation) + return self._get_obs(), reward, terminated, truncated, info - def _get_ob(self): + def _get_obs(self): assert len(self.frames) == self.k return LazyFrames(list(self.frames)) class ScaledFloatFrame(gym.ObservationWrapper): def __init__(self, env): - gym.ObservationWrapper.__init__(self, env) + super().__init__(env) self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) def observation(self, observation): @@ -340,18 +326,21 @@ def to_tensor(self): class TimeLimit(gym.Wrapper): + """ + Ref: https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/wrappers/time_limit.py + """ def __init__(self, env, max_episode_steps=None): - super(TimeLimit, self).__init__(env) + super().__init__(env) self._max_episode_steps = max_episode_steps self._elapsed_steps = 0 - def step(self, ac): - observation, reward, done, info = self.env.step(ac) + def step(self, action): + observation, reward, terminated, truncated, info = self.env.step(action) self._elapsed_steps += 1 if self._elapsed_steps >= self._max_episode_steps: done = True info['TimeLimit.truncated'] = True - return observation, reward, done, info + return observation, reward, terminated, truncated, info def reset(self, **kwargs): self._elapsed_steps = 0 @@ -378,7 +367,7 @@ class ImageToPyTorch(gym.ObservationWrapper): # For now switching this to return a Tensor and calling it *before* FrameStack... 
def __init__(self, env): - super(ImageToPyTorch, self).__init__(env) + super().__init__(env) old_shape = self.observation_space.shape self.observation_space = gym.spaces.Box( low=0, @@ -405,5 +394,4 @@ def __init__(self, env, seeds): def reset(self): seed = np.random.choice(self._seeds) - self._env.seed(int(seed)) - return self._env.reset() + return self._env.reset(seed=seed) diff --git a/continual_rl/utils/utils.py b/continual_rl/utils/utils.py index d2d1618d..d43e67d4 100644 --- a/continual_rl/utils/utils.py +++ b/continual_rl/utils/utils.py @@ -1,7 +1,7 @@ import logging import tempfile import types -import gym +import gymnasium as gym import numpy as np import random import torch diff --git a/setup.py b/setup.py index 6ad31824..bc367345 100644 --- a/setup.py +++ b/setup.py @@ -8,16 +8,21 @@ author_email='snpowers@cs.cmu.edu', packages=find_packages(), py_modules=['continual_rl.available_policies', 'continual_rl.experiment_specs'], - install_requires=['setuptools==59.5.0', + install_requires=['setuptools', 'uuid', 'numpy', 'tensorboard', 'torch-ac', - 'gym[atari]<=0.25.2', - 'atari-py==0.2.5', - 'moviepy', + 'gymnasium[atari]', + 'gymnasium[accept-rom-license]', 'dotmap', 'psutil', 'opencv-python' + # NOTE: More recent versions can't seem to save single color channel + # images. This means when tensorboard goes to save images for + # videos, imageio 'ValueError: Can't write images with one + # color channel' is thrown. + 'moviepy', + 'imageio<=2.24.0' ] ) From 4b6e32177c81fb934b0745f96a6822a0492b3c7c Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Wed, 5 Jul 2023 17:45:22 -0400 Subject: [PATCH 02/15] Attempting to update tests/ and continue updating repo as bugs arise. 
--- .../environment_runners/environment_runner_batch.py | 2 ++ .../experiments/environment_runners/parallel_env.py | 10 +++++----- continual_rl/policies/play/play_policy.py | 2 +- tests/common_mocks/mock_preprocessor.py | 2 +- .../test_environment_runner_batch.py | 6 +++--- .../test_environment_runner_sync.py | 6 +++--- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/continual_rl/experiments/environment_runners/environment_runner_batch.py b/continual_rl/experiments/environment_runners/environment_runner_batch.py index f722c34f..99379eb4 100644 --- a/continual_rl/experiments/environment_runners/environment_runner_batch.py +++ b/continual_rl/experiments/environment_runners/environment_runner_batch.py @@ -44,6 +44,8 @@ def _initialize_envs(self, env_spec, preprocessor): # Initialize the observation time-batch with n of the first observation. results = self._parallel_env.reset() raw_observations, infos = list(results) + from pdb import set_trace + set_trace() processed_observations = self._preprocess_raw_observations(preprocessor, raw_observations) return processed_observations diff --git a/continual_rl/experiments/environment_runners/parallel_env.py b/continual_rl/experiments/environment_runners/parallel_env.py index 1a52529d..7157b6a0 100644 --- a/continual_rl/experiments/environment_runners/parallel_env.py +++ b/continual_rl/experiments/environment_runners/parallel_env.py @@ -25,7 +25,7 @@ from multiprocessing import Process, Pipe -import gym +import gymnasium as gym import cloudpickle from continual_rl.utils.utils import Utils @@ -41,12 +41,12 @@ def worker(conn, env_spec, output_dir): while True: cmd, data = conn.recv() if cmd == "step": - obs, reward, done, info = env.step(data) + obs, reward, terminated, truncated, info = env.step(data) if done: - obs = env.reset() - conn.send((obs, reward, done, info)) + obs, info = env.reset() + conn.send((obs, reward, terminated, truncated, info)) elif cmd == "reset": - obs = env.reset() + obs, info = 
env.reset() conn.send(obs) elif cmd == "kill": env.close() diff --git a/continual_rl/policies/play/play_policy.py b/continual_rl/policies/play/play_policy.py index b1a7e6e6..d7b2d280 100644 --- a/continual_rl/policies/play/play_policy.py +++ b/continual_rl/policies/play/play_policy.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import time from continual_rl.policies.policy_base import PolicyBase from continual_rl.policies.play.play_policy_config import PlayPolicyConfig diff --git a/tests/common_mocks/mock_preprocessor.py b/tests/common_mocks/mock_preprocessor.py index 1e78e687..b9b0371a 100644 --- a/tests/common_mocks/mock_preprocessor.py +++ b/tests/common_mocks/mock_preprocessor.py @@ -1,5 +1,5 @@ from continual_rl.experiments.tasks.preprocessor_base import PreprocessorBase -from gym.spaces.box import Box +from gymnasium.spaces.box import Box import torch diff --git a/tests/experiments/environment_runners/test_environment_runner_batch.py b/tests/experiments/environment_runners/test_environment_runner_batch.py index 76d1ecb7..e29cc83c 100644 --- a/tests/experiments/environment_runners/test_environment_runner_batch.py +++ b/tests/experiments/environment_runners/test_environment_runner_batch.py @@ -20,14 +20,14 @@ def seed(self, seed): def reset(self): self.reset_count += 1 - return np.array([0, 1, 2]) + return np.array([0, 1, 2]), {"info": "unused"} def step(self, action): self.actions_executed.append(action) observation = np.array([12, 13, 14]) reward = 1.5 - done = action == 4 # Simple way to force the done state we want - return observation, reward, done, {"info": "unused"} + terminated = action == 4 # Simple way to force the done state we want + return observation, reward, terminated, False, {"info": "unused"} def close(self): pass diff --git a/tests/experiments/environment_runners/test_environment_runner_sync.py b/tests/experiments/environment_runners/test_environment_runner_sync.py index 6b634ea3..bc2e5c07 100644 --- 
a/tests/experiments/environment_runners/test_environment_runner_sync.py +++ b/tests/experiments/environment_runners/test_environment_runner_sync.py @@ -19,14 +19,14 @@ def seed(self, seed): def reset(self): self.reset_count += 1 - return np.array([0, 1, 2]) + return np.array([0, 1, 2]), {"info": "unused"} def step(self, action): self.actions_executed.append(action) observation = np.array([12, 13, 14]) reward = 1.5 - done = action == 4 # Simple way to force the done state we want - return observation, reward, done, {"info": "unused"} + terminated = action == 4 # Simple way to force the done state we want + return observation, reward, terminated, False, {"info": "unused"} def close(self): pass From 7186d8c108744f2e6c4aa7ecf72f2a52d8983959 Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Wed, 5 Jul 2023 18:07:17 -0400 Subject: [PATCH 03/15] Updated tests/ code to work with gymnasium. Now passing all tests. --- .../environment_runners/environment_runner_batch.py | 2 -- continual_rl/experiments/environment_runners/parallel_env.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/continual_rl/experiments/environment_runners/environment_runner_batch.py b/continual_rl/experiments/environment_runners/environment_runner_batch.py index 99379eb4..f722c34f 100644 --- a/continual_rl/experiments/environment_runners/environment_runner_batch.py +++ b/continual_rl/experiments/environment_runners/environment_runner_batch.py @@ -44,8 +44,6 @@ def _initialize_envs(self, env_spec, preprocessor): # Initialize the observation time-batch with n of the first observation. 
results = self._parallel_env.reset() raw_observations, infos = list(results) - from pdb import set_trace - set_trace() processed_observations = self._preprocess_raw_observations(preprocessor, raw_observations) return processed_observations diff --git a/continual_rl/experiments/environment_runners/parallel_env.py b/continual_rl/experiments/environment_runners/parallel_env.py index 7157b6a0..d0ea640c 100644 --- a/continual_rl/experiments/environment_runners/parallel_env.py +++ b/continual_rl/experiments/environment_runners/parallel_env.py @@ -42,12 +42,12 @@ def worker(conn, env_spec, output_dir): cmd, data = conn.recv() if cmd == "step": obs, reward, terminated, truncated, info = env.step(data) - if done: + if terminated: obs, info = env.reset() conn.send((obs, reward, terminated, truncated, info)) elif cmd == "reset": obs, info = env.reset() - conn.send(obs) + conn.send((obs, info)) elif cmd == "kill": env.close() return From 7f005ab0d64c4edd1d138459a4d7d1a02daaab74 Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Thu, 6 Jul 2023 09:54:58 -0400 Subject: [PATCH 04/15] Updated setup and MIT license. --- continual_rl/utils/env_wrappers.py | 7 ++++++- setup.py | 15 ++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/continual_rl/utils/env_wrappers.py b/continual_rl/utils/env_wrappers.py index e0d59eb5..7c3fc8d3 100644 --- a/continual_rl/utils/env_wrappers.py +++ b/continual_rl/utils/env_wrappers.py @@ -1,6 +1,8 @@ # The MIT License +# Copyright (c) 2017 OpenAI (http://openai.com) # Copyright (c) 2019 Antonin Raffin +# Copyright (c) 2022 Farama Foundation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -21,7 +23,10 @@ # THE SOFTWARE. 
# Taken from -# https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html +# https://raw.githubusercontent.com/openai/baselines/9b68103b737ac46bc201dfb3121cfa5df2127e53/baselines/common/wrappers.py +# https://raw.githubusercontent.com/openai/baselines/7c520852d9cf4eaaad326a3d548efc915dc60c10/baselines/common/atari_wrappers.py +# https://github.com/facebookresearch/torchbeast/blob/542c607cfe4adbc1967c213e8c248f29b13b64b6/torchbeast/atari_wrappers.py +# https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html # and slightly modified. import numpy as np diff --git a/setup.py b/setup.py index bc367345..6f3067e8 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,13 @@ 'gymnasium[accept-rom-license]', 'dotmap', 'psutil', - 'opencv-python' - # NOTE: More recent versions can't seem to save single color channel - # images. This means when tensorboard goes to save images for - # videos, imageio 'ValueError: Can't write images with one - # color channel' is thrown. - 'moviepy', - 'imageio<=2.24.0' + 'opencv-python', + # NOTE: More recent versions of imageio can't seem to save single + # color channel images. This means when tensorboard goes to + # save images for videos, imageio 'ValueError: Can't write images + # with one color channel' is thrown. Root issue is with moviepy + # but installing older imageio version seems to get around it. + 'imageio==2.24.0', + 'moviepy' ] ) From 8e4f358e4422fc596b26b1deed7a21330cfed7c4 Mon Sep 17 00:00:00 2001 From: bpoole Date: Fri, 15 Dec 2023 13:48:05 -0500 Subject: [PATCH 05/15] Fixed NumPy warnings and added histogram tracking. 
--- .../experiments/environment_runners/environment_runner_batch.py | 2 +- continual_rl/experiments/tasks/task_base.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/continual_rl/experiments/environment_runners/environment_runner_batch.py b/continual_rl/experiments/environment_runners/environment_runner_batch.py index 177445f2..c7d53741 100644 --- a/continual_rl/experiments/environment_runners/environment_runner_batch.py +++ b/continual_rl/experiments/environment_runners/environment_runner_batch.py @@ -25,7 +25,7 @@ def __init__(self, policy, num_parallel_envs, timesteps_per_collection, render_c self._parallel_env = None self._last_observations = None # To allow returning mid-episode self._last_timestep_data = None # Always stores the last thing seen, even across "dones" - self._cumulative_rewards = np.array([0 for _ in range(num_parallel_envs)], dtype=np.float) + self._cumulative_rewards = np.array([0 for _ in range(num_parallel_envs)], dtype=np.float64) # Used to determine what to save off to logs and when self._observations_to_render = [] diff --git a/continual_rl/experiments/tasks/task_base.py b/continual_rl/experiments/tasks/task_base.py index b62f0f82..fef38737 100644 --- a/continual_rl/experiments/tasks/task_base.py +++ b/continual_rl/experiments/tasks/task_base.py @@ -67,6 +67,8 @@ def _report_log(self, summary_writer, log, run_id, default_timestep): summary_writer.add_video(tag, value, global_step=timestep) elif type == "scalar": summary_writer.add_scalar(tag, value, global_step=timestep) + elif type == "histogram": + summary_writer.add_histogram(tag, value, global_step=timestep) elif type == "image": summary_writer.add_image(tag, value, global_step=timestep) From 39cb8fc81a9590dae03889b8bcbf8f7fb87f6d31 Mon Sep 17 00:00:00 2001 From: bpoole Date: Thu, 15 Aug 2024 15:23:32 -0400 Subject: [PATCH 06/15] Removed minihack wrapper (outdated). Only closes batch runner when object is not None. 
--- .../environment_runners/environment_runner_batch.py | 3 ++- continual_rl/experiments/tasks/make_minihack_task.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/continual_rl/experiments/environment_runners/environment_runner_batch.py b/continual_rl/experiments/environment_runners/environment_runner_batch.py index c7d53741..a7768e3b 100644 --- a/continual_rl/experiments/environment_runners/environment_runner_batch.py +++ b/continual_rl/experiments/environment_runners/environment_runner_batch.py @@ -171,4 +171,5 @@ def collect_data(self, task_spec): return num_timesteps, [per_timestep_data], returns_to_report, logs_to_report def cleanup(self, task_spec): - self._parallel_env.close() + if self._parallel_env is not None: + self._parallel_env.close() diff --git a/continual_rl/experiments/tasks/make_minihack_task.py b/continual_rl/experiments/tasks/make_minihack_task.py index 6a7e1194..95d37a9f 100644 --- a/continual_rl/experiments/tasks/make_minihack_task.py +++ b/continual_rl/experiments/tasks/make_minihack_task.py @@ -73,7 +73,7 @@ def make_minihack( savedir=savedir, **kwargs, ) # each env specifies its own self._max_episode_steps - env = MiniHackMakeVecSafeWrapper(env) + # env = MiniHackMakeVecSafeWrapper(env) env = MiniHackObsWrapper(env) return env From ec712a2734e2f334be4549c7522e81bd31de50ff Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Mon, 24 Mar 2025 10:15:49 -0400 Subject: [PATCH 07/15] Removed atari-py, no longer needed. --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 6ad31824..647dd828 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,6 @@ 'tensorboard', 'torch-ac', 'gym[atari]<=0.25.2', - 'atari-py==0.2.5', 'moviepy', 'dotmap', 'psutil', From 4f93f38d4c477dd5817b264dd0d50e4d48a6c603 Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Mon, 31 Mar 2025 11:48:04 -0400 Subject: [PATCH 08/15] Added ability to set more variables for TaskBase when using ImageTask. 
--- continual_rl/experiments/tasks/image_task.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/continual_rl/experiments/tasks/image_task.py b/continual_rl/experiments/tasks/image_task.py index 03217a9f..922381a1 100644 --- a/continual_rl/experiments/tasks/image_task.py +++ b/continual_rl/experiments/tasks/image_task.py @@ -38,9 +38,11 @@ def render_episode(self, episode_observations): class ImageTask(TaskBase): def __init__(self, task_id, action_space_id, env_spec, num_timesteps, time_batch_size, eval_mode, - image_size, grayscale, continual_eval=True, resize_interp_method="INTER_AREA"): + image_size, grayscale, continual_eval=True, resize_interp_method="INTER_AREA", + continual_eval_num_returns=10, rolling_return_count=100): preprocessor = ImagePreprocessor(time_batch_size, image_size, grayscale, env_spec, resize_interp_method) dummy_env, _ = Utils.make_env(preprocessor.env_spec) super().__init__(task_id, action_space_id, preprocessor, preprocessor.env_spec, preprocessor.observation_space, - dummy_env.action_space, num_timesteps, eval_mode, continual_eval=continual_eval) + dummy_env.action_space, num_timesteps, eval_mode, continual_eval=continual_eval, + rolling_return_count=rolling_return_count, continual_eval_num_returns=continual_eval_num_returns) From d2404053596bd1be9837f8eabcc70004f8d9ef0e Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 31 Mar 2025 12:02:57 -0400 Subject: [PATCH 09/15] Update action_run_tests.yml Updated from v2 to v4 to hopefully fix workflow error. 
--- .github/workflows/action_run_tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/action_run_tests.yml b/.github/workflows/action_run_tests.yml index 993d504d..4083a1d9 100644 --- a/.github/workflows/action_run_tests.yml +++ b/.github/workflows/action_run_tests.yml @@ -15,9 +15,9 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Cache conda - uses: actions/cache@v2 + uses: actions/cache@v4 env: # Increase this value to reset cache if environment.yml has not changed CACHE_NUMBER: 0 @@ -25,7 +25,7 @@ jobs: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{hashFiles('environment.yml') }} - - uses: conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v4 with: auto-update-conda: true activate-environment: venv_continual_rl From 251a1dc3de5f85e125a2c51d31d9a9ed90c6923e Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Mon, 31 Mar 2025 12:05:55 -0400 Subject: [PATCH 10/15] Updated github workflows. --- .github/workflows/action_run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/action_run_tests.yml b/.github/workflows/action_run_tests.yml index 4083a1d9..254c0763 100644 --- a/.github/workflows/action_run_tests.yml +++ b/.github/workflows/action_run_tests.yml @@ -25,7 +25,7 @@ jobs: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{hashFiles('environment.yml') }} - - uses: conda-incubator/setup-miniconda@v4 + - uses: conda-incubator/setup-miniconda@v3 with: auto-update-conda: true activate-environment: venv_continual_rl From dd9c1ee78c3c40dd29e2cf33d75d26da70eb1841 Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Mon, 31 Mar 2025 12:11:50 -0400 Subject: [PATCH 11/15] Updated requirements to auto accept ALE ROMs for atari games. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 647dd828..9619eed0 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ 'numpy', 'tensorboard', 'torch-ac', - 'gym[atari]<=0.25.2', + 'gym[atari,accept-rom-license]<=0.25.2', 'moviepy', 'dotmap', 'psutil', From 1a7eaf1d3188766ebc8e9c8e4d98d369b34a2ca2 Mon Sep 17 00:00:00 2001 From: bpoole16 Date: Tue, 1 Apr 2025 15:20:42 -0400 Subject: [PATCH 12/15] Disabled WarpFrame in wrap_deepmind for atari as ImageTask already does this. As such, env was wrapped twice in WarpFrame. --- continual_rl/experiments/tasks/make_atari_task.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/continual_rl/experiments/tasks/make_atari_task.py b/continual_rl/experiments/tasks/make_atari_task.py index 0b50fe63..76ff5695 100644 --- a/continual_rl/experiments/tasks/make_atari_task.py +++ b/continual_rl/experiments/tasks/make_atari_task.py @@ -24,14 +24,15 @@ def make_atari(env_id, max_episode_steps=None, full_action_space=False): return env -def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): +def wrap_deepmind(env, episode_life=True, wrap=True, clip_rewards=True, frame_stack=False, scale=False): """Configure environment for DeepMind-style Atari. 
""" if episode_life: env = EpisodicLifeEnv(env) if 'FIRE' in env.unwrapped.get_action_meanings(): env = FireResetEnv(env) - env = WarpFrame(env) + if wrap: + env = WarpFrame(env) if scale: env = ScaledFloatFrame(env) if clip_rewards: @@ -52,8 +53,9 @@ def get_single_atari_task(task_id, action_space_id, env_name, num_timesteps, max env_spec=lambda: wrap_deepmind( make_atari(env_name, max_episode_steps=max_episode_steps, full_action_space=full_action_space), clip_rewards=False, # If policies need to clip rewards, they should handle it themselves - frame_stack=False, # Handled separately + frame_stack=False, # Added by image task scale=False, + wrap=False, # Added by image task ), num_timesteps=num_timesteps, time_batch_size=4, From 08564da842c2f01b85c2d4770d619e0e0802b8ea Mon Sep 17 00:00:00 2001 From: bpoole Date: Mon, 21 Apr 2025 14:13:54 -0400 Subject: [PATCH 13/15] Updated exp specs for new atari registration for gymnasium. - Updated tests for new atari registration as well. --- continual_rl/experiment_specs.py | 5 +++++ tests/common_mocks/mock_preprocessor.py | 4 ++-- .../policies/discrete_random/test_discrete_random_policy.py | 4 ++-- tests/policies/impala/test_impala_policy.py | 2 +- tests/policies/ppo/test_ppo_policy.py | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/continual_rl/experiment_specs.py b/continual_rl/experiment_specs.py index 8d91d47d..4e34a927 100644 --- a/continual_rl/experiment_specs.py +++ b/continual_rl/experiment_specs.py @@ -1,3 +1,8 @@ +import gymnasium as gym +import ale_py + +gym.register_envs(ale_py) + from continual_rl.experiments.experiment import Experiment from continual_rl.experiments.tasks.make_atari_task import get_single_atari_task from continual_rl.experiments.tasks.make_procgen_task import get_single_procgen_task diff --git a/tests/common_mocks/mock_preprocessor.py b/tests/common_mocks/mock_preprocessor.py index b9b0371a..a602bbcf 100644 --- a/tests/common_mocks/mock_preprocessor.py +++ 
b/tests/common_mocks/mock_preprocessor.py @@ -1,7 +1,7 @@ from continual_rl.experiments.tasks.preprocessor_base import PreprocessorBase from gymnasium.spaces.box import Box import torch - +import numpy as np class MockPreprocessor(PreprocessorBase): def __init__(self): @@ -9,7 +9,7 @@ def __init__(self): super().__init__(observation_space) def preprocess(self, observation): - return torch.Tensor(observation) + return torch.Tensor(np.array(observation)) def render_episode(self, episode_observations): """ diff --git a/tests/policies/discrete_random/test_discrete_random_policy.py b/tests/policies/discrete_random/test_discrete_random_policy.py index 2e096b56..09ceb4bf 100644 --- a/tests/policies/discrete_random/test_discrete_random_policy.py +++ b/tests/policies/discrete_random/test_discrete_random_policy.py @@ -16,7 +16,7 @@ def test_end_to_end_batch(self, set_tmp_directory, cleanup_experiment, request): # Arrange experiment = Experiment(tasks=[ ImageTask(task_id="some_id", action_space_id=0, - env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True) ]) @@ -45,7 +45,7 @@ def test_end_to_end_sync(self, set_tmp_directory, cleanup_experiment, request): # Arrange experiment = Experiment(tasks=[ ImageTask(task_id="end_to_end_sync", action_space_id=0, - env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True) ]) diff --git a/tests/policies/impala/test_impala_policy.py b/tests/policies/impala/test_impala_policy.py index 6ae5b09c..434732e3 100644 --- a/tests/policies/impala/test_impala_policy.py +++ b/tests/policies/impala/test_impala_policy.py @@ -22,7 +22,7 @@ def test_end_to_end_batch(self, set_tmp_directory, cleanup_experiment, request): experiment = Experiment( tasks=[ ImageTask(task_id="some_id", action_space_id=0, - 
env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True)]) config = ImpalaPolicyConfig() diff --git a/tests/policies/ppo/test_ppo_policy.py b/tests/policies/ppo/test_ppo_policy.py index 718abc52..c6e880f9 100644 --- a/tests/policies/ppo/test_ppo_policy.py +++ b/tests/policies/ppo/test_ppo_policy.py @@ -19,7 +19,7 @@ def test_end_to_end_batch(self, set_tmp_directory, cleanup_experiment, request): experiment = Experiment( tasks=[ ImageTask(task_id="end_to_end_batch", action_space_id=0, - env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True)]) config = PPOPolicyConfig() From 7bfa5b031c3ce49132b3285a0b857a7d0220daa9 Mon Sep 17 00:00:00 2001 From: bpoole Date: Tue, 22 Apr 2025 15:48:58 -0400 Subject: [PATCH 14/15] Updated reset for wrappers to support taking kwargs. 
--- continual_rl/experiments/tasks/make_minihack_task.py | 6 +++--- continual_rl/utils/env_wrappers.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/continual_rl/experiments/tasks/make_minihack_task.py b/continual_rl/experiments/tasks/make_minihack_task.py index 501ac65d..9924a1f7 100644 --- a/continual_rl/experiments/tasks/make_minihack_task.py +++ b/continual_rl/experiments/tasks/make_minihack_task.py @@ -28,11 +28,11 @@ def step(self, action: int): os.chdir(self.basedir) return x - def reset(self): + def reset(self, **kwargs): os.chdir(self.env.env._vardir) - x = self.env.reset() + x, info = self.env.reset(**kwargs) os.chdir(self.basedir) - return x + return x, info def close(self): os.chdir(self.env.env._vardir) diff --git a/continual_rl/utils/env_wrappers.py b/continual_rl/utils/env_wrappers.py index 7c3fc8d3..7f30eb9f 100644 --- a/continual_rl/utils/env_wrappers.py +++ b/continual_rl/utils/env_wrappers.py @@ -74,14 +74,14 @@ def __init__(self, env): assert len(env.unwrapped.get_action_meanings()) >= 3 def reset(self, **kwargs): - self.env.reset(**kwargs) + _, info = self.env.reset(**kwargs) obs, _, terminated, truncated, _ = self.env.step(1) if terminated or truncated: - self.env.reset(**kwargs) + _, info = self.env.reset(**kwargs) obs, _, terminated, truncated, _ = self.env.step(2) if terminated or truncated: - self.env.reset(**kwargs) - return obs, {} + _, info = self.env.reset(**kwargs) + return obs, info class EpisodicLifeEnv(gym.Wrapper): From bfc269dd26639a8c1653b957a846ca637e188037 Mon Sep 17 00:00:00 2001 From: bpoole Date: Mon, 28 Apr 2025 12:30:02 -0400 Subject: [PATCH 15/15] Reworked EpisodicLifeEnv wrapper to return was_real_done. - When agent loses all lives then was_real_done is returned as true on reset only. If true, a normal reset is done. If False, no real reset is done and the game continues. 
--- continual_rl/utils/env_wrappers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/continual_rl/utils/env_wrappers.py b/continual_rl/utils/env_wrappers.py index 7f30eb9f..ea893e29 100644 --- a/continual_rl/utils/env_wrappers.py +++ b/continual_rl/utils/env_wrappers.py @@ -108,9 +108,9 @@ def step(self, action): # the environment advertises done. terminated = True self.lives = lives + # print(self.was_real_done, terminated, lives, self.lives) return obs, reward, terminated, truncated, info - def reset(self, **kwargs): """Reset only when lives are exhausted. This way all states are still reachable even though lives are episodic, @@ -128,6 +128,7 @@ def reset(self, **kwargs): if terminated or truncated: obs, info = self.env.reset(**kwargs) self.lives = self.env.unwrapped.ale.lives() + info['was_real_done'] = self.was_real_done return obs, info