diff --git a/.github/workflows/action_run_tests.yml b/.github/workflows/action_run_tests.yml index 993d504d..254c0763 100644 --- a/.github/workflows/action_run_tests.yml +++ b/.github/workflows/action_run_tests.yml @@ -15,9 +15,9 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Cache conda - uses: actions/cache@v2 + uses: actions/cache@v4 env: # Increase this value to reset cache if environment.yml has not changed CACHE_NUMBER: 0 @@ -25,7 +25,7 @@ jobs: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{hashFiles('environment.yml') }} - - uses: conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 with: auto-update-conda: true activate-environment: venv_continual_rl diff --git a/continual_rl/experiment_specs.py b/continual_rl/experiment_specs.py index 8d91d47d..4e34a927 100644 --- a/continual_rl/experiment_specs.py +++ b/continual_rl/experiment_specs.py @@ -1,3 +1,8 @@ +import gymnasium as gym +import ale_py + +gym.register_envs(ale_py) + from continual_rl.experiments.experiment import Experiment from continual_rl.experiments.tasks.make_atari_task import get_single_atari_task from continual_rl.experiments.tasks.make_procgen_task import get_single_procgen_task diff --git a/continual_rl/experiments/environment_runners/environment_runner_batch.py b/continual_rl/experiments/environment_runners/environment_runner_batch.py index 177445f2..da6b37ef 100644 --- a/continual_rl/experiments/environment_runners/environment_runner_batch.py +++ b/continual_rl/experiments/environment_runners/environment_runner_batch.py @@ -25,7 +25,8 @@ def __init__(self, policy, num_parallel_envs, timesteps_per_collection, render_c self._parallel_env = None self._last_observations = None # To allow returning mid-episode self._last_timestep_data = None # Always stores the last thing seen, even across "dones" - self._cumulative_rewards = np.array([0 for _ in range(num_parallel_envs)], dtype=np.float) + # NOTE: np.float is deprecated in numpy 1.24.4 + self._cumulative_rewards = np.array([0 for _ in range(num_parallel_envs)], dtype=float) # Used to determine what to save off to logs and when self._observations_to_render = [] @@ -41,7 +42,8 @@ def _initialize_envs(self, env_spec, preprocessor): self._parallel_env = ParallelEnv(env_specs, self._output_dir) # Initialize the observation time-batch with n of the first observation. - raw_observations = self._parallel_env.reset() + results = self._parallel_env.reset() + raw_observations, infos = list(results) processed_observations = self._preprocess_raw_observations(preprocessor, raw_observations) return processed_observations @@ -50,11 +52,11 @@ def _reset_env(self, env_id): ParallelEnv doesn't readily expose manually resetting an environment, so doing that here. """ if env_id == 0: - observation = self._parallel_env.envs[0].reset() + observation, _ = self._parallel_env.envs[0].reset() else: local = self._parallel_env.locals[env_id-1] local.send(("reset", None)) - observation = local.recv() + observation, _ = local.recv() return observation @@ -119,8 +121,9 @@ def collect_data(self, task_spec): # ParallelEnv automatically resets the env and returns the new observation when a "done" occurs result = self._parallel_env.step(actions) - raw_observations, rewards, dones, infos = list(result) - + raw_observations, rewards, terminated, truncated, infos = list(result) + dones = np.logical_or(terminated, truncated) + self._total_timesteps += self._num_parallel_envs self._last_timestep_data = timestep_data processed_observations = self._preprocess_raw_observations(preprocessor, raw_observations) @@ -171,4 +174,5 @@ def collect_data(self, task_spec): return num_timesteps, [per_timestep_data], returns_to_report, logs_to_report def cleanup(self, task_spec): - self._parallel_env.close() + if self._parallel_env is not None: + self._parallel_env.close() diff --git a/continual_rl/experiments/environment_runners/parallel_env.py b/continual_rl/experiments/environment_runners/parallel_env.py index 03036da7..d0ea640c 100644 --- a/continual_rl/experiments/environment_runners/parallel_env.py +++ b/continual_rl/experiments/environment_runners/parallel_env.py @@ -25,7 +25,7 @@ from multiprocessing import Process, Pipe -import gym +import gymnasium as gym import cloudpickle from continual_rl.utils.utils import Utils @@ -41,13 +41,13 @@ def worker(conn, env_spec, output_dir): while True: cmd, data = conn.recv() if cmd == "step": - obs, reward, done, info = env.step(data) - if done: - obs = env.reset() - conn.send((obs, reward, done, info)) + obs, reward, terminated, truncated, info = env.step(data) + if terminated: + obs, info = env.reset() + conn.send((obs, reward, terminated, truncated, info)) elif cmd == "reset": - obs = env.reset() - conn.send(obs) + obs, info = env.reset() + conn.send((obs, info)) elif cmd == "kill": env.close() return @@ -90,16 +90,16 @@ def __del__(self): def reset(self): for local in self.locals: local.send(("reset", None)) - results = [self._local_env.reset()] + [local.recv() for local in self.locals] + results = zip(*[self._local_env.reset()] + [local.recv() for local in self.locals]) return results def step(self, actions): for local, action in zip(self.locals, actions[1:]): local.send(("step", action)) - obs, reward, done, info = self._local_env.step(actions[0]) - if done: - obs = self._local_env.reset() - results = zip(*[(obs, reward, done, info)] + [local.recv() for local in self.locals]) + obs, reward, terminated, truncated, info = self._local_env.step(actions[0]) + if terminated or truncated: + obs, _ = self._local_env.reset() + results = zip(*[(obs, reward, terminated, truncated, info)] + [local.recv() for local in self.locals]) return results def render(self): diff --git a/continual_rl/experiments/tasks/image_task.py b/continual_rl/experiments/tasks/image_task.py index 03217a9f..73dabc09 100644 --- a/continual_rl/experiments/tasks/image_task.py +++ b/continual_rl/experiments/tasks/image_task.py @@ -1,6 +1,6 @@ import torch import torchvision -from gym.spaces.box import Box +from gymnasium.spaces.box import Box from continual_rl.experiments.tasks.task_base import TaskBase from continual_rl.experiments.tasks.preprocessor_base import PreprocessorBase from continual_rl.utils.utils import Utils @@ -38,9 +38,11 @@ def render_episode(self, episode_observations): class ImageTask(TaskBase): def __init__(self, task_id, action_space_id, env_spec, num_timesteps, time_batch_size, eval_mode, - image_size, grayscale, continual_eval=True, resize_interp_method="INTER_AREA"): + image_size, grayscale, continual_eval=True, resize_interp_method="INTER_AREA", + continual_eval_num_returns=10, rolling_return_count=100): preprocessor = ImagePreprocessor(time_batch_size, image_size, grayscale, env_spec, resize_interp_method) dummy_env, _ = Utils.make_env(preprocessor.env_spec) super().__init__(task_id, action_space_id, preprocessor, preprocessor.env_spec, preprocessor.observation_space, - dummy_env.action_space, num_timesteps, eval_mode, continual_eval=continual_eval) + dummy_env.action_space, num_timesteps, eval_mode, continual_eval=continual_eval, + rolling_return_count=rolling_return_count, continual_eval_num_returns=continual_eval_num_returns) diff --git a/continual_rl/experiments/tasks/make_atari_task.py b/continual_rl/experiments/tasks/make_atari_task.py index 0b50fe63..f75848fb 100644 --- a/continual_rl/experiments/tasks/make_atari_task.py +++ b/continual_rl/experiments/tasks/make_atari_task.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym from continual_rl.utils.env_wrappers import ( NoopResetEnv, @@ -24,14 +24,15 @@ def make_atari(env_id, max_episode_steps=None, full_action_space=False): return env -def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): +def wrap_deepmind(env, episode_life=True, wrap=True, clip_rewards=True, frame_stack=False, scale=False): """Configure environment for DeepMind-style Atari. """ if episode_life: env = EpisodicLifeEnv(env) if 'FIRE' in env.unwrapped.get_action_meanings(): env = FireResetEnv(env) - env = WarpFrame(env) + if wrap: + env = WarpFrame(env) if scale: env = ScaledFloatFrame(env) if clip_rewards: @@ -52,8 +53,9 @@ def get_single_atari_task(task_id, action_space_id, env_name, num_timesteps, max env_spec=lambda: wrap_deepmind( make_atari(env_name, max_episode_steps=max_episode_steps, full_action_space=full_action_space), clip_rewards=False, # If policies need to clip rewards, they should handle it themselves - frame_stack=False, # Handled separately + frame_stack=False, # Added by image task scale=False, + wrap=False, # Added by image task ), num_timesteps=num_timesteps, time_batch_size=4, diff --git a/continual_rl/experiments/tasks/make_minihack_task.py b/continual_rl/experiments/tasks/make_minihack_task.py index 6a7e1194..9924a1f7 100644 --- a/continual_rl/experiments/tasks/make_minihack_task.py +++ b/continual_rl/experiments/tasks/make_minihack_task.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import numpy as np import os @@ -28,11 +28,11 @@ def step(self, action: int): os.chdir(self.basedir) return x - def reset(self): + def reset(self, **kwargs): os.chdir(self.env.env._vardir) - x = self.env.reset() + x, info = self.env.reset(**kwargs) os.chdir(self.basedir) - return x + return x, info def close(self): os.chdir(self.env.env._vardir) @@ -73,7 +73,7 @@ def make_minihack( savedir=savedir, **kwargs, ) # each env specifies its own self._max_episode_steps - env = MiniHackMakeVecSafeWrapper(env) + # env = MiniHackMakeVecSafeWrapper(env) env = MiniHackObsWrapper(env) return env diff --git a/continual_rl/experiments/tasks/make_procgen_task.py b/continual_rl/experiments/tasks/make_procgen_task.py index b4dc70c6..0b24f345 100644 --- a/continual_rl/experiments/tasks/make_procgen_task.py +++ b/continual_rl/experiments/tasks/make_procgen_task.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym from .image_task import ImageTask diff --git a/continual_rl/experiments/tasks/minigrid_task.py b/continual_rl/experiments/tasks/minigrid_task.py index c7f37fc1..6687363d 100644 --- a/continual_rl/experiments/tasks/minigrid_task.py +++ b/continual_rl/experiments/tasks/minigrid_task.py @@ -1,7 +1,7 @@ import torch import numpy as np import gym_minigrid # Needed for Utils.make_env -import gym +import gymnasium as gym from continual_rl.experiments.tasks.task_base import TaskBase from continual_rl.experiments.tasks.preprocessor_base import PreprocessorBase from continual_rl.utils.utils import Utils diff --git a/continual_rl/experiments/tasks/task_base.py b/continual_rl/experiments/tasks/task_base.py index b62f0f82..fef38737 100644 --- a/continual_rl/experiments/tasks/task_base.py +++ b/continual_rl/experiments/tasks/task_base.py @@ -67,6 +67,8 @@ def _report_log(self, summary_writer, log, run_id, default_timestep): summary_writer.add_video(tag, value, global_step=timestep) elif type == "scalar": summary_writer.add_scalar(tag, value, global_step=timestep) + elif type == "histogram": + summary_writer.add_histogram(tag, value, global_step=timestep) elif type == "image": summary_writer.add_image(tag, value, global_step=timestep) diff --git a/continual_rl/policies/play/play_policy.py b/continual_rl/policies/play/play_policy.py index b1a7e6e6..d7b2d280 100644 --- a/continual_rl/policies/play/play_policy.py +++ b/continual_rl/policies/play/play_policy.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import time from continual_rl.policies.policy_base import PolicyBase from continual_rl.policies.play.play_policy_config import PlayPolicyConfig diff --git a/continual_rl/utils/env_wrappers.py b/continual_rl/utils/env_wrappers.py index 8548aabf..ea893e29 100644 --- a/continual_rl/utils/env_wrappers.py +++ b/continual_rl/utils/env_wrappers.py @@ -1,17 +1,19 @@ # The MIT License -# + # Copyright (c) 2017 OpenAI (http://openai.com) -# +# Copyright (c) 2019 Antonin Raffin +# Copyright (c) 2022 Farama Foundation + # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# + # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -21,71 +23,65 @@ # THE SOFTWARE. # Taken from -# https://raw.githubusercontent.com/openai/baselines/9b68103b737ac46bc201dfb3121cfa5df2127e53/baselines/common/wrappers.py -# https://raw.githubusercontent.com/openai/baselines/7c520852d9cf4eaaad326a3d548efc915dc60c10/baselines/common/atari_wrappers.py -# https://github.com/facebookresearch/torchbeast/blob/542c607cfe4adbc1967c213e8c248f29b13b64b6/torchbeast/atari_wrappers.py +# https://raw.githubusercontent.com/openai/baselines/9b68103b737ac46bc201dfb3121cfa5df2127e53/baselines/common/wrappers.py +# https://raw.githubusercontent.com/openai/baselines/7c520852d9cf4eaaad326a3d548efc915dc60c10/baselines/common/atari_wrappers.py +# https://github.com/facebookresearch/torchbeast/blob/542c607cfe4adbc1967c213e8c248f29b13b64b6/torchbeast/atari_wrappers.py +# https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html # and slightly modified. import numpy as np import os os.environ.setdefault('PATH', '') from collections import deque -import gym -from gym import spaces +import gymnasium as gym +from gymnasium import spaces import torch import cv2 cv2.ocl.setUseOpenCL(False) - class NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): """Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. """ - gym.Wrapper.__init__(self, env) + super().__init__(env) self.noop_max = noop_max self.override_num_noops = None self.noop_action = 0 assert env.unwrapped.get_action_meanings()[0] == 'NOOP' def reset(self, **kwargs): - """ Do no-op action for a number of steps in [1, noop_max].""" self.env.reset(**kwargs) if self.override_num_noops is not None: noops = self.override_num_noops else: noops = self.unwrapped.np_random.integers(1, self.noop_max + 1) assert noops > 0 - obs = None + obs = np.zeros(0) + info: Dict = {} for _ in range(noops): - obs, _, done, _ = self.env.step(self.noop_action) - if done: - obs = self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) + obs, _, terminated, truncated, info = self.env.step(self.noop_action) + if terminated or truncated: + obs, info = self.env.reset(**kwargs) + return obs, info class FireResetEnv(gym.Wrapper): def __init__(self, env): """Take action on reset for environments that are fixed until firing.""" - gym.Wrapper.__init__(self, env) - assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + super().__init__(env) + assert env.unwrapped.get_action_meanings()[1] == "FIRE" assert len(env.unwrapped.get_action_meanings()) >= 3 def reset(self, **kwargs): - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: - self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) + _, info = self.env.reset(**kwargs) + obs, _, terminated, truncated, _ = self.env.step(1) + if terminated or truncated: + _, info = self.env.reset(**kwargs) + obs, _, terminated, truncated, _ = self.env.step(2) + if terminated or truncated: + _, info = self.env.reset(**kwargs) + return obs, info class EpisodicLifeEnv(gym.Wrapper): @@ -95,38 +91,25 @@ def __init__(self, env): This wrapper should come before any reward-modifying wrappers, so the score is maintained. """ - gym.Wrapper.__init__(self, env) + super().__init__(env) self.lives = 0 self.was_real_done = True self.real_episode_return = 0 def step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - self.real_episode_return += reward - episode_return_to_report = None - + obs, reward, terminated, truncated, info = self.env.step(action) + self.was_real_done = terminated or truncated # check current lives, make loss of life terminal, # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: + lives = self.env.unwrapped.ale.lives() # type: ignore[attr-defined] + if 0 < lives < self.lives: # for Qbert sometimes we stay in lives == 0 condition for a few frames - # so it's important to keep lives > 0, so that we only reset once + # so its important to keep lives > 0, so that we only reset once # the environment advertises done. - done = True - - if self.was_real_done: - episode_return_to_report = self.real_episode_return - self.real_episode_return = 0 - - # Since the consumer of the env has no ability to tell a "real" done from a fake one, put the real return - # on the info object (or a dummy placeholder to tell the caller to wait for it), but ensure we're not - # overwriting anything. - assert "episode_return" not in info, "Attempting to overwrite an existing episode return." - info["episode_return"] = episode_return_to_report - + terminated = True self.lives = lives - return obs, reward, done, info + # print(self.was_real_done, terminated, lives, self.lives) + return obs, reward, terminated, truncated, info def reset(self, **kwargs): """Reset only when lives are exhausted. @@ -134,50 +117,59 @@ def reset(self, **kwargs): and the learner need not know about any of this behind-the-scenes. """ if self.was_real_done: - obs = self.env.reset(**kwargs) + obs, info = self.env.reset(**kwargs) else: # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) + obs, _, terminated, truncated, info = self.env.step(0) + + # The no-op step can lead to a game over, so we need to check it again + # to see if we should reset the environment and avoid the + # monitor.py `RuntimeError: Tried to step environment that needs reset` + if terminated or truncated: + obs, info = self.env.reset(**kwargs) self.lives = self.env.unwrapped.ale.lives() - return obs + info['was_real_done'] = self.was_real_done + return obs, info class MaxAndSkipEnv(gym.Wrapper): def __init__(self, env, skip=4): """Return only every `skip`-th frame""" - gym.Wrapper.__init__(self, env) + super().__init__(env) # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) - self._skip = skip + assert env.observation_space.dtype is not None, "No dtype specified for the observation space" + assert env.observation_space.shape is not None, "No shape defined for the observation space" + self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype) + self._skip = skip def step(self, action): """Repeat action, sum reward, and max over last observations.""" total_reward = 0.0 - done = None + terminated = truncated = False for i in range(self._skip): - obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs - total_reward += reward + obs, reward, terminated, truncated, info = self.env.step(action) + done = terminated or truncated + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += float(reward) if done: break # Note that the observation on the done=True frame # doesn't matter max_frame = self._obs_buffer.max(axis=0) - return max_frame, total_reward, done, info - - def reset(self, **kwargs): - return self.env.reset(**kwargs) + return max_frame, total_reward, terminated, truncated, info class ClipRewardEnv(gym.RewardWrapper): def __init__(self, env): - gym.RewardWrapper.__init__(self, env) + super().__init__(env) def reward(self, reward): """Bin reward to {+1, 0, -1} by its sign.""" - return np.sign(reward) + return np.sign(float(reward)) class WarpFrame(gym.ObservationWrapper): @@ -257,32 +249,32 @@ def __init__(self, env, k): See Also: LazyFrames """ - gym.Wrapper.__init__(self, env) + super().__init__(env) self.k = k self.frames = deque([], maxlen=k) shp = env.observation_space.shape self.observation_space = spaces.Box(low=env.observation_space.low.min(), high=env.observation_space.high.max(), shape=(k, *shp), dtype=env.observation_space.dtype) - def reset(self): - ob = self.env.reset() + def reset(self, **kwargs): + observation, info = self.env.reset(**kwargs) for _ in range(self.k): - self.frames.append(ob) - return self._get_ob() + self.frames.append(observation) + return self._get_obs(), info def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) - return self._get_ob(), reward, done, info + observation, reward, terminated, truncated, info = self.env.step(action) + self.frames.append(observation) + return self._get_obs(), reward, terminated, truncated, info - def _get_ob(self): + def _get_obs(self): assert len(self.frames) == self.k return LazyFrames(list(self.frames)) class ScaledFloatFrame(gym.ObservationWrapper): def __init__(self, env): - gym.ObservationWrapper.__init__(self, env) + super().__init__(env) self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) def observation(self, observation): @@ -340,18 +332,21 @@ def to_tensor(self): class TimeLimit(gym.Wrapper): + """ + Ref: https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/wrappers/time_limit.py + """ def __init__(self, env, max_episode_steps=None): - super(TimeLimit, self).__init__(env) + super().__init__(env) self._max_episode_steps = max_episode_steps self._elapsed_steps = 0 - def step(self, ac): - observation, reward, done, info = self.env.step(ac) + def step(self, action): + observation, reward, terminated, truncated, info = self.env.step(action) self._elapsed_steps += 1 if self._elapsed_steps >= self._max_episode_steps: done = True info['TimeLimit.truncated'] = True - return observation, reward, done, info + return observation, reward, terminated, truncated, info def reset(self, **kwargs): self._elapsed_steps = 0 @@ -378,7 +373,7 @@ class ImageToPyTorch(gym.ObservationWrapper): # For now switching this to return a Tensor and calling it *before* FrameStack... def __init__(self, env): - super(ImageToPyTorch, self).__init__(env) + super().__init__(env) old_shape = self.observation_space.shape self.observation_space = gym.spaces.Box( low=0, @@ -405,5 +400,4 @@ def __init__(self, env, seeds): def reset(self): seed = np.random.choice(self._seeds) - self._env.seed(int(seed)) - return self._env.reset() + return self._env.reset(seed=seed) diff --git a/continual_rl/utils/utils.py b/continual_rl/utils/utils.py index d2d1618d..d43e67d4 100644 --- a/continual_rl/utils/utils.py +++ b/continual_rl/utils/utils.py @@ -1,7 +1,7 @@ import logging import tempfile import types -import gym +import gymnasium as gym import numpy as np import random import torch diff --git a/setup.py b/setup.py index 6ad31824..6f3067e8 100644 --- a/setup.py +++ b/setup.py @@ -8,16 +8,22 @@ author_email='snpowers@cs.cmu.edu', packages=find_packages(), py_modules=['continual_rl.available_policies', 'continual_rl.experiment_specs'], - install_requires=['setuptools==59.5.0', + install_requires=['setuptools', 'uuid', 'numpy', 'tensorboard', 'torch-ac', - 'gym[atari]<=0.25.2', - 'atari-py==0.2.5', - 'moviepy', + 'gymnasium[atari]', + 'gymnasium[accept-rom-license]', 'dotmap', 'psutil', - 'opencv-python' + 'opencv-python', + # NOTE: More recent versions of imageio can't seem to save single + # color channel images. This means when tensorboard goes to + # save images for videos, imageio 'ValueError: Can't write images + # with one color channel' is thrown. Root issue is with moviepy + # but installing older imageio version seems to get around it. + 'imageio==2.24.0', + 'moviepy' ] ) diff --git a/tests/common_mocks/mock_preprocessor.py b/tests/common_mocks/mock_preprocessor.py index 1e78e687..a602bbcf 100644 --- a/tests/common_mocks/mock_preprocessor.py +++ b/tests/common_mocks/mock_preprocessor.py @@ -1,7 +1,7 @@ from continual_rl.experiments.tasks.preprocessor_base import PreprocessorBase -from gym.spaces.box import Box +from gymnasium.spaces.box import Box import torch - +import numpy as np class MockPreprocessor(PreprocessorBase): def __init__(self): @@ -9,7 +9,7 @@ def __init__(self): super().__init__(observation_space) def preprocess(self, observation): - return torch.Tensor(observation) + return torch.Tensor(np.array(observation)) def render_episode(self, episode_observations): """ diff --git a/tests/experiments/environment_runners/test_environment_runner_batch.py b/tests/experiments/environment_runners/test_environment_runner_batch.py index 76d1ecb7..e29cc83c 100644 --- a/tests/experiments/environment_runners/test_environment_runner_batch.py +++ b/tests/experiments/environment_runners/test_environment_runner_batch.py @@ -20,14 +20,14 @@ def seed(self, seed): def reset(self): self.reset_count += 1 - return np.array([0, 1, 2]) + return np.array([0, 1, 2]), {"info": "unused"} def step(self, action): self.actions_executed.append(action) observation = np.array([12, 13, 14]) reward = 1.5 - done = action == 4 # Simple way to force the done state we want - return observation, reward, done, {"info": "unused"} + terminated = action == 4 # Simple way to force the done state we want + return observation, reward, terminated, False, {"info": "unused"} def close(self): pass diff --git a/tests/experiments/environment_runners/test_environment_runner_sync.py b/tests/experiments/environment_runners/test_environment_runner_sync.py index 6b634ea3..bc2e5c07 100644 --- a/tests/experiments/environment_runners/test_environment_runner_sync.py +++ b/tests/experiments/environment_runners/test_environment_runner_sync.py @@ -19,14 +19,14 @@ def seed(self, seed): def reset(self): self.reset_count += 1 - return np.array([0, 1, 2]) + return np.array([0, 1, 2]), {"info": "unused"} def step(self, action): self.actions_executed.append(action) observation = np.array([12, 13, 14]) reward = 1.5 - done = action == 4 # Simple way to force the done state we want - return observation, reward, done, {"info": "unused"} + terminated = action == 4 # Simple way to force the done state we want + return observation, reward, terminated, False, {"info": "unused"} def close(self): pass diff --git a/tests/policies/discrete_random/test_discrete_random_policy.py b/tests/policies/discrete_random/test_discrete_random_policy.py index 2e096b56..09ceb4bf 100644 --- a/tests/policies/discrete_random/test_discrete_random_policy.py +++ b/tests/policies/discrete_random/test_discrete_random_policy.py @@ -16,7 +16,7 @@ def test_end_to_end_batch(self, set_tmp_directory, cleanup_experiment, request): # Arrange experiment = Experiment(tasks=[ ImageTask(task_id="some_id", action_space_id=0, - env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True) ]) @@ -45,7 +45,7 @@ def test_end_to_end_sync(self, set_tmp_directory, cleanup_experiment, request): # Arrange experiment = Experiment(tasks=[ ImageTask(task_id="end_to_end_sync", action_space_id=0, - env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True) ]) diff --git a/tests/policies/impala/test_impala_policy.py b/tests/policies/impala/test_impala_policy.py index 6ae5b09c..434732e3 100644 --- a/tests/policies/impala/test_impala_policy.py +++ b/tests/policies/impala/test_impala_policy.py @@ -22,7 +22,7 @@ def test_end_to_end_batch(self, set_tmp_directory, cleanup_experiment, request): experiment = Experiment( tasks=[ ImageTask(task_id="some_id", action_space_id=0, - env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True)]) config = ImpalaPolicyConfig() diff --git a/tests/policies/ppo/test_ppo_policy.py b/tests/policies/ppo/test_ppo_policy.py index 718abc52..c6e880f9 100644 --- a/tests/policies/ppo/test_ppo_policy.py +++ b/tests/policies/ppo/test_ppo_policy.py @@ -19,7 +19,7 @@ def test_end_to_end_batch(self, set_tmp_directory, cleanup_experiment, request): experiment = Experiment( tasks=[ ImageTask(task_id="end_to_end_batch", action_space_id=0, - env_spec='BreakoutDeterministic-v4', + env_spec='ale_py:BreakoutDeterministic-v4', num_timesteps=10, time_batch_size=4, eval_mode=False, image_size=[84, 84], grayscale=True)]) config = PPOPolicyConfig()