Commit d5bfb7b
[RLlib] Preparatory PR for multi-agent multi-GPU learner (alpha-star style) #3 (#21652)
Authored by sven1977 on Jan 25, 2022 · 1 parent: b2cd123
Showing 43 changed files with 375 additions and 281 deletions.

45 changes: 26 additions & 19 deletions rllib/BUILD
@@ -81,7 +81,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_cartpole_a2c_fake_gpus",
+    name = "learning_tests_cartpole_a2c_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
     size = "large",
@@ -126,15 +126,22 @@ py_test(
 
 # APPO
 py_test(
-    name = "learning_tests_cartpole_appo",
+    name = "learning_tests_cartpole_appo_no_vtrace",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
     size = "large",
     srcs = ["tests/run_regression_tests.py"],
-    data = [
-        "tuned_examples/ppo/cartpole-appo.yaml",
-        "tuned_examples/ppo/cartpole-appo-vtrace.yaml"
-    ],
+    data = ["tuned_examples/ppo/cartpole-appo.yaml"],
     args = ["--yaml-dir=tuned_examples/ppo"]
 )
 
+py_test(
+    name = "learning_tests_cartpole_appo_vtrace",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/ppo/cartpole-appo-vtrace.yaml"],
+    args = ["--yaml-dir=tuned_examples/ppo"]
+)
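
For orientation: each of these targets simply points tests/run_regression_tests.py at one yaml file under tuned_examples/ppo. A rough sketch of what a single run amounts to (hypothetical stand-in; the real script adds argument parsing and pass/fail checks on the learning results):

    import yaml
    import ray
    from ray import tune

    # Load one tuned example (maps experiment name -> experiment spec).
    with open("tuned_examples/ppo/cartpole-appo-vtrace.yaml") as f:
        experiments = yaml.safe_load(f)

    ray.init()
    # Runs until each experiment's stop criteria are met (or errors out).
    tune.run_experiments(experiments)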

@@ -151,7 +158,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_frozenlake_appo",
+    name = "learning_tests_frozenlake_appo",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
     size = "large",
@@ -161,7 +168,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_cartpole_appo_fake_gpus",
+    name = "learning_tests_cartpole_appo_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
     size = "large",
@@ -208,7 +215,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_pendulum_ddpg_fake_gpus",
+    name = "learning_tests_pendulum_ddpg_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "fake_gpus"],
     size = "large",
@@ -263,7 +270,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_cartpole_dqn_fake_gpus",
+    name = "learning_tests_cartpole_dqn_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
     size = "large",
@@ -286,7 +293,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_cartpole_simpleq_fake_gpus",
+    name = "learning_tests_cartpole_simpleq_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
     size = "medium",
@@ -318,7 +325,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_cartpole_impala_fake_gpus",
+    name = "learning_tests_cartpole_impala_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
     size = "large",
@@ -352,7 +359,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_cartpole_pg_fake_gpus",
+    name = "learning_tests_cartpole_pg_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
     size = "large",
@@ -403,7 +410,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_cartpole_ppo_fake_gpus",
+    name = "learning_tests_cartpole_ppo_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
     size = "large",
@@ -455,7 +462,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_stateless_cartpole_r2d2_fake_gpus",
+    name = "learning_tests_stateless_cartpole_r2d2_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "fake_gpus"],
     size = "large",
@@ -506,7 +513,7 @@ py_test(
 )
 
 py_test(
-    name = "learning_pendulum_sac_fake_gpus",
+    name = "learning_tests_pendulum_sac_fake_gpus",
     main = "tests/run_regression_tests.py",
     tags = ["team:ml", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "fake_gpus"],
     size = "large",
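
These *_fake_gpus targets exercise RLlib's multi-GPU code path on machines without physical GPUs. A sketch of the config pattern the underlying yaml files rely on (key names from the trainer config of this era; values illustrative):

    config = {
        "num_gpus": 2,       # pretend two GPUs are available ...
        "_fake_gpus": True,  # ... but back them with CPU-based tower copies
    }
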
@@ -845,7 +852,7 @@ py_test(
         "--env", "Pendulum-v1",
         "--run", "APEX_DDPG",
         "--stop", "'{\"training_iteration\": 1}'",
-        "--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\"}'",
+        "--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_time_s_per_reporting\": 1, \"batch_mode\": \"complete_episodes\"}'",
         "--ray-num-cpus", "4",
     ]
 )
@@ -928,7 +935,7 @@ py_test(
         "--env", "CartPole-v0",
         "--run", "IMPALA",
         "--stop", "'{\"training_iteration\": 1}'",
-        "--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
+        "--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_time_s_per_reporting\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
         "--ray-num-cpus", "4",
     ]
 )
@@ -942,7 +949,7 @@ py_test(
         "--env", "CartPole-v0",
         "--run", "IMPALA",
         "--stop", "'{\"training_iteration\": 1}'",
-        "--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
+        "--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_time_s_per_reporting\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
         "--ray-num-cpus", "4",
     ]
 )

2 changes: 1 addition & 1 deletion rllib/agents/a3c/a2c.py
@@ -17,7 +17,7 @@
     A3C_CONFIG,
     {
         "rollout_fragment_length": 20,
-        "min_iter_time_s": 10,
+        "min_time_s_per_reporting": 10,
         "sample_async": False,
 
         # A2C supports microbatching, in which we accumulate gradients over
4 changes: 2 additions & 2 deletions rllib/agents/a3c/a3c.py
@@ -39,8 +39,8 @@
     "entropy_coeff": 0.01,
     # Entropy coefficient schedule
     "entropy_coeff_schedule": None,
-    # Min time per iteration
-    "min_iter_time_s": 5,
+    # Min time per reporting
+    "min_time_s_per_reporting": 5,
     # Workers sample async. Note that this increases the effective
     # rollout_fragment_length by up to 5x due to async buffering of batches.
     "sample_async": True,

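The renamed key is user-facing. A minimal sketch of setting it on a trainer (env and value are illustrative; the construction style mirrors the tests below):

    from ray.rllib.agents import a3c

    config = {
        # Renamed in this commit (was `min_iter_time_s`): report results
        # at most once per 10s of wall time.
        "min_time_s_per_reporting": 10,
    }
    trainer = a3c.A2CTrainer(env="CartPole-v0", config=config)
    print(trainer.train()["episode_reward_mean"])
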
4 changes: 2 additions & 2 deletions rllib/agents/a3c/tests/test_a2c.py
@@ -35,7 +35,7 @@ def test_a2c_compilation(self):
                 trainer.stop()
 
     def test_a2c_exec_impl(ray_start_regular):
-        config = {"min_iter_time_s": 0}
+        config = {"min_time_s_per_reporting": 0}
         for _ in framework_iterator(config):
             trainer = a3c.A2CTrainer(env="CartPole-v0", config=config)
             results = trainer.train()
@@ -46,7 +46,7 @@ def test_a2c_exec_impl(ray_start_regular):
 
     def test_a2c_exec_impl_microbatch(ray_start_regular):
         config = {
-            "min_iter_time_s": 0,
+            "min_time_s_per_reporting": 0,
             "microbatch_size": 10,
         }
         for _ in framework_iterator(config):

2 changes: 1 addition & 1 deletion rllib/agents/a3c/tests/test_a3c.py
@@ -51,7 +51,7 @@ def test_a3c_entropy_coeff_schedule(self):
         config["timesteps_per_iteration"] = 20
         # 0 metrics reporting delay, this makes sure timestep,
         # which entropy coeff depends on, is updated after each worker rollout.
-        config["min_iter_time_s"] = 0
+        config["min_time_s_per_reporting"] = 0
         # Initial lr, doesn't really matter because of the schedule below.
         config["entropy_coeff"] = 0.01
         schedule = [

42 changes: 28 additions & 14 deletions rllib/agents/ars/ars.py
@@ -228,30 +228,44 @@ def validate_config(self, config: TrainerConfigDict) -> None:
                 "`NoFilter` for ARS!")
 
     @override(Trainer)
-    def _init(self, config, env_creator):
-        self.validate_config(config)
-        env_context = EnvContext(config["env_config"] or {}, worker_index=0)
-        env = env_creator(env_context)
+    def setup(self, config):
+        # Setup our config: Merge the user-supplied config (which could
+        # be a partial config dict) with the class' defaults.
+        self.config = self.merge_trainer_configs(
+            self.get_default_config(), config, self._allow_unknown_configs)
 
-        self._policy_class = get_policy_class(config)
+        # Validate our config dict.
+        self.validate_config(self.config)
+
+        # Generate `self.env_creator` callable to create an env instance.
+        self.env_creator = self._get_env_creator_from_env_id(self._env_id)
+        # Generate the local env.
+        env_context = EnvContext(
+            self.config["env_config"] or {}, worker_index=0)
+        env = self.env_creator(env_context)
+
+        self.callbacks = self.config["callbacks"]()
+
+        self._policy_class = get_policy_class(self.config)
         self.policy = self._policy_class(env.observation_space,
-                                         env.action_space, config)
-        self.optimizer = optimizers.SGD(self.policy, config["sgd_stepsize"])
+                                         env.action_space, self.config)
+        self.optimizer = optimizers.SGD(self.policy,
+                                        self.config["sgd_stepsize"])
 
-        self.rollouts_used = config["rollouts_used"]
-        self.num_rollouts = config["num_rollouts"]
-        self.report_length = config["report_length"]
+        self.rollouts_used = self.config["rollouts_used"]
+        self.num_rollouts = self.config["num_rollouts"]
+        self.report_length = self.config["report_length"]
 
         # Create the shared noise table.
         logger.info("Creating shared noise table.")
-        noise_id = create_shared_noise.remote(config["noise_size"])
+        noise_id = create_shared_noise.remote(self.config["noise_size"])
         self.noise = SharedNoiseTable(ray.get(noise_id))
 
         # Create the actors.
         logger.info("Creating actors.")
         self.workers = [
-            Worker.remote(config, env_creator, noise_id, idx + 1)
-            for idx in range(config["num_workers"])
+            Worker.remote(self.config, self.env_creator, noise_id, idx + 1)
+            for idx in range(self.config["num_workers"])
         ]
 
         self.episodes_so_far = 0
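
Because setup() now merges the user-supplied (possibly partial) config with the class defaults via merge_trainer_configs(), a sparse dict suffices at construction time. A small sketch (env and values illustrative):

    import ray
    from ray.rllib.agents.ars import ARSTrainer

    ray.init()
    # Keys omitted here fall back to ARS' defaults inside setup().
    trainer = ARSTrainer(env="CartPole-v0", config={"num_rollouts": 10})
    print(trainer.train()["episode_reward_mean"])
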
@@ -375,7 +389,7 @@ def compute_single_action(self, observation, *args, **kwargs):
             return action[0], [], {}
         return action[0]
 
-    @Deprecated(new="compute_single_action", error=False)
+    @Deprecated(new="compute_single_action", error=True)
     def compute_action(self, observation, *args, **kwargs):
         return self.compute_single_action(observation, *args, **kwargs)

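With error=True, the deprecated method now raises instead of just logging a warning, so callers must migrate. A sketch of the migration (the exact exception type comes from RLlib's deprecation util and is assumed, not confirmed by this diff):

    # Old, now raises a deprecation error:
    #     action = trainer.compute_action(obs)
    # New:
    action = trainer.compute_single_action(obs)
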
3 changes: 2 additions & 1 deletion rllib/agents/ars/tests/test_ars.py
@@ -22,7 +22,8 @@ def test_ars_compilation(self):
         config["model"]["fcnet_hiddens"] = [10]
         config["model"]["fcnet_activation"] = None
         config["noise_size"] = 2500000
-        # Test eval workers ("normal" Trainer eval WorkerSet, unusual for ARS).
+        # Test eval workers ("normal" WorkerSet, unlike ARS' list of
+        # RolloutWorkers used for collecting train batches).
         config["evaluation_interval"] = 1
         config["evaluation_num_workers"] = 1

2 changes: 1 addition & 1 deletion rllib/agents/ddpg/apex.py
@@ -38,7 +38,7 @@
         "target_network_update_freq": 500000,
         "timesteps_per_iteration": 25000,
         "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
+        "min_time_s_per_reporting": 30,
     },
     _allow_unknown_configs=True,
 )

4 changes: 2 additions & 2 deletions rllib/agents/ddpg/ddpg.py
@@ -171,8 +171,8 @@
     "num_workers": 0,
     # Whether to compute priorities on workers.
     "worker_side_prioritization": False,
-    # Prevent iterations from going lower than this time span
-    "min_iter_time_s": 1,
+    # Prevent reporting frequency from going lower than this time span.
+    "min_time_s_per_reporting": 1,
 })
 # __sphinx_doc_end__
 # yapf: enable

2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_apex_ddpg.py
@@ -20,7 +20,7 @@ def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
         config["num_workers"] = 2
         config["prioritized_replay"] = True
         config["timesteps_per_iteration"] = 100
-        config["min_iter_time_s"] = 1
+        config["min_time_s_per_reporting"] = 1
         config["learning_starts"] = 0
         config["optimizer"]["num_replay_buffer_shards"] = 1
         num_iterations = 1

2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_ddpg.py
@@ -154,7 +154,7 @@ def test_ddpg_loss_function(self):
         config["actor_hiddens"] = [10]
         config["critic_hiddens"] = [10]
         # Make sure, timing differences do not affect trainer.train().
-        config["min_iter_time_s"] = 0
+        config["min_time_s_per_reporting"] = 0
         config["timesteps_per_iteration"] = 100
 
         map_ = {

2 changes: 1 addition & 1 deletion rllib/agents/dqn/apex.py
@@ -78,7 +78,7 @@
         "timesteps_per_iteration": 25000,
         "exploration_config": {"type": "PerWorkerEpsilonGreedy"},
         "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
+        "min_time_s_per_reporting": 30,
         # If set, this will fix the ratio of replayed from a buffer and learned
         # on timesteps to sampled from an environment and stored in the replay
         # buffer timesteps. Otherwise, replay will proceed as fast as possible.

4 changes: 2 additions & 2 deletions rllib/agents/dqn/simple_q.py
@@ -103,8 +103,8 @@
     # to increase if your environment is particularly slow to sample, or if
     # you"re using the Async or Ape-X optimizers.
     "num_workers": 0,
-    # Prevent iterations from going lower than this time span.
-    "min_iter_time_s": 1,
+    # Prevent reporting frequency from going lower than this time span.
+    "min_time_s_per_reporting": 1,
 })
 # __sphinx_doc_end__
 # yapf: enable

6 changes: 3 additions & 3 deletions rllib/agents/dqn/tests/test_apex_dqn.py
@@ -24,7 +24,7 @@ def test_apex_zero_workers(self):
         config["learning_starts"] = 1000
         config["prioritized_replay"] = True
         config["timesteps_per_iteration"] = 100
-        config["min_iter_time_s"] = 1
+        config["min_time_s_per_reporting"] = 1
         config["optimizer"]["num_replay_buffer_shards"] = 1
         for _ in framework_iterator(config):
             trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
@@ -41,7 +41,7 @@ def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
         config["learning_starts"] = 1000
         config["prioritized_replay"] = True
         config["timesteps_per_iteration"] = 100
-        config["min_iter_time_s"] = 1
+        config["min_time_s_per_reporting"] = 1
         config["optimizer"]["num_replay_buffer_shards"] = 1
 
         for _ in framework_iterator(config, with_eager_tracing=True):
@@ -81,7 +81,7 @@ def test_apex_lr_schedule(self):
         config["timesteps_per_iteration"] = 10
         # 0 metrics reporting delay, this makes sure timestep,
         # which lr depends on, is updated after each worker rollout.
-        config["min_iter_time_s"] = 0
+        config["min_time_s_per_reporting"] = 0
         config["optimizer"]["num_replay_buffer_shards"] = 1
         # This makes sure learning schedule is checked every 10 timesteps.
         config["optimizer"]["max_weight_sync_delay"] = 10

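For context on the lr-schedule test above: schedules of this era are piecewise lists of [timestep, value] anchor points, interpolated between anchors. A sketch (values illustrative):

    config["lr"] = 1e-3  # initial learning rate, superseded by the schedule
    config["lr_schedule"] = [[0, 1e-3], [10000, 1e-5]]
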
(The remaining changed files in this commit are not shown here.)