Emerge-Lab · aaravpandya · Jan 13, 2024 · Jan 15, 2024
@@ -0,0 +1,193 @@
+"""Train HR-PPO agent."""
+import logging
+from contextlib import nullcontext
+from datetime import datetime
+
+import numpy as np
+import torch
+from box import Box
+from stable_baselines3.common.policies import ActorCriticPolicy
+
+import wandb
+
+from typing import Callable
+
+# Import networks
+from networks.mlp_late_fusion import LateFusionMLP, LateFusionMLPPolicy
+# Permutation equivariant network
+from networks.perm_eq_late_fusion import LateFusionNet, LateFusionPolicy 
+
+# Multi-agent as vectorized environment
+from nocturne.envs.vec_env_ma import MultiAgentAsVecEnv
+from utils.config import load_config
+from utils.random_utils import init_seed
+from utils.render import make_video
+
+# Custom callback
+from utils.sb3.callbacks import CustomMultiAgentCallback
+
+# Custom PPO class that supports multi-agent control
+from utils.sb3.reg_ppo import RegularizedPPO
+from utils.string_utils import datetime_to_str
+
+logging.basicConfig(level=logging.INFO)
+
+
+def linear_schedule(initial_value: float) -> Callable[[float], float]:
+    """
+    Linear learning rate schedule.
+
+    :param initial_value: Initial learning rate.
+    :return: schedule that computes
+      current learning rate depending on remaining progress
+    """
+    def func(progress_remaining: float) -> float:
+        """
+        Progress will decrease from 1 (beginning) to 0.
+
+        :param progress_remaining:
+        :return: current learning rate
+        """
+        return progress_remaining * initial_value
+
+    return func
+
+def train(env_config, exp_config, video_config, model_config):  # pylint: disable=redefined-outer-name
+    """Train RL agent using PPO."""
+    # Ensure reproducability
+    init_seed(env_config, exp_config, exp_config.seed)
+
+    # Make environment
+    from nocturne.envs.nocturne_gymnasium import NocturneGymnasium
+    from stable_baselines3.common.vec_env import SubprocVecEnv
+    from nocturne.envs.base_env import BaseEnv
+    def make_env(env_config):
+        return NocturneGymnasium(BaseEnv(config=env_config)) 
+
+    env = SubprocVecEnv([lambda: make_env(env_config) for _ in range(4)])
+
+    # Set up run
+    datetime_ = datetime_to_str(dt=datetime.now())
+    run_id = f"{datetime_}" if exp_config.track_wandb else None
+
+    # Add scene to config
+    # exp_config.scene = env.filename
+    exp_config.track_wandb = False
+
+    with wandb.init(
+        project=exp_config.project,
+        name=run_id,
+        group=exp_config.group,
+        config={**exp_config, **env_config},
+        id=run_id,
+        **exp_config.wandb,
+    ) if exp_config.track_wandb else nullcontext() as run:
+        # Set device
+        exp_config.ppo.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # logging.info(f"Created env. Max # agents = {env_config.max_num_vehicles}.")
+        # logging.info(f"Learning in {len(env.env.files)} scene(s): {env.env.files} | using {exp_config.ppo.device}")
+        # logging.info(f"--- obs_space: {env.observation_space.shape[0]} ---")
+        # logging.info(f"Action_space\n: {env.env.idx_to_actions}")
+
+        # if exp_config.reg_weight > 0.0:
+        #     logging.info(f"Regularization weight: {exp_config.reg_weight} with policy: {exp_config.human_policy_path}")
+
+        # # Initialize custom callback
+        custom_callback = CustomMultiAgentCallback(
+            env_config=env_config,
+            exp_config=exp_config,
+            video_config=video_config,
+            wandb_run=run if run_id is not None else None,
+        )
+
+        # Make scene init video to check expert actions
+        # if exp_config.track_wandb:
+        #     for model in exp_config.wandb_init_videos:
+        #         make_video(
+        #             env_config=env_config,
+        #             exp_config=exp_config,
+        #             video_config=video_config,
+        #             filenames=[env.filename],
+        #             model=model,
+        #             n_steps=None,
+        #         )
+        exp_config.track_wandb = False
+
+        human_policy = None
+        # Load human reference policy if regularization is used
+        if exp_config.reg_weight > 0.0:
+            saved_variables = torch.load(exp_config.human_policy_path, map_location=exp_config.ppo.device)
+            human_policy = ActorCriticPolicy(**saved_variables["data"])
+            human_policy.load_state_dict(saved_variables["state_dict"])
+            human_policy.to(exp_config.ppo.device)
+
+        # Set up PPO
+        model = RegularizedPPO(
+            learning_rate=linear_schedule(1e-4),
+            reg_policy=human_policy,
+            reg_weight=exp_config.reg_weight,  # Regularization weight; lambda
+            env=env,
+            n_steps=exp_config.ppo.n_steps,
+            policy=LateFusionPolicy,
+            ent_coef=exp_config.ppo.ent_coef,
+            vf_coef=exp_config.ppo.vf_coef,
+            seed=exp_config.seed,  # Seed for the pseudo random generators
+            verbose=exp_config.verbose,
+            tensorboard_log=f"runs/{run_id}" if run_id is not None else None,
+            device=exp_config.ppo.device,
+            env_config=env_config,
+            mlp_class=LateFusionNet,
+            mlp_config=model_config,
+        )
+
+        # Log number of trainable parameters
+        policy_params = filter(lambda p: p.requires_grad, model.policy.parameters())
+        params = sum(np.prod(p.size()) for p in policy_params)
+        exp_config.n_policy_params = params
+        logging.info(f"Policy | trainable params: {params:,} \n")
+
+        # Architecture
+        logging.info(f"Policy | arch: \n {model.policy}")
+
+
+        return env, model
+        # Learn
+        # model.learn(
+        #     **exp_config.learn,
+        #     callback=custom_callback,
+        # )
+
+
+if __name__ == "__main__":
+    env_config = load_config("env_config")
+    exp_config = load_config("exp_config")
+    video_config = load_config("video_config")
+
+    env_config.num_files = 10
+
+    # Define model architecture
+    model_config = None
+    # model_config = Box(
+    #     {
+    #         "arch_ego_state": [8],
+    #         "arch_road_objects": [64],
+    #         "arch_road_graph": [128, 64],
+    #         "arch_shared_net": [128],
+    #         "act_func": "tanh",
+    #         "dropout": 0.0,
+    #         "last_layer_dim_pi": 64,
+    #         "last_layer_dim_vf": 64,
+    #     }
+    # )
+
+    # Train
+    env, model = train(
+        env_config=env_config,
+        exp_config=exp_config,
+        video_config=video_config,
+        model_config=model_config,
+    )
+
+
+    model.learn(1)
@@ -0,0 +1,169 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nocturne.envs.nocturne_gymnasium import NocturneGymnasium, CustomPostprocessor\n",
+    "import yaml\n",
+    "from nocturne.envs.base_env import BaseEnv\n",
+    "from nocturne.envs.vec_env_ma import MultiAgentAsVecEnv\n",
+    "\n",
+    "import pufferlib.vectorization\n",
+    "vec = pufferlib.vectorization.Multiprocessing\n",
+    "# vec = pufferlib.vectorization.Serial\n",
+    "\n",
+    "import pufferlib.emulation\n",
+    "import pufferlib.wrappers\n",
+    "\n",
+    "from time import perf_counter\n",
+    "\n",
+    "# Load environment settings\n",
+    "with open(f\"../configs/env_config.yaml\", \"r\") as stream:\n",
+    "    env_config = yaml.safe_load(stream)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_env(env_config):\n",
+    "    return NocturneGymnasium(config=env_config, num_agents=env_config[\"max_num_vehicles\"]) \n",
+    "\n",
+    "def nocturne_creator(env_config):\n",
+    "    return pufferlib.emulation.GymnasiumPufferEnv(env_creator=make_env, env_args=(env_config,), postprocessor_cls=CustomPostprocessor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# envs = vec(nocturne_creator,env_args=[env_config,], num_envs=4, envs_per_worker=2, env_pool=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# envs.async_reset()\n",
+    "# obs = envs.recv()[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# actions = [envs.single_action_space.sample() for _ in range(4)]\n",
+    "# envs.step(actions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average time for single env step: 0.002120851782616228\n",
+      "Average FPS for single env step: 471.50866844944056\n"
+     ]
+    }
+   ],
+   "source": [
+    "NUM_STEPS = 1000\n",
+    "\n",
+    "env = make_env(env_config)\n",
+    "env.reset()\n",
+    "\n",
+    "total_time = 0\n",
+    "\n",
+    "for i in range(NUM_STEPS):\n",
+    "    actions = env.action_space.sample()\n",
+    "    start = perf_counter()\n",
+    "    env.step(actions)\n",
+    "    end = perf_counter()\n",
+    "    total_time += end - start\n",
+    "\n",
+    "print(f\"Average time for single env step: {total_time/NUM_STEPS}\")\n",
+    "print(f\"Average FPS for single env step: {NUM_STEPS/(total_time)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average time for 1000 env step: 0.0010244677745140506\n",
+      "Average FPS for 1000 env step: 976.1165991525143\n"
+     ]
+    }
+   ],
+   "source": [
+    "NUM_STEPS = 1000\n",
+    "NUM_PROCESSES = 32\n",
+    "NUM_ENVS = 128\n",
+    "assert(NUM_ENVS % NUM_PROCESSES == 0)\n",
+    "NUM_ENVS_PER_WORKER = NUM_ENVS // NUM_PROCESSES\n",
+    "\n",
+    "envs = vec(nocturne_creator,env_args=[env_config,], num_envs=NUM_ENVS, envs_per_worker=1, env_pool=False)\n",
+    "envs.async_reset()\n",
+    "obs = envs.recv()[0]\n",
+    "\n",
+    "total_time = 0\n",
+    "for i in range(NUM_STEPS):\n",
+    "    actions = [envs.single_action_space.sample() for _ in range(NUM_ENVS)]\n",
+    "    start = perf_counter()\n",
+    "    envs.step(actions)\n",
+    "    end = perf_counter()\n",
+    "    total_time += end - start\n",
+    "envs.close()\n",
+    "print(f\"Average time for {NUM_STEPS} env step: {total_time/(NUM_STEPS*NUM_ENVS)}\")\n",
+    "print(f\"Average FPS for {NUM_STEPS} env step: {(NUM_STEPS/(total_time))*NUM_ENVS}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}