Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 193 additions & 0 deletions examples/06_ppo_with_sb3_ma_menv_control.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
"""Train HR-PPO agent."""
import logging
from contextlib import nullcontext
from datetime import datetime

import numpy as np
import torch
from box import Box
from stable_baselines3.common.policies import ActorCriticPolicy

import wandb

from typing import Callable

# Import networks
from networks.mlp_late_fusion import LateFusionMLP, LateFusionMLPPolicy
# Permutation equivariant network
from networks.perm_eq_late_fusion import LateFusionNet, LateFusionPolicy

# Multi-agent as vectorized environment
from nocturne.envs.vec_env_ma import MultiAgentAsVecEnv
from utils.config import load_config
from utils.random_utils import init_seed
from utils.render import make_video

# Custom callback
from utils.sb3.callbacks import CustomMultiAgentCallback

# Custom PPO class that supports multi-agent control
from utils.sb3.reg_ppo import RegularizedPPO
from utils.string_utils import datetime_to_str

logging.basicConfig(level=logging.INFO)


def linear_schedule(initial_value: float) -> Callable[[float], float]:
"""
Linear learning rate schedule.

:param initial_value: Initial learning rate.
:return: schedule that computes
current learning rate depending on remaining progress
"""
def func(progress_remaining: float) -> float:
"""
Progress will decrease from 1 (beginning) to 0.

:param progress_remaining:
:return: current learning rate
"""
return progress_remaining * initial_value

return func

def train(env_config, exp_config, video_config, model_config): # pylint: disable=redefined-outer-name
"""Train RL agent using PPO."""
# Ensure reproducability
init_seed(env_config, exp_config, exp_config.seed)

# Make environment
from nocturne.envs.nocturne_gymnasium import NocturneGymnasium
from stable_baselines3.common.vec_env import SubprocVecEnv
from nocturne.envs.base_env import BaseEnv
def make_env(env_config):
return NocturneGymnasium(BaseEnv(config=env_config))

env = SubprocVecEnv([lambda: make_env(env_config) for _ in range(4)])

# Set up run
datetime_ = datetime_to_str(dt=datetime.now())
run_id = f"{datetime_}" if exp_config.track_wandb else None

# Add scene to config
# exp_config.scene = env.filename
exp_config.track_wandb = False

with wandb.init(
project=exp_config.project,
name=run_id,
group=exp_config.group,
config={**exp_config, **env_config},
id=run_id,
**exp_config.wandb,
) if exp_config.track_wandb else nullcontext() as run:
# Set device
exp_config.ppo.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# logging.info(f"Created env. Max # agents = {env_config.max_num_vehicles}.")
# logging.info(f"Learning in {len(env.env.files)} scene(s): {env.env.files} | using {exp_config.ppo.device}")
# logging.info(f"--- obs_space: {env.observation_space.shape[0]} ---")
# logging.info(f"Action_space\n: {env.env.idx_to_actions}")

# if exp_config.reg_weight > 0.0:
# logging.info(f"Regularization weight: {exp_config.reg_weight} with policy: {exp_config.human_policy_path}")

# # Initialize custom callback
custom_callback = CustomMultiAgentCallback(
env_config=env_config,
exp_config=exp_config,
video_config=video_config,
wandb_run=run if run_id is not None else None,
)

# Make scene init video to check expert actions
# if exp_config.track_wandb:
# for model in exp_config.wandb_init_videos:
# make_video(
# env_config=env_config,
# exp_config=exp_config,
# video_config=video_config,
# filenames=[env.filename],
# model=model,
# n_steps=None,
# )
exp_config.track_wandb = False

human_policy = None
# Load human reference policy if regularization is used
if exp_config.reg_weight > 0.0:
saved_variables = torch.load(exp_config.human_policy_path, map_location=exp_config.ppo.device)
human_policy = ActorCriticPolicy(**saved_variables["data"])
human_policy.load_state_dict(saved_variables["state_dict"])
human_policy.to(exp_config.ppo.device)

# Set up PPO
model = RegularizedPPO(
learning_rate=linear_schedule(1e-4),
reg_policy=human_policy,
reg_weight=exp_config.reg_weight, # Regularization weight; lambda
env=env,
n_steps=exp_config.ppo.n_steps,
policy=LateFusionPolicy,
ent_coef=exp_config.ppo.ent_coef,
vf_coef=exp_config.ppo.vf_coef,
seed=exp_config.seed, # Seed for the pseudo random generators
verbose=exp_config.verbose,
tensorboard_log=f"runs/{run_id}" if run_id is not None else None,
device=exp_config.ppo.device,
env_config=env_config,
mlp_class=LateFusionNet,
mlp_config=model_config,
)

# Log number of trainable parameters
policy_params = filter(lambda p: p.requires_grad, model.policy.parameters())
params = sum(np.prod(p.size()) for p in policy_params)
exp_config.n_policy_params = params
logging.info(f"Policy | trainable params: {params:,} \n")

# Architecture
logging.info(f"Policy | arch: \n {model.policy}")


return env, model
# Learn
# model.learn(
# **exp_config.learn,
# callback=custom_callback,
# )


if __name__ == "__main__":
env_config = load_config("env_config")
exp_config = load_config("exp_config")
video_config = load_config("video_config")

env_config.num_files = 10

# Define model architecture
model_config = None
# model_config = Box(
# {
# "arch_ego_state": [8],
# "arch_road_objects": [64],
# "arch_road_graph": [128, 64],
# "arch_shared_net": [128],
# "act_func": "tanh",
# "dropout": 0.0,
# "last_layer_dim_pi": 64,
# "last_layer_dim_vf": 64,
# }
# )

# Train
env, model = train(
env_config=env_config,
exp_config=exp_config,
video_config=video_config,
model_config=model_config,
)


model.learn(1)
169 changes: 169 additions & 0 deletions examples/07_nocturne_pufferlib.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from nocturne.envs.nocturne_gymnasium import NocturneGymnasium, CustomPostprocessor\n",
"import yaml\n",
"from nocturne.envs.base_env import BaseEnv\n",
"from nocturne.envs.vec_env_ma import MultiAgentAsVecEnv\n",
"\n",
"import pufferlib.vectorization\n",
"vec = pufferlib.vectorization.Multiprocessing\n",
"# vec = pufferlib.vectorization.Serial\n",
"\n",
"import pufferlib.emulation\n",
"import pufferlib.wrappers\n",
"\n",
"from time import perf_counter\n",
"\n",
"# Load environment settings\n",
"with open(f\"../configs/env_config.yaml\", \"r\") as stream:\n",
" env_config = yaml.safe_load(stream)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def make_env(env_config):\n",
" return NocturneGymnasium(config=env_config, num_agents=env_config[\"max_num_vehicles\"]) \n",
"\n",
"def nocturne_creator(env_config):\n",
" return pufferlib.emulation.GymnasiumPufferEnv(env_creator=make_env, env_args=(env_config,), postprocessor_cls=CustomPostprocessor)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# envs = vec(nocturne_creator,env_args=[env_config,], num_envs=4, envs_per_worker=2, env_pool=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# envs.async_reset()\n",
"# obs = envs.recv()[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# actions = [envs.single_action_space.sample() for _ in range(4)]\n",
"# envs.step(actions)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average time for single env step: 0.002120851782616228\n",
"Average FPS for single env step: 471.50866844944056\n"
]
}
],
"source": [
"NUM_STEPS = 1000\n",
"\n",
"env = make_env(env_config)\n",
"env.reset()\n",
"\n",
"total_time = 0\n",
"\n",
"for i in range(NUM_STEPS):\n",
" actions = env.action_space.sample()\n",
" start = perf_counter()\n",
" env.step(actions)\n",
" end = perf_counter()\n",
" total_time += end - start\n",
"\n",
"print(f\"Average time for single env step: {total_time/NUM_STEPS}\")\n",
"print(f\"Average FPS for single env step: {NUM_STEPS/(total_time)}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average time for 1000 env step: 0.0010244677745140506\n",
"Average FPS for 1000 env step: 976.1165991525143\n"
]
}
],
"source": [
"NUM_STEPS = 1000\n",
"NUM_PROCESSES = 32\n",
"NUM_ENVS = 128\n",
"assert(NUM_ENVS % NUM_PROCESSES == 0)\n",
"NUM_ENVS_PER_WORKER = NUM_ENVS // NUM_PROCESSES\n",
"\n",
"envs = vec(nocturne_creator,env_args=[env_config,], num_envs=NUM_ENVS, envs_per_worker=1, env_pool=False)\n",
"envs.async_reset()\n",
"obs = envs.recv()[0]\n",
"\n",
"total_time = 0\n",
"for i in range(NUM_STEPS):\n",
" actions = [envs.single_action_space.sample() for _ in range(NUM_ENVS)]\n",
" start = perf_counter()\n",
" envs.step(actions)\n",
" end = perf_counter()\n",
" total_time += end - start\n",
"envs.close()\n",
"print(f\"Average time for {NUM_STEPS} env step: {total_time/(NUM_STEPS*NUM_ENVS)}\")\n",
"print(f\"Average FPS for {NUM_STEPS} env step: {(NUM_STEPS/(total_time))*NUM_ENVS}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading