Emerge-Lab · daphne-cornelisse · Apr 23, 2025 · Mar 20, 2025 · Mar 20, 2025 · Mar 20, 2025
diff --git a/.env.template b/.env.template
diff --git a/.gitignore b/.gitignore
@@ -27,9 +27,11 @@ data/raw/*
 data/processed/validation/*
 data/processed/training/*
 data/processed/testing/*
-data/processed/sampled/*
+data/processed/pop_play/*
 data/processed/hand_designed/*
 analyze/figures/*
+data/other/*
+wosac/
 
 # Logging
 /wandb

diff --git a/baselines/ppo/config/ppo_base_puffer.yaml b/baselines/ppo/config/ppo_base_puffer.yaml
@@ -8,20 +8,31 @@ model_cpt: null
 
 environment: # Overrides default environment configs (see pygpudrive/env/config.py)
   name: "gpudrive"
-  num_worlds: 75 # Number of parallel environments
-  k_unique_scenes: 75  # Number of unique scenes to sample from
+  num_worlds: 100 # Number of parallel environments
+  k_unique_scenes: 100 # Number of unique scenes to sample from
   max_controlled_agents: 64 # Maximum number of agents controlled by the model. Make sure this aligns with the variable kMaxAgentCount in src/consts.hpp
   ego_state: true
   road_map_obs: true
   partner_obs: true
   norm_obs: true
-  add_goal_state: true # If true, the goal state is added to the ego observation
+  add_reference_path: false
   remove_non_vehicles: false # If false, all agents are included (vehicles, pedestrians, cyclists)
   lidar_obs: false # NOTE: Setting this to true currently turns of the other observation types
-  reward_type: "weighted_combination"
+  reward_type: "weighted_combination" # Options: "weighted_combination", "reward_conditioned", "follow_waypoints"
   collision_weight: -0.75
   off_road_weight: -0.75
   goal_achieved_weight: 1.0
+  init_mode: all_non_trivial
+
+  # Used only if reward_type is "reward_conditioned"; ppo_pufferlib.py then overwrites condition_mode from its randomize_rewards option (default 0 -> "fixed")
+  condition_mode: random
+  collision_weight_lb: -3.0
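+  collision_weight_ub: 0.01 # NOTE(review): positive bound, unlike off_road_weight_ub (0.0) and ppo_population.yaml (0.0) - confirm this is intended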
+  goal_achieved_weight_lb: 1.0
+  goal_achieved_weight_ub: 3.0
+  off_road_weight_lb: -3.0
+  off_road_weight_ub: 0.0
+
   dynamics_model: "classic"
   collision_behavior: "ignore" # Options: "remove", "stop", "ignore"
   goal_behavior: "remove" # Options: "remove", "stop", "ignore"
@@ -39,8 +50,8 @@ environment: # Overrides default environment configs (see pygpudrive/env/config.
 
 wandb:
   entity: ""
-  project: "clean_tests"
-  group: " "
+  project: "gpudrive"
+  group: ""
   mode: "online" # Options: online, offline, disabled
   tags: ["ppo", "ff"]
 
@@ -54,16 +65,16 @@ train:
   compile_mode: "reduce-overhead"
 
   # # # Data sampling # # #
-  resample_scenes: false
+  resample_scenes: true
   resample_dataset_size: 10_000 # Number of unique scenes to sample from
   resample_interval: 2_000_000
   sample_with_replacement: true
   shuffle_dataset: false
 
   # # # PPO # # #
   torch_deterministic: false
-  total_timesteps: 1_000_000_000
-  batch_size: 131_072
+  total_timesteps: 2_000_000_000
+  batch_size: 262_144
   minibatch_size: 8192
   learning_rate: 3e-4
   anneal_lr: false
@@ -89,7 +100,7 @@ train:
     num_parameters: 0 # Total trainable parameters, to be filled at runtime
 
   # # # Checkpointing # # #
-  checkpoint_interval: 400 # Save policy every k iterations
+  checkpoint_interval: 500 # Save policy every k iterations
   checkpoint_path: "./runs"
 
   # # # Rendering # # #

diff --git a/baselines/ppo/config/ppo_population.yaml b/baselines/ppo/config/ppo_population.yaml
@@ -0,0 +1,119 @@
+mode: "train"
+use_rnn: false
+eval_model_path: null
+baseline: false
+data_dir: data/processed/pop_play
+continue_training: false
+model_cpt: null
+
+environment: # Overrides default environment configs (see pygpudrive/env/config.py)
+  name: "gpudrive"
+  num_worlds: 100 # Number of parallel environments
+  k_unique_scenes: 100 # Number of unique scenes to sample from
+  max_controlled_agents: 64 # Maximum number of agents controlled by the model. Make sure this aligns with the variable kMaxAgentCount in src/consts.hpp
+  ego_state: true
+  road_map_obs: true
+  partner_obs: true
+  norm_obs: true
+  remove_non_vehicles: false # If false, all agents are included (vehicles, pedestrians, cyclists)
+  lidar_obs: false # NOTE: Setting this to true currently turns off the other observation types
+  reward_type: "reward_conditioned" # Options: "weighted_combination", "reward_conditioned", "follow_waypoints"
+  collision_weight: -0.75
+  off_road_weight: -0.75
+  goal_achieved_weight: 1.0
+  init_mode: all_non_trivial
+
+  # If reward_type is "reward_conditioned", the following parameters are used
+  randomize_rewards: true # NOTE(review): ppo_pufferlib.py has its own randomize_rewards CLI option (default 0); whether this key is read is not visible here - confirm
+  condition_mode: random # Options: random, fixed. NOTE: ppo_pufferlib.py overwrites this from its randomize_rewards option (default 0 -> "fixed") when reward_type is "reward_conditioned"
+  collision_weight_lb: -3.0
+  collision_weight_ub: 0.0
+  goal_achieved_weight_lb: 1.0
+  goal_achieved_weight_ub: 3.0
+  off_road_weight_lb: -3.0
+  off_road_weight_ub: 0.0
+
+  dynamics_model: "classic"
+  collision_behavior: "ignore" # Options: "remove", "stop", "ignore"
+  dist_to_goal_threshold: 2.0
+  polyline_reduction_threshold: 0.1 # Rate at which to sample points from the polyline (0 is use all closest points, 1 maximum sparsity), needs to be balanced with kMaxAgentMapObservationsCount
+  sampling_seed: 42 # If given, the set of scenes to sample from will be deterministic, if None, the set of scenes will be random
+  obs_radius: 50.0 # Visibility radius of the agents
+  action_space_steer_disc: 13
+  action_space_accel_disc: 7
+  init_steps: 0 # Warmup steps
+  # Versatile Behavior Diffusion (VBD): This will slow down training
+  use_vbd: false
+  vbd_model_path: "gpudrive/integrations/vbd/weights/epoch=18.ckpt"
+  vbd_trajectory_weight: 0.1 # Importance of distance to the vbd trajectories in the reward function
+  vbd_in_obs: false
+
+wandb:
+  entity: ""
+  project: "kshotagents"
+  group: "separate_actor_critic"
+  mode: "online" # Options: online, offline, disabled
+  tags: ["ppo", "ff"]
+
+train:
+  exp_id: null # Placeholder; ppo_pufferlib.py sets this to "random_weights" or "fixed_weights" when reward_type is "reward_conditioned"
+  seed: 42
+  cpu_offload: false
+  device: "cuda"  # Dynamically set to cuda if available, else cpu
+  bptt_horizon: 1
+  compile: false
+  compile_mode: "reduce-overhead"
+
+  # # # Data sampling # # #
+  resample_scenes: false
+  resample_dataset_size: 500 # Number of unique scenes to sample from
+  resample_interval: 2_000_000
+  sample_with_replacement: false
+  shuffle_dataset: false
+
+  # # # PPO # # #
+  torch_deterministic: false
+  total_timesteps: 2_000_000_000
+  batch_size: 131072
+  minibatch_size: 8192
+  learning_rate: 3e-4 # NOTE(review): YAML 1.1 resolvers (e.g. PyYAML) read "3e-4" as a string because it has no decimal point - confirm the loader yields a float
+  anneal_lr: true
+  gamma: 0.99
+  gae_lambda: 0.95
+  update_epochs: 4
+  norm_adv: true
+  clip_coef: 0.2
+  clip_vloss: false
+  vf_clip_coef: 0.2
+  ent_coef: 0.001
+  vf_coef: 0.5
+  max_grad_norm: 0.5
+  target_kl: null
+  log_window: 1000
+
+  # # # Network # # #
+  network:
+    embed_dim: 64 # Embedding of the input features
+    dropout: 0.01
+    class_name: "Agent"
+    num_parameters: 0 # Total trainable parameters, to be filled at runtime
+
+  # # # Checkpointing # # #
+  checkpoint_interval: 250 # Save policy every k iterations
+  checkpoint_path: "./runs"
+
+  # # # Rendering # # #
+  render: false # Determines whether to render the environment (note: will slow down training)
+  render_3d: false # Render simulator state in 3d or 2d
+  render_interval: 50 # Render every k iterations
+  render_k_scenarios: 1 # Number of scenarios to render
+  render_format: "mp4" # Options: gif, mp4
+  render_fps: 20 # Frames per second
+  zoom_radius: 100
+  plot_waypoints: true
+
+vec:
+  backend: "native" # Only native is currently supported
+  num_workers: 1
+  env_batch_size: 1
+  zero_copy: false
diff --git a/baselines/ppo/config/ppo_waypoint.yaml b/baselines/ppo/config/ppo_waypoint.yaml
@@ -0,0 +1,122 @@
+mode: "train"
+use_rnn: false
+eval_model_path: null
+baseline: false
+data_dir: data/processed/wosac/validation_json_100
+continue_training: false
+model_cpt: null
+
+environment: # Overrides default environment configs (see pygpudrive/env/config.py)
+  name: "gpudrive"
+  num_worlds: 100 # Number of parallel environments
+  k_unique_scenes: 100 # Number of unique scenes to sample from
+  max_controlled_agents: 64 # Maximum number of agents controlled by the model. Make sure this aligns with the variable kMaxAgentCount in src/consts.hpp
+  ego_state: true
+  road_map_obs: true
+  partner_obs: true
+  norm_obs: true
+  remove_non_vehicles: false
+  collision_behavior: "ignore"
+  goal_behavior: "ignore"
+  reward_type: "follow_waypoints"
+  waypoint_distance_scale: 0.01
+  speed_distance_scale: 0.01
+  jerk_smoothness_scale: 0.001
+
+  init_mode: all_non_trivial  # Alternative (presumably): womd_tracks_to_predict
+  dynamics_model: "classic"
+  # Rate at which to sample points from the polyline (0 is use all closest points, 1 maximum sparsity), needs to be balanced with kMaxAgentMapObservationsCount
+  polyline_reduction_threshold: 0.1
+  sampling_seed: 42 # If given, the set of scenes to sample from will be deterministic, if None, the set of scenes will be random
+  obs_radius: 50.0 # Visibility radius of the agents
+  action_space_steer_disc: 15
+  action_space_accel_disc: 11
+  init_steps: 0 # Warmup steps
+  goal_achieved_weight: 0.0
+  collision_weight: -0.2
+  off_road_weight: -0.2
+
+  # Versatile Behavior Diffusion (VBD)
+  use_vbd: false
+  vbd_trajectory_weight: 0.1 # Importance of distance to the vbd trajectories in the reward function
+  vbd_in_obs: false
+
+  # Planning guidance
+  add_reference_path: true # If true, a reference path is added to the ego observation
+  add_reference_speed: true # If true, the reference speed (scalar) is added to the ego observation
+  prob_reference_dropout: 0.0 # Value between 0 and 1, probability of a reference point to be zeroed out
+
+wandb:
+  entity: ""
+  project: "humanlike"
+  group: "debug"
+  mode: "online" # Options: online, offline, disabled
+  tags: ["ppo", "ff"]
+
+train:
+  exp_id: waypoint_rs # Set dynamically in the script if needed
+  seed: 42
+  cpu_offload: false
+  device: "cuda" # Dynamically set to cuda if available, else cpu
+  bptt_horizon: 1
+  compile: false
+  compile_mode: "reduce-overhead"
+
+  # # # Data sampling # # #
+  resample_scenes: false
+  resample_dataset_size: 500 # Number of unique scenes to sample from
+  resample_interval: 2_000_000
+  sample_with_replacement: true
+  shuffle_dataset: true
+  file_prefix: ""
+
+  # # # PPO # # #
+  torch_deterministic: false
+  total_timesteps: 2_000_000_000
+  batch_size: 131072
+  minibatch_size: 8192
+  learning_rate: 3e-4
+  anneal_lr: true
+  gamma: 0.99
+  gae_lambda: 0.95
+  update_epochs: 4
+  norm_adv: true
+  clip_coef: 0.2
+  clip_vloss: false
+  vf_clip_coef: 0.2
+  ent_coef: 0.001
+  vf_coef: 0.5
+  max_grad_norm: 0.5
+  target_kl: null
+
+  # # # Logging # # #
+  log_window: 500
+  track_realism_metrics: true # Log human-like metrics
+  track_n_worlds: 3 # Number of worlds to track
+
+  # # # Network # # #
+  network:
+    embed_dim: 64 # Embedding of the input features
+    dropout: 0.01
+    class_name: "Agent"
+    num_parameters: 0 # Total trainable parameters, to be filled at runtime
+
+  # # # Checkpointing # # #
+  checkpoint_interval: 500 # Save policy every k iterations
+  checkpoint_path: "./runs"
+
+  # # # Rendering # # #
+  render: false # Determines whether to render the environment (note: will slow down training)
+  render_3d: false # Render simulator state in 3d or 2d
+  render_interval: 150 # Render every k iterations
+  render_k_scenarios: 2 # Number of scenarios to render
+  render_format: "mp4" # Options: gif, mp4
+  render_fps: 20 # Frames per second
+  zoom_radius: 100
+  plot_waypoints: true
+
+vec:
+  backend: "native" # Only native is currently supported
+  num_workers: 1
+  env_batch_size: 1
+  zero_copy: false
diff --git a/baselines/ppo/ppo_pufferlib.py b/baselines/ppo/ppo_pufferlib.py
@@ -161,11 +161,13 @@ def run(
     # fmt: off
     # Environment options
     num_worlds: Annotated[Optional[int], typer.Option(help="Number of parallel envs")] = None,
+    max_controlled_agents: Annotated[Optional[int], typer.Option(help="Number of controlled agents")] = None,
     k_unique_scenes: Annotated[Optional[int], typer.Option(help="The number of unique scenes to sample")] = None,
     collision_weight: Annotated[Optional[float], typer.Option(help="The weight for collision penalty")] = None,
     off_road_weight: Annotated[Optional[float], typer.Option(help="The weight for off-road penalty")] = None,
     goal_achieved_weight: Annotated[Optional[float], typer.Option(help="The weight for goal-achieved reward")] = None,
     dist_to_goal_threshold: Annotated[Optional[float], typer.Option(help="The distance threshold for goal-achieved")] = None,
+    randomize_rewards: Annotated[Optional[int], typer.Option(help="If reward_type == reward_conditioned, choose the condition_mode; 0 or 1")] = 0,
     sampling_seed: Annotated[Optional[int], typer.Option(help="The seed for sampling scenes")] = None,
     obs_radius: Annotated[Optional[float], typer.Option(help="The radius for the observation")] = None,
     collision_behavior: Annotated[Optional[str], typer.Option(help="The collision behavior; 'ignore' or 'remove'")] = None,
@@ -200,9 +202,20 @@ def run(
     # Load default configs
     config = load_config(config_path)
 
+    if config.environment.reward_type == "reward_conditioned":
+        if bool(randomize_rewards):
+            config.environment.condition_mode = "random"
+            config.train.exp_id = "random_weights"
+        else:
+            config.environment.condition_mode = (
+                "fixed"  # Use the same type for every agent
+            )
+            config.train.exp_id = "fixed_weights"
+
     # Override configs with command-line arguments
     env_config = {
         "num_worlds": num_worlds,
+        "max_controlled_agents": max_controlled_agents,
         "k_unique_scenes": k_unique_scenes,
         "collision_weight": collision_weight,
         "off_road_weight": off_road_weight,