diff --git a/clean_pufferl.py b/clean_pufferl.py index 20da4c0246..96bd3264e6 100644 --- a/clean_pufferl.py +++ b/clean_pufferl.py @@ -1,885 +1,684 @@ -from pdb import set_trace as T -import numpy as np +# TODO: Add information +# - Help menu +# - Docs link +#python -m torch.distributed.run --standalone --nnodes=1 --nproc-per-node=1 clean_pufferl.py --env puffer_nmmo3 --mode train +#from torch.distributed.elastic.multiprocessing.errors import record +#@record import os -import random -import psutil +import glob +import ast import time - +import random +import shutil +import argparse +import configparser from threading import Thread from collections import defaultdict, deque -from contextlib import nullcontext -import rich -from rich.console import Console -from rich.table import Table +import numpy as np +import psutil + import torch -import torch.distributed as dist -from torch.utils.cpp_extension import load +import torch.distributed +import torch.utils.cpp_extension import pufferlib -import pufferlib.utils +import pufferlib.sweep +import pufferlib.vector import pufferlib.pytorch +from pufferlib import _C -def create(config, vecenv, policy, optimizer=None, wandb=None, neptune=None): - random.seed(config.seed) - np.random.seed(config.seed) - torch.backends.cudnn.deterministic = config.torch_deterministic - torch.backends.cudnn.benchmark = True - torch.set_float32_matmul_precision('high') - if config.seed is not None: - torch.manual_seed(config.seed) - - ext = 'cu' if 'cuda' in config.device else 'cpp' - puffer_cuda = load( - name='puffer_cuda', - sources=[f'pufferlib.{ext}'], - verbose=True - ) - compute_gae = puffer_cuda.compute_gae - compute_vtrace = puffer_cuda.compute_vtrace - compute_puff_advantage = puffer_cuda.compute_puff_advantage - - losses = pufferlib.namespace( - policy_loss=0, - value_loss=0, - entropy=0, - old_approx_kl=0, - approx_kl=0, - clipfrac=0, - explained_variance=0, - diayn_loss=0, - grad_var=0, - importance=0, - ) +import rich +import 
rich.traceback +from rich.table import Table +from rich.console import Console +from rich_argparse import RichHelpFormatter +rich.traceback.install(show_locals=False) + +class CleanPuffeRL: + def __init__(self, config, vecenv, policy, neptune=False, wandb=False): + # Backend perf optimization + torch.set_float32_matmul_precision('high') + torch.backends.cudnn.deterministic = config['torch_deterministic'] + torch.backends.cudnn.benchmark = True + + # Reproducibility + seed = config['seed'] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + # Vecenv info + vecenv.async_reset(seed) + obs_space = vecenv.single_observation_space + atn_space = vecenv.single_action_space + total_agents = vecenv.num_agents + self.total_agents = total_agents + + # Experience + if config['batch_size'] == 'auto' and config['bptt_horizon'] == 'auto': + raise pufferlib.APIUsageError('Must specify batch_size or bptt_horizon') + elif config['batch_size'] == 'auto': + config['batch_size'] = total_agents * config['bptt_horizon'] + elif config['bptt_horizon'] == 'auto': + config['bptt_horizon'] = config['batch_size'] // total_agents + + batch_size = config['batch_size'] + horizon = config['bptt_horizon'] + segments = batch_size // horizon + self.segments = segments + if total_agents > segments: + raise pufferlib.APIUsageError( + f'Total agents {total_agents} <= segments {segments}' + ) - utilization = Utilization() - msg = f'Model Size: {abbreviate(count_params(policy))} parameters' - - vecenv.async_reset(config.seed) - total_agents = vecenv.num_agents - obs_shape = vecenv.single_observation_space.shape - atn_shape = vecenv.single_action_space.shape - obs_dtype = pufferlib.pytorch.numpy_to_torch_dtype_dict[vecenv.single_observation_space.dtype] - atn_dtype = pufferlib.pytorch.numpy_to_torch_dtype_dict[vecenv.single_action_space.dtype] - on_policy_rows = config.batch_size // config.bptt_horizon - off_policy_rows = int(config.replay_factor*config.batch_size // 
config.bptt_horizon) - experience_rows = on_policy_rows + off_policy_rows - pin = config.device == 'cuda' and config.cpu_offload - obs_device = config.device if not pin else 'cpu' - experience = pufferlib.namespace( - obs=torch.zeros(experience_rows, config.bptt_horizon, *obs_shape, - dtype=obs_dtype, pin_memory=pin, device='cpu' if pin else config.device), - actions=torch.zeros(experience_rows, config.bptt_horizon, *atn_shape, - dtype=atn_dtype, device=config.device), - logprobs=torch.zeros(experience_rows, config.bptt_horizon, device=config.device), - rewards=torch.zeros(experience_rows, config.bptt_horizon, device=config.device), - dones=torch.zeros(experience_rows, config.bptt_horizon, device=config.device), - truncateds=torch.zeros(experience_rows, config.bptt_horizon, device=config.device), - ratio = torch.ones(experience_rows, config.bptt_horizon, device=config.device), - ) - ep_uses = torch.zeros(experience_rows, device=config.device, dtype=torch.int32) - ep_lengths = torch.zeros(total_agents, device=config.device, dtype=torch.int32) - ep_indices = torch.arange(total_agents, device=config.device, dtype=torch.int32) - free_idx = total_agents - assert free_idx <= experience_rows, f'Total agents {total_agents} must be at least batch size {config.batch_size} / bptt_horizon {config.bptt_horizon} = {experience_rows}' - - diayn_skills = None - if config.use_diayn: - diayn_skills = torch.randint( - 0, config.diayn_archive, (total_agents,), dtype=torch.long, device=config.device) - experience.diayn_batch = torch.zeros(experience_rows, config.bptt_horizon, - dtype=torch.long, device=config.device) - - if config.use_p3o: - batch_size = config.batch_size - p3o_horizon = config.p3o_horizon - device = config.device - experience.values_mean=torch.zeros(batch_size, p3o_horizon, device=device) - experience.values_std=torch.zeros(batch_size, p3o_horizon, device=device) - experience.reward_block = torch.zeros(batch_size, p3o_horizon, dtype=torch.float32, device=device) - 
experience.mask_block = torch.ones(batch_size, p3o_horizon, dtype=torch.float32, device=device) - experience.buf = torch.zeros(batch_size, p3o_horizon, dtype=torch.float32, device=device) - experience.advantages = torch.zeros(batch_size, dtype=torch.float32, device=device) - experience.bounds = torch.zeros(batch_size, dtype=torch.int32, device=device) - experience.vstd_max = 1.0 - else: - experience.values = torch.zeros(experience_rows, config.bptt_horizon, device=config.device) - - if config.use_vtrace or config.use_puff_advantage: - experience.importance = torch.ones(experience_rows, config.bptt_horizon, device=config.device) - - lstm_h = None - lstm_c = None - # TODO: This breaks compile - if isinstance(policy, torch.nn.LSTM): - assert total_agents > 0 - if config.env_batch_size > 1: - shape = (total_agents, policy.hidden_size) - lstm_h = torch.zeros(shape).to(config.device) - lstm_c = torch.zeros(shape).to(config.device) - else: - # TODO: Doesn't exist in native envs + device = config['device'] + self.observations = torch.zeros(segments, horizon, *obs_space.shape, + dtype=pufferlib.pytorch.numpy_to_torch_dtype_dict[obs_space.dtype], + pin_memory=device == 'cuda' and config['cpu_offload'], + device='cpu' if config['cpu_offload'] else device) + self.actions = torch.zeros(segments, horizon, *atn_space.shape, device=device, + dtype=pufferlib.pytorch.numpy_to_torch_dtype_dict[atn_space.dtype]) + self.values = torch.zeros(segments, horizon, device=device) + self.logprobs = torch.zeros(segments, horizon, device=device) + self.rewards = torch.zeros(segments, horizon, device=device) + self.terminals = torch.zeros(segments, horizon, device=device) + self.truncations = torch.zeros(segments, horizon, device=device) + self.ratio = torch.ones(segments, horizon, device=device) + self.importance = torch.ones(segments, horizon, device=device) + self.ep_lengths = torch.zeros(total_agents, device=device, dtype=torch.int32) + self.ep_indices = torch.arange(total_agents, 
device=device, dtype=torch.int32) + self.free_idx = total_agents + + # LSTM + if config['use_rnn']: n = vecenv.agents_per_batch - shape = (n, policy.hidden_size) - lstm_h = {slice(i*n, (i+1)*n):torch.zeros(shape).to(config.device) for i in range(total_agents//n)} - lstm_c = {slice(i*n, (i+1)*n):torch.zeros(shape).to(config.device) for i in range(total_agents//n)} - - minibatch_size = min(config.minibatch_size, config.max_minibatch_size) - uncompiled_policy = policy - if config.compile: - policy = torch.compile(policy, mode=config.compile_mode, fullgraph=config.compile_fullgraph) - - if config.optimizer == 'adam': - optimizer = torch.optim.Adam( - policy.parameters(), - lr=config.learning_rate, - betas=(config.adam_beta1, config.adam_beta2), - eps=config.adam_eps, - ) - elif config.optimizer == 'muon': - from heavyball import ForeachMuon - import heavyball.utils - #heavyball.utils.compile_mode = "reduce-overhead" - optimizer = ForeachMuon( - policy.parameters(), - lr=config.learning_rate, - betas=(config.adam_beta1, config.adam_beta2), - eps=config.adam_eps, - - ) - elif config.optimizer == 'kron': - from heavyball import ForeachPSGDKron - import heavyball.utils - #heavyball.utils.compile_mode = "reduce-overhead" - optimizer = ForeachPSGDKron( - policy.parameters(), - lr=config.learning_rate, - precond_lr=config.precond_lr, - beta=config.adam_beta1, - ) - else: - raise ValueError(f'Unknown optimizer: {config.optimizer}') - - epochs = config.total_timesteps // config.batch_size - assert config.scheduler in ('linear', 'cosine') - if config.scheduler == 'linear': - scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.0, total_iters=epochs) - elif config.scheduler == 'cosine': - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) - - amp_context = nullcontext() - scaler = None - if config.precision != 'float32': - amp_context = torch.amp.autocast(device_type='cuda', dtype=getattr(torch, config.precision)) - 
scaler = torch.amp.GradScaler() - - profile = Profile(['eval', 'env', 'eval_forward', 'eval_copy', 'eval_misc', 'train', 'train_forward', - 'learn', 'train_copy', 'train_misc', 'custom'], frequency=5) - - data = pufferlib.namespace( - config=config, - vecenv=vecenv, - policy=policy, - uncompiled_policy=uncompiled_policy, - optimizer=optimizer, - scheduler=scheduler, - scaler=scaler, - experience=experience, - profile=profile, - losses=losses, - wandb=wandb, - neptune=neptune, - global_step=0, - epoch=0, - stats=defaultdict(list), - msg=msg, - last_log_time=0, - utilization=utilization, - use_p3o=config.use_p3o, - p3o_horizon=config.p3o_horizon, - puf=config.puf, - use_diayn=config.use_diayn, - diayn_coef=config.diayn_coef, - # Do we use these? - ptr=0, - step=0, - lstm_h=lstm_h, - lstm_c=lstm_c, - ep_uses=ep_uses, - ep_lengths=ep_lengths, - ep_indices=ep_indices, - free_idx=free_idx, - on_policy_rows=on_policy_rows, - off_policy_rows=off_policy_rows, - experience_rows=experience_rows, - device=config.device, - minibatch_size=minibatch_size, - compute_gae=compute_gae, - compute_vtrace=compute_vtrace, - compute_puff_advantage=compute_puff_advantage, - diayn_skills=diayn_skills, - total_agents=total_agents, - total_epochs=epochs, - start_time=time.time(), - uptime=0, - ) - print_dashboard(data, clear=True) - return data - -def evaluate(data): - profile = data.profile - epoch = data.epoch - profile('eval', epoch) - profile('eval_misc', epoch, nest=True) - config = data.config - experience = data.experience - policy = data.policy - infos = defaultdict(list) - lstm_h = data.lstm_h - lstm_c = data.lstm_c - - data.full_rows = 0 - while data.full_rows < data.on_policy_rows: - profile('env', epoch) - o, r, d, t, info, env_id, mask = data.vecenv.recv() - - profile('eval_misc', epoch) - # Zero-copy indexing for contiguous env_id - if config.env_batch_size == 1: - gpu_env_id = cpu_env_id = slice(env_id[0], env_id[-1] + 1) - else: - cpu_env_id = env_id - gpu_env_id = 
torch.as_tensor(env_id).to(config.device, non_blocking=True) - - done_mask = d + t - data.global_step += mask.sum() - - profile('eval_copy', epoch) - o = torch.as_tensor(o) - o_device = o.to(config.device, non_blocking=True) - r = torch.as_tensor(r).to(config.device, non_blocking=True) - d = torch.as_tensor(d).to(config.device, non_blocking=True) - - h = None - c = None - if lstm_h is not None: - h = lstm_h[gpu_env_id] - c = lstm_c[gpu_env_id] - - profile('eval_forward', epoch) - with torch.no_grad(): - state = pufferlib.namespace( - reward=r, - done=d, - env_id=gpu_env_id, - mask=mask, - lstm_h=h, - lstm_c=c, + h = policy.hidden_size + self.lstm_h = {i*n: torch.zeros(n, h, device=device) for i in range(total_agents//n)} + self.lstm_c = {i*n: torch.zeros(n, h, device=device) for i in range(total_agents//n)} + + # Minibatching & gradient accumulation + minibatch_size = config['minibatch_size'] + max_minibatch_size = config['max_minibatch_size'] + self.minibatch_size = min(minibatch_size, max_minibatch_size) + if minibatch_size > max_minibatch_size and minibatch_size % max_minibatch_size != 0: + raise pufferlib.APIUsageError( + f'minibatch_size {minibatch_size} > max_minibatch_size {max_minibatch_size} must divide evenly') + + self.accumulate_minibatches = max(1, minibatch_size // max_minibatch_size) + self.total_minibatches = int(config['update_epochs'] * batch_size / self.minibatch_size) + self.minibatch_segments = self.minibatch_size // horizon + if self.minibatch_segments * horizon != self.minibatch_size: + raise pufferlib.APIUsageError( + f'minibatch_size {self.minibatch_size} must be divisible by bptt_horizon {horizon}' ) - if data.use_diayn: - state.diayn_z = data.diayn_skills[env_id] + # Torch compile + self.uncompiled_policy = policy + self.policy = policy + if config['compile']: + self.policy = torch.compile(policy, mode=config['compile_mode'], fullgraph=config['compile_fullgraph']) - logits, value = policy(o_device, state) - action, logprob, _ = 
pufferlib.pytorch.sample_logits(logits, is_continuous=policy.is_continuous) - r = torch.clamp(r, -1, 1) - profile('eval_copy', epoch) - with torch.no_grad(): - if lstm_h is not None: - lstm_h[gpu_env_id] = state.lstm_h - lstm_c[gpu_env_id] = state.lstm_c - - o = o if config.cpu_offload else o_device - actions = store(data, state, o, value, action, logprob, r, d, gpu_env_id, mask) - - profile('eval_misc', epoch) - for i in info: - for k, v in pufferlib.utils.unroll_nested_dict(i): - infos[k].append(v) - - profile('env', epoch) - data.vecenv.send(actions) - - profile('eval_misc', epoch) - for k, v in infos.items(): - if '_map' in k: - if data.wandb is not None: - data.stats[f'Media/{k}'] = data.wandb.Image(v[0]) - continue - elif data.neptune is not None: - # TODO: Add neptune image logging - pass - - if isinstance(v, np.ndarray): - v = v.tolist() - try: - iter(v) - except TypeError: - data.stats[k].append(v) - else: - data.stats[k] += v - - data.free_idx = data.total_agents - data.ep_indices = torch.arange(data.total_agents, device=config.device, dtype=torch.int32) - data.ep_lengths.zero_() - data.ep_uses.zero_() - profile.end() - return data.stats, infos - -def train(data): - profile = data.profile - epoch = data.epoch - profile('train', epoch) - config = data.config - experience = data.experience - losses = data.losses - - total_minibatches = int(config.update_epochs*config.batch_size/data.minibatch_size) - accumulate_minibatches = max(1, config.minibatch_size // config.max_minibatch_size) - n_samples = data.minibatch_size // config.bptt_horizon - for mb in range(total_minibatches): - profile('train_misc', epoch, nest=True) - loss = 0 - if config.use_p3o: - # Note: This function gets messed up by computing across - # episode bounds. Because we store experience in a flat buffer, - # bounds can be crossed even after handling dones. This prevent - # our method from scaling to longer horizons. 
TODO: Redo the way - # we store experience to avoid this issue - vstd_min = experience.values_std.min().item() - vstd_max = experience.values_std.max().item() - - data.mask_block.zero_() - data.buf.zero_() - data.reward_block.zero_() - data.bounds.zero_() - - r_mean = experience.rewards.mean().item() - r_std = experience.rewards.std().item() - - # TODO: Rename vstd to r_std - advantages = compute_advantages( - experience.reward_block, experience.mask_block, - experience.values_mean, experience.values_std, - experience.buf, experience.dones, experience.rewards, - experience.bounds, r_std, data.puf, config.p3o_horizon + # Optimizer + if config['optimizer'] == 'adam': + optimizer = torch.optim.Adam( + self.policy.parameters(), + lr=config['learning_rate'], + betas=(config['adam_beta1'], config['adam_beta2']), + eps=config['adam_eps'], + ) + elif config['optimizer'] == 'muon': + from heavyball import ForeachMuon + import heavyball.utils + heavyball.utils.compile_mode = config['compile_mode'] if config['compile'] else None + optimizer = ForeachMuon( + self.policy.parameters(), + lr=config['learning_rate'], + betas=(config['adam_beta1'], config['adam_beta2']), + eps=config['adam_eps'], ) - - horizon = torch.where(experience.values_std[0] > 0.95*r_std)[0] - horizon = horizon[0].item()+1 if len(horizon) else 1 - if horizon < 16: - horizon = 16 - - advantages = advantages.cpu().numpy() - torch.cuda.synchronize() - elif config.use_vtrace: - importance = advantages = torch.zeros(experience.values.shape, device=config.device).to(config.device) - vs = torch.zeros(experience.values.shape, device=config.device) - data.compute_vtrace(experience.values, experience.rewards, experience.dones, - experience.ratio, vs, advantages, config.gamma, config.vtrace_rho_clip, config.vtrace_c_clip) - elif config.use_puff_advantage: - importance = advantages = torch.zeros(experience.values.shape, device=config.device).to(config.device) - vs = torch.zeros(experience.values.shape, 
device=config.device) - data.compute_puff_advantage(experience.values, experience.rewards, experience.dones, - experience.ratio, vs, advantages, config.gamma, config.gae_lambda, config.vtrace_rho_clip, config.vtrace_c_clip) else: - importance = advantages = data.compute_gae(experience.values, experience.rewards, - experience.dones, config.gamma, config.gae_lambda) - - profile('train_copy', epoch) - batch = sample(data, importance, n_samples) - - profile('train_misc', epoch) - state = pufferlib.namespace( - action=batch.actions, - lstm_h=None, - lstm_c=None, - ) + raise ValueError(f'Unknown optimizer: {config["optimizer"]}') + + self.optimizer = optimizer + + # Learning rate scheduler + epochs = config['total_timesteps'] // config['batch_size'] + self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) + self.total_epochs = epochs + + # Automatic mixed precision + precision = config['precision'] + self.amp_context = torch.amp.autocast(device_type='cuda', dtype=getattr(torch, precision)) + if precision not in ('float32', 'bfloat16'): + raise pufferlib.APIUsageError(f'Invalid precision: {precision}: use float32 or bfloat16') + + # Logging + self.neptune = neptune + self.wandb = wandb + if neptune: + self.neptune = init_neptune(args, tag=config['tag']) + self.run_id = self.neptune._sys_id + for k, v in pufferlib.unroll_nested_dict(args): + self.neptune[k].append(v) + elif wandb: + self.wandb = init_wandb(args, tag=config['tag']) + self.run_id = self.wandb.run.id + else: + self.run_id = str(int(random.random() * 1e8)) + + # Initializations + self.config = config + self.vecenv = vecenv + self.epoch = 0 + self.global_step = 0 + self.last_log_step = 0 + self.last_log_time = time.time() + self.start_time = time.time() + self.utilization = Utilization() + self.profile = Profile() + self.stats = defaultdict(list) + self.last_stats = defaultdict(list) + self.losses = {} + + # Dashboard + self.model_size = sum(p.numel() for p in policy.parameters() 
if p.requires_grad) + self.print_dashboard(clear=True) + + @property + def uptime(self): + return time.time() - self.start_time + + @property + def sps(self): + if self.global_step == self.last_log_step: + return 0 + + return (self.global_step - self.last_log_step) / (time.time() - self.last_log_time) + + def evaluate(self): + profile = self.profile + epoch = self.epoch + profile('eval', epoch) + profile('eval_misc', epoch, nest=True) + + config = self.config + device = config['device'] + + self.full_rows = 0 + while self.full_rows < self.segments: + profile('env', epoch) + o, r, d, t, info, env_id, mask = self.vecenv.recv() + + profile('eval_misc', epoch) + # TODO: Port to vecenv + env_id = slice(env_id[0], env_id[-1] + 1) + + # TODO: Handle truncations + done_mask = d + t + self.global_step += mask.sum() + + o = torch.as_tensor(o) + o = o.pin_memory() + profile('eval_copy', epoch) + o_device = o.to(device, non_blocking=True) + profile('eval_misc', epoch) + r = torch.as_tensor(r).to(device, non_blocking=True) + d = torch.as_tensor(d).to(device, non_blocking=True) + + profile('eval_forward', epoch) + with torch.no_grad(), self.amp_context: + state = dict( + reward=r, + done=d, + env_id=env_id, + mask=mask, + ) + + if config['use_rnn']: + state['lstm_h'] = self.lstm_h[env_id.start] + state['lstm_c'] = self.lstm_c[env_id.start] + + logits, value = self.policy(o_device, state) + action, logprob, _ = pufferlib.pytorch.sample_logits(logits) + r = torch.clamp(r, -1, 1) + + profile('eval_copy', epoch) + with torch.no_grad(): + if config['use_rnn']: + self.lstm_h[env_id.start] = state['lstm_h'] + self.lstm_c[env_id.start] = state['lstm_c'] + + # Fast path for fully vectorized envs + l = self.ep_lengths[env_id.start].item() + batch_rows = slice(self.ep_indices[env_id.start].item(), 1+self.ep_indices[env_id.stop - 1].item()) + + if config['cpu_offload']: + self.observations[batch_rows, l] = o + else: + self.observations[batch_rows, l] = o_device + + self.actions[batch_rows, 
l] = action + self.logprobs[batch_rows, l] = logprob + self.rewards[batch_rows, l] = r + self.terminals[batch_rows, l] = d.float() + self.values[batch_rows, l] = value.flatten() + + # TODO: Handle masks!! + #indices = np.where(mask)[0] + #data.ep_lengths[env_id[mask]] += 1 + self.ep_lengths[env_id] += 1 + if l+1 >= config['bptt_horizon']: + num_full = env_id.stop - env_id.start + self.ep_indices[env_id] = self.free_idx + torch.arange(num_full, device=config['device']).int() + self.ep_lengths[env_id] = 0 + self.free_idx += num_full + self.full_rows += num_full + + action = action.squeeze(-1).cpu().numpy() + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + + profile('eval_misc', epoch) + for i in info: + for k, v in pufferlib.unroll_nested_dict(i): + if isinstance(v, np.ndarray): + v = v.tolist() + elif isinstance(v, (list, tuple)): + self.stats[k].extend(v) + else: + self.stats[k].append(v) + + profile('env', epoch) + self.vecenv.send(action) - if config.use_diayn: - state.diayn_z = batch.diayn_z.reshape(-1) + profile('eval_misc', epoch) + self.free_idx = self.total_agents + self.ep_indices = torch.arange(self.total_agents, device=device, dtype=torch.int32) + self.ep_lengths.zero_() + profile.end() + return self.stats + + def train(self): + profile = self.profile + epoch = self.epoch + profile('train', epoch) + losses = defaultdict(float) + config = self.config + device = config['device'] + + b0 = config['prio_beta0'] + a = config['prio_alpha'] + clip_coef = config['clip_coef'] + vf_clip = config['vf_clip_coef'] + anneal_beta = b0 + (1 - b0)*a*self.epoch/self.total_epochs + self.ratio[:] = 1 + + for mb in range(self.total_minibatches): + profile('train_misc', epoch, nest=True) + self.amp_context.__enter__() + + # TODO: Eliminate + shape = self.values.shape + n = (shape[0]//256)*256 + advantages = torch.zeros(shape, device=device) + 
torch.ops.pufferlib.compute_puff_advantage(self.values[:n], self.rewards[:n], + self.terminals[:n], self.ratio[:n], advantages[:n], config['gamma'], + config['gae_lambda'], config['vtrace_rho_clip'], config['vtrace_c_clip']) + + profile('train_copy', epoch) + adv = advantages.abs().sum(axis=1) + prio_weights = torch.nan_to_num(adv**a, 0, 0, 0) + prio_probs = (prio_weights + 1e-6)/(prio_weights.sum() + 1e-6) + idx = torch.multinomial(prio_probs, self.minibatch_segments) + mb_prio = (self.segments*prio_probs[idx, None])**-anneal_beta + mb_obs = self.observations[idx] + mb_actions = self.actions[idx] + mb_logprobs = self.logprobs[idx] + mb_rewards = self.rewards[idx] + mb_terminals = self.terminals[idx] + mb_truncations = self.truncations[idx] + mb_ratio = self.ratio[idx] + mb_values = self.values[idx] + mb_returns = advantages[idx] + mb_values + mb_advantages = advantages[idx] + + profile('train_forward', epoch) + if not config['use_rnn']: + mb_obs = mb_obs.reshape(-1, *self.vecenv.single_observation_space.shape) + + state = dict( + action=mb_actions, + lstm_h=None, + lstm_c=None, + ) - profile('train_forward', epoch) - if not isinstance(data.policy, torch.nn.LSTM): - batch.obs = batch.obs.reshape(-1, *data.vecenv.single_observation_space.shape) + # TODO: Currently only returning traj shaped value as a hack + logits, newvalue = self.policy.forward_train(mb_obs, state) + # TODO: Redundant actions? 
+ actions, newlogprob, entropy = pufferlib.pytorch.sample_logits(logits, action=mb_actions) - # TODO: Currently only returning traj shaped value as a hack - logits, newvalue = data.policy.forward_train(batch.obs, state) - actions, newlogprob, entropy = pufferlib.pytorch.sample_logits(logits, - action=batch.actions, is_continuous=data.policy.is_continuous) + profile('train_misc', epoch) + newlogprob = newlogprob.reshape(mb_logprobs.shape) + logratio = newlogprob - mb_logprobs + ratio = logratio.exp() + self.ratio[idx] = ratio # TODO: Experiment with this - profile('train_misc', epoch) - if config.use_diayn: - N = 1 - batch_logits = state.batch_logits[:, ::N] - batch_logits = torch.nn.functional.log_softmax(batch_logits, dim=-1) - mask = torch.nn.functional.one_hot(batch.actions[:, ::N], batch_logits.shape[-1]).bool() - #batch_logits = mask*batch_logits - batch_logits = batch_logits.view(batch_logits.shape[0], -1) - diayn_policy = data.policy.policy - q = diayn_policy.discrim_forward(batch_logits) - z_idxs = batch.diayn_z[:, 0] - q = q.view(-1, q.shape[-1]) - diayn_loss = torch.nn.functional.cross_entropy(q, z_idxs) - loss += config.diayn_loss_coef*diayn_loss - - newlogprob = newlogprob.reshape(batch.logprobs.shape) - logratio = newlogprob - batch.logprobs - ratio = logratio.exp() - experience.ratio[batch.idx] = ratio - - # TODO: Only do this if we are KL clipping? Saves 1-2% compute - with torch.no_grad(): - # calculate approx_kl http://joschu.net/blog/kl-approx.html - old_approx_kl = (-logratio).mean() - approx_kl = ((ratio - 1) - logratio).mean() - clipfrac = ((ratio - 1.0).abs() > config.clip_coef).float().mean() - - if config.use_vtrace or config.use_puff_advantage: + # TODO: Only do this if we are KL clipping? 
Saves 1-2% compute with torch.no_grad(): - adv = advantages[batch.idx] - vs = vs[batch.idx] - if config.use_vtrace: - data.compute_vtrace(batch.values, batch.rewards, batch.dones, - ratio, vs, adv, config.gamma, config.vtrace_rho_clip, config.vtrace_c_clip) - elif config.use_puff_advantage: - data.compute_puff_advantage(batch.values, batch.rewards, batch.dones, - ratio, vs, adv, config.gamma, config.gae_lambda, config.vtrace_rho_clip, config.vtrace_c_clip) - - #advantages[batch.idx] = adv - #importance[batch.idx] = adv - - adv = batch.advantages - if config.norm_adv: - adv = (adv - adv.mean()) / (adv.std() + 1e-8) - - adv = adv * batch.prio - - # Policy loss - pg_loss1 = -adv * ratio - pg_loss2 = -adv * torch.clamp( - ratio, 1 - config.clip_coef, 1 + config.clip_coef - ) - pg_loss = torch.max(pg_loss1, pg_loss2).mean() - - # Value loss - if config.use_p3o: - newvalue_mean = newvalue.mean.view(-1, config.p3o_horizon) - newvalue_std = newvalue.std.view(-1, config.p3o_horizon) - newvalue_var = torch.square(newvalue_std) - criterion = torch.nn.GaussianNLLLoss(reduction='none') - v_loss = criterion(newvalue_mean, batch.reward_block, newvalue_var) - v_loss = v_loss[:, :(horizon+3)] - mask_block = mask_block[:, :(horizon+3)] - v_loss = v_loss[mask_block.bool()].mean() - elif config.clip_vloss: - ret = batch.returns#.flatten() - newvalue = newvalue.view(ret.shape) - v_loss_unclipped = (newvalue - ret) ** 2 - val = batch.values#.flatten() - v_clipped = val + torch.clamp( - newvalue - val, - -config.vf_clip_coef, - config.vf_clip_coef, - ) - v_loss_clipped = (v_clipped - ret) ** 2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - newvalue = newvalue.flatten() - v_loss = 0.5 * ((newvalue - ret) ** 2).mean() - - entropy_loss = entropy.mean() - loss += pg_loss - config.ent_coef*entropy_loss + v_loss*config.vf_coef - - # This breaks vloss clipping? 
- with torch.no_grad(): - experience.values[batch.idx] = newvalue + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfrac = ((ratio - 1.0).abs() > config['clip_coef']).float().mean() + + # TODO: Do you need to do this? Policy hasn't changed + adv = advantages[idx] + torch.ops.pufferlib.compute_puff_advantage(mb_values, mb_rewards, mb_terminals, + ratio, adv, config['gamma'], config['gae_lambda'], + config['vtrace_rho_clip'], config['vtrace_c_clip']) + adv = mb_advantages + adv = mb_prio * (adv - adv.mean()) / (adv.std() + 1e-8) # TODO: Norm by full batch + + # Losses + pg_loss1 = -adv * ratio + pg_loss2 = -adv * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef) + pg_loss = torch.max(pg_loss1, pg_loss2).mean() + + newvalue = newvalue.view(mb_returns.shape) + v_clipped = mb_values + torch.clamp(newvalue - mb_values, -vf_clip, vf_clip) + v_loss_unclipped = (newvalue - mb_returns) ** 2 + v_loss_clipped = (v_clipped - mb_returns) ** 2 + v_loss = 0.5*torch.max(v_loss_unclipped, v_loss_clipped).mean() + + entropy_loss = entropy.mean() + + loss = pg_loss + config['vf_coef']*v_loss - config['ent_coef']*entropy_loss + self.amp_context.__enter__() # TODO: Debug + + # This breaks vloss clipping? 
+ self.values[idx] = newvalue.detach().float() + + # Logging + profile('train_misc', epoch) + losses['policy_loss'] += pg_loss.item() / self.total_minibatches + losses['value_loss'] += v_loss.item() / self.total_minibatches + losses['entropy'] += entropy_loss.item() / self.total_minibatches + losses['old_approx_kl'] += old_approx_kl.item() / self.total_minibatches + losses['approx_kl'] += approx_kl.item() / self.total_minibatches + losses['clipfrac'] += clipfrac.item() / self.total_minibatches + losses['importance'] += ratio.mean().item() / self.total_minibatches + + # Learn on accumulated minibatches + profile('learn', epoch) + loss.backward() + if (mb + 1) % self.accumulate_minibatches == 0: + torch.nn.utils.clip_grad_norm_(self.policy.parameters(), config['max_grad_norm']) + self.optimizer.step() + self.optimizer.zero_grad() + + # Reprioritize experience + profile('train_misc', epoch) + if config['anneal_lr']: + self.scheduler.step() + + y_pred = self.values.flatten() + y_true = advantages.flatten() + self.values.flatten() + var_y = y_true.var() + explained_var = torch.nan if var_y == 0 else 1 - (y_true - y_pred).var() / var_y + losses['explained_variance'] = explained_var.item() + + profile.end() + logs = None + self.epoch += 1 + done_training = self.global_step >= config['total_timesteps'] + if done_training or self.global_step == 0 or time.time() > self.last_log_time + 0.25: + logs = self.mean_and_log() + self.losses = losses + self.print_dashboard() + self.last_stats = self.stats + self.stats = defaultdict(list) + self.last_log_time = time.time() + self.last_log_step = self.global_step + profile.clear() + + if self.epoch % config['checkpoint_interval'] == 0 or done_training: + self.save_checkpoint() + self.msg = f'Checkpoint saved at update {self.epoch}' - profile('learn', epoch) - if data.scaler is not None: - loss = data.scaler.scale(loss) + return logs - loss.backward() + def mean_and_log(self): + config = self.config + for k in list(self.stats.keys()): + 
v = self.stats[k] + try: + v = np.mean(v) + except: + del self.stats[k] + + self.stats[k] = v + + device = config['device'] + agent_steps = int(dist_sum(self.global_step, device)) + logs = { + 'SPS': dist_sum(self.sps, device), + 'agent_steps': agent_steps, + 'uptime': time.time() - self.start_time, + 'epoch': int(dist_sum(self.epoch, device)), + 'learning_rate': self.optimizer.param_groups[0]["lr"], + **{f'environment/{k}': dist_mean(v, device) for k, v in self.stats.items()}, + **{f'losses/{k}': dist_mean(v, device) for k, v in self.losses.items()}, + **{f'performance/{k}': dist_sum(v['elapsed'], device) for k, v in self.profile}, + } - if data.scaler is not None: - data.scaler.unscale_(data.optimizer) + if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0: + return logs + elif self.wandb: + self.wandb.log(logs) + elif self.neptune: + for k, v in logs.items(): + self.neptune[k].append(v, step=agent_steps) - # TODO: Delete? - with torch.no_grad(): - grads = torch.cat([p.grad.flatten() for p in data.policy.parameters()]) - grad_var = grads.var(0).mean() * config.minibatch_size - data.msg = f'Gradient variance: {grad_var.item():.3f}' + return logs - if (mb + 1) % accumulate_minibatches == 0: - torch.nn.utils.clip_grad_norm_(data.policy.parameters(), config.max_grad_norm) + def close(self): + self.vecenv.close() + self.utilization.stop() + model_path = self.save_checkpoint() + path = os.path.join(self.config['data_dir'], f'{self.run_id}.pt') + shutil.copy(model_path, path) + if self.wandb: + artifact = self.wandb.Artifact(self.run_id, type='model') + artifact.add_file(path) + self.wandb.run.log_artifact(artifact) + self.wandb.finish() + elif self.neptune: + self.neptune['model'].track_files(path) + self.neptune.stop() + + def save_checkpoint(self): + path = os.path.join(self.config['data_dir'], self.run_id) + if not os.path.exists(path): + os.makedirs(path) + + model_name = f'model_{self.epoch:06d}.pt' + model_path = os.path.join(path, 
model_name) + if os.path.exists(model_path): + return model_path + + torch.save(self.uncompiled_policy.state_dict(), model_path) + + state = { + 'optimizer_state_dict': self.optimizer.state_dict(), + 'global_step': self.global_step, + 'agent_step': self.global_step, + 'update': self.epoch, + 'model_name': model_name, + 'run_id': self.run_id, + } + state_path = os.path.join(path, 'trainer_state.pt') + torch.save(state, state_path + '.tmp') + os.rename(state_path + '.tmp', state_path) + return model_path - # TODO: Can remove scaler if only using bf16 - if data.scaler is None: - data.optimizer.step() - else: - data.scaler.step(data.optimizer) - data.scaler.update() + def try_load_checkpoint(self): + config = self.config + path = os.path.join(config['data_dir'], self.run_id) + if not os.path.exists(path): + print('No checkpoints found. Assuming new experiment') + return - data.optimizer.zero_grad() + trainer_path = os.path.join(path, 'trainer_state.pt') + resume_state = torch.load(trainer_path, weights_only=False) + model_path = os.path.join(path, resume_state['model_name']) + self.policy.uncompiled.load_state_dict( + torch.load(model_path, weights_only=True), map_location=config['device']) + self.optimizer.load_state_dict(resume_state['optimizer_state_dict']) + print(f'Loaded checkpoint {resume_state["model_name"]}') + + def print_dashboard(self, clear=False, idx=[0], + c1='[cyan]', c2='[white]', b1='[bright_cyan]', b2='[bright_white]'): + profile = self.profile + config = self.config + console = Console() + dashboard = Table(box=rich.box.ROUNDED, expand=True, + show_header=False, border_style='bright_cyan') + table = Table(box=None, expand=True, show_header=False) + dashboard.add_row(table) + + table.add_column(justify="left", width=30) + table.add_column(justify="center", width=12) + table.add_column(justify="center", width=12) + table.add_column(justify="center", width=13) + table.add_column(justify="right", width=13) + + table.add_row( + f'{b1}PufferLib {b2}2.0.0 
{idx[0]*" "}:blowfish:', + f'{c1}CPU: {b2}{np.mean(self.utilization.cpu_util):.1f}{c2}%', + f'{c1}GPU: {b2}{np.mean(self.utilization.gpu_util):.1f}{c2}%', + f'{c1}DRAM: {b2}{np.mean(self.utilization.cpu_mem):.1f}{c2}%', + f'{c1}VRAM: {b2}{np.mean(self.utilization.gpu_mem):.1f}{c2}%', + ) + idx[0] = (idx[0] - 1) % 10 + + s = Table(box=None, expand=True) + sps = self.sps + remaining = 'A hair past a freckle' + if sps != 0: + remaining = duration((config['total_timesteps'] - self.global_step)/sps, b2, c2) + + s.add_column(f"{c1}Summary", justify='left', vertical='top', width=10) + s.add_column(f"{c1}Value", justify='right', vertical='top', width=14) + s.add_row(f'{c2}Env', f'{b2}{config["env"]}') + s.add_row(f'{c2}Params', abbreviate(self.model_size, b2, c2)) + s.add_row(f'{c2}Steps', abbreviate(self.global_step, b2, c2)) + s.add_row(f'{c2}SPS', abbreviate(sps, b2, c2)) + s.add_row(f'{c2}Epoch', f'{b2}{self.epoch}') + s.add_row(f'{c2}Uptime', duration(self.uptime, b2, c2)) + s.add_row(f'{c2}Remaining', remaining) + + delta = profile.eval['buffer'] + profile.train['buffer'] + p = Table(box=None, expand=True, show_header=False) + p.add_column(f"{c1}Performance", justify="left", width=10) + p.add_column(f"{c1}Time", justify="right", width=8) + p.add_column(f"{c1}%", justify="right", width=4) + p.add_row(*fmt_perf('Evaluate', b1, delta, profile.eval, b2, c2)) + p.add_row(*fmt_perf(' Forward', c2, delta, profile.eval_forward, b2, c2)) + p.add_row(*fmt_perf(' Env', c2, delta, profile.env, b2, c2)) + p.add_row(*fmt_perf(' Copy', c2, delta, profile.eval_copy, b2, c2)) + p.add_row(*fmt_perf(' Misc', c2, delta, profile.eval_misc, b2, c2)) + p.add_row(*fmt_perf('Train', b1, delta, profile.train, b2, c2)) + p.add_row(*fmt_perf(' Forward', c2, delta, profile.train_forward, b2, c2)) + p.add_row(*fmt_perf(' Learn', c2, delta, profile.learn, b2, c2)) + p.add_row(*fmt_perf(' Copy', c2, delta, profile.train_copy, b2, c2)) + p.add_row(*fmt_perf(' Misc', c2, delta, profile.train_misc, 
b2, c2)) + + l = Table(box=None, expand=True, ) + l.add_column(f'{c1}Losses', justify="left", width=16) + l.add_column(f'{c1}Value', justify="right", width=8) + for metric, value in self.losses.items(): + l.add_row(f'{c2}{metric}', f'{b2}{value:.3f}') + + monitor = Table(box=None, expand=True, pad_edge=False) + monitor.add_row(s, p, l) + dashboard.add_row(monitor) + + table = Table(box=None, expand=True, pad_edge=False) + dashboard.add_row(table) + left = Table(box=None, expand=True) + right = Table(box=None, expand=True) + table.add_row(left, right) + left.add_column(f"{c1}User Stats", justify="left", width=20) + left.add_column(f"{c1}Value", justify="right", width=10) + right.add_column(f"{c1}User Stats", justify="left", width=20) + right.add_column(f"{c1}Value", justify="right", width=10) + i = 0 + for metric, value in (self.stats or self.last_stats).items(): + try: # Discard non-numeric values + int(value) + except: + continue - profile('train_misc', epoch) - losses.policy_loss += pg_loss.item() / total_minibatches - losses.value_loss += v_loss.item() / total_minibatches - losses.entropy += entropy_loss.item() / total_minibatches - losses.old_approx_kl += old_approx_kl.item() / total_minibatches - losses.approx_kl += approx_kl.item() / total_minibatches - losses.clipfrac += clipfrac.item() / total_minibatches - losses.grad_var += grad_var.item() / total_minibatches - losses.importance += ratio.mean().item() / total_minibatches - - if data.use_diayn: - losses.diayn_loss += diayn_loss.item() / total_minibatches - - if config.target_kl is not None: - if approx_kl > config.target_kl: + u = left if i % 2 == 0 else right + u.add_row(f'{c2}{metric}', f'{b2}{value:.3f}') + i += 1 + if i == 30: break - # Reprioritize experience - profile('train_misc', epoch) - data.max_uses = data.ep_uses.max().item() - data.mean_uses = data.ep_uses.float().mean().item() - if config.replay_factor > 0: - advantages = torch.zeros(experience.values.shape, 
device=config.device).to(config.device) - vs = torch.zeros(experience.values.shape, device=config.device) - data.compute_puff_advantage(experience.values, experience.rewards, experience.dones, - experience.ratio, vs, advantages, config.gamma, config.gae_lambda, config.vtrace_rho_clip, config.vtrace_c_clip) - - exp = sample(data, advantages, data.off_policy_rows, method='random') - for k, v in experience.items(): - v[data.on_policy_rows:] = exp[k] + if clear: + console.clear() - experience.ratio[:data.on_policy_rows] = 1 + with console.capture() as capture: + console.print(dashboard) - if config.anneal_lr: - data.scheduler.step() + print('\033[0;0H' + capture.get()) - if config.use_p3o: - y_pred = experience.values_mean - y_true = experience.reward_block - else: - y_pred = experience.values.flatten() - - # Probably not updated - y_true = advantages.flatten() + experience.values.flatten() - - var_y = y_true.var() - explained_var = torch.nan if var_y == 0 else 1 - (y_true - y_pred).var() / var_y - #losses.explained_variance = explained_var.item() - - profile.end() - profile.clear() - logs = None - data.epoch += 1 - done_training = data.global_step >= config.total_timesteps - if done_training or data.global_step == 0 or time.time() - data.start_time - data.uptime > 1: - data.uptime = time.time() - data.start_time - logs = mean_and_log(data) - print_dashboard(data) - data.stats = defaultdict(list) - - for k in losses: - losses[k] = 0 - - if data.epoch % config.checkpoint_interval == 0 or done_training: - save_checkpoint(data) - data.msg = f'Checkpoint saved at update {data.epoch}' - - return logs - -def store(data, state, obs, value, action, logprob, reward, done, env_id, mask): - exp = data.experience - - # Fast path for fully vectorized envs - if data.config.env_batch_size == 1: - l = data.ep_lengths[env_id.start].item() - batch_rows = slice(data.ep_indices[env_id.start].item(), 1+data.ep_indices[env_id.stop - 1].item()) - else: - l = data.ep_lengths[env_id] - 
batch_rows = data.ep_indices[env_id] - - exp.obs[batch_rows, l] = obs - exp.actions[batch_rows, l] = action - exp.logprobs[batch_rows, l] = logprob - exp.rewards[batch_rows, l] = reward - exp.dones[batch_rows, l] = done.float() - - if data.use_p3o: - exp.values_mean[batch_rows, l] = value.mean - exp.values_std[batch_rows, l] = value.std - else: - exp.values[batch_rows, l] = value.flatten() - #exp.values[l, batch_rows] = value.flatten() - - if data.use_diayn: - exp.diayn_batch[batch_rows, l] = state.diayn_z - - # TODO: Handle masks!! - #indices = np.where(mask)[0] - #data.ep_lengths[env_id[mask]] += 1 - data.ep_lengths[env_id] += 1 - if data.config.env_batch_size == 1: - if l+1 >= data.config.bptt_horizon: - num_full = env_id.stop - env_id.start - data.ep_indices[env_id] = data.free_idx + torch.arange(num_full, device=data.device).int() - data.ep_lengths[env_id] = 0 - data.free_idx += num_full - data.full_rows += num_full - else: - full = data.ep_lengths[env_id] >= data.config.bptt_horizon - num_full = full.sum() - if num_full > 0: - full_ids = env_id[full] - data.ep_indices[full_ids] = data.free_idx + torch.arange(num_full, device=data.device).int() - data.ep_lengths[full_ids] = 0 - data.free_idx += num_full - data.full_rows += num_full - - data.step += 1 - - return action.cpu().numpy() - -def sample(data, advantages, n, reward_block=None, mask_block=None, method='prio'): - exp = data.experience - if method == 'topk': - _, idx = torch.topk(advantages.abs().sum(axis=1), n) - elif method == 'prio': - adv = advantages.abs().sum(axis=1) - probs = adv**data.config.prio_alpha - probs = (probs + 1e-6)/(probs.sum() + 1e-6) - idx = torch.multinomial(probs, n) - elif method == 'multinomial': - idx = torch.multinomial(advantages.abs().sum(axis=1) + 1e-6, n) - elif method == 'random': - idx = torch.randint(0, advantages.shape[0], (n,), device=data.device) - else: - raise ValueError(f'Unknown sampling method: {method}') - - - data.ep_uses[idx] += 1 - output = {k: v[idx] for k, 
def abbreviate(num, b2, c2):
    """Format a number compactly with rich colour markup.

    Args:
        num: Non-negative int or float to display.
        b2: Rich markup tag applied to the numeric part (e.g. '[bright_white]').
        c2: Rich markup tag applied to the unit suffix (e.g. '[white]').

    Returns:
        A markup string such as '[bright_white]1.5[white]K'. Values below
        1e3 are shown as a whole number with no suffix.
    """
    # The colour tags used to be accepted but ignored, which left these
    # cells uncoloured next to duration()/fmt_perf() output. Small values
    # are rounded so a float (e.g. steps-per-second) cannot overflow the
    # fixed-width dashboard column.
    if num < 1e3:
        return f'{b2}{num:.0f}'
    if num < 1e6:
        return f'{b2}{num/1e3:.1f}{c2}K'
    if num < 1e9:
        return f'{b2}{num/1e6:.1f}{c2}M'
    if num < 1e12:
        return f'{b2}{num/1e9:.1f}{c2}B'
    return f'{b2}{num/1e12:.2f}{c2}T'

def duration(seconds, b2, c2):
    """Format a duration in seconds as 'Xh Ym Zs' with rich colour markup.

    Leading zero units are dropped: hours are shown only when non-zero,
    and minutes only when hours or minutes are non-zero.

    Args:
        seconds: Duration in seconds; truncated to an int.
        b2: Rich markup tag for the digits.
        c2: Rich markup tag for the unit letters.
    """
    seconds = int(seconds)
    h = seconds // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    if h:
        return f"{b2}{h}{c2}h {b2}{m}{c2}m {b2}{s}{c2}s"
    if m:
        return f"{b2}{m}{c2}m {b2}{s}{c2}s"
    return f"{b2}{s}{c2}s"

def fmt_perf(name, color, delta_ref, prof, b2, c2):
    """Build one (label, elapsed, percent) row for the performance table.

    Args:
        name: Row label.
        color: Rich markup tag prefixed to the label.
        delta_ref: Reference time that ``prof['buffer']`` is expressed as a
            percentage of; 0 yields 0%.
        prof: Mapping with at least 'buffer' and 'elapsed' entries.
        b2: Rich markup tag for digits.
        c2: Rich markup tag for units.

    Returns:
        Tuple of three markup strings.
    """
    # Subtracting a small epsilon before truncation makes an exact 100
    # come out as 99, so the value fits the two-character field below.
    percent = 0 if delta_ref == 0 else int(100*prof['buffer']/delta_ref - 1e-5)
    return f'{color}{name}', duration(prof['elapsed'], b2, c2), f'{b2}{percent:2d}{c2}%'

def dist_sum(value, device):
    """Sum a scalar across all torch.distributed ranks.

    Returns ``value`` unchanged when no process group is initialized;
    otherwise all-reduces it with SUM on ``device`` and returns the result
    as a Python scalar.
    """
    if not torch.distributed.is_initialized():
        return value

    tensor = torch.tensor(value, device=device)
    torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    return tensor.item()

def dist_mean(value, device):
    """Average a scalar across all torch.distributed ranks.

    Returns ``value`` unchanged when no process group is initialized;
    otherwise returns the cross-rank sum divided by the world size.
    """
    if not torch.distributed.is_initialized():
        return value

    return dist_sum(value, device) / torch.distributed.get_world_size()
data.stats[k] - try: - v = np.mean(v) - except: - del data.stats[k] - - data.stats[k] = v - - device = data.config.device - - agent_steps = int(dist_sum(data.global_step, device)) - logs = { - #'SPS': dist_sum(data.profile.SPS, device), - 'agent_steps': agent_steps, - 'epoch': int(dist_sum(data.epoch, device)), - 'learning_rate': data.optimizer.param_groups[0]["lr"], - 'max_uses': data.max_uses, - 'mean_uses': data.mean_uses, - **{f'environment/{k}': dist_mean(v, device) for k, v in data.stats.items()}, - **{f'losses/{k}': dist_mean(v, device) for k, v in data.losses.items()}, - #**{f'performance/{k}': dist_sum(v, device) for k, v in data.profile}, - } - - if dist.is_initialized() and dist.get_rank() != 0: - return logs - - if data.wandb is not None: - data.last_log_time = time.time() - data.wandb.log(logs) - elif data.neptune is not None: - data.last_log_time = time.time() - for k, v in logs.items(): - data.neptune[k].append(v, step=agent_steps) - - return logs - -def close(data): - data.vecenv.close() - data.utilization.stop() - config = data.config - if data.wandb is not None: - artifact_name = f"{config.exp_id}_model" - artifact = data.wandb.Artifact(artifact_name, type="model") - model_path = save_checkpoint(data) - artifact.add_file(model_path) - data.wandb.run.log_artifact(artifact) - data.wandb.finish() - elif data.neptune is not None: - data.neptune.stop() - -def save_checkpoint(data): - config = data.config - path = os.path.join(config.data_dir, config.exp_id) - if not os.path.exists(path): - os.makedirs(path) - - model_name = f'model_{data.epoch:06d}.pt' - model_path = os.path.join(path, model_name) - if os.path.exists(model_path): - return model_path - - torch.save(data.uncompiled_policy.state_dict(), model_path) - - state = { - 'optimizer_state_dict': data.optimizer.state_dict(), - 'global_step': data.global_step, - 'agent_step': data.global_step, - 'update': data.epoch, - 'model_name': model_name, - 'exp_id': config.exp_id, - } - state_path = 
os.path.join(path, 'trainer_state.pt') - torch.save(state, state_path + '.tmp') - os.rename(state_path + '.tmp', state_path) - return model_path - -def try_load_checkpoint(data): - config = data.config - path = os.path.join(config.data_dir, config.exp_id) - if not os.path.exists(path): - print('No checkpoints found. Assuming new experiment') - return - - trainer_path = os.path.join(path, 'trainer_state.pt') - resume_state = torch.load(trainer_path, weights_only=False) - model_path = os.path.join(path, resume_state['model_name']) - data.policy.uncompiled.load_state_dict( - torch.load(model_path, weights_only=True), map_location=config.device) - data.optimizer.load_state_dict(resume_state['optimizer_state_dict']) - print(f'Loaded checkpoint {resume_state["model_name"]}') - -def count_params(policy): - return sum(p.numel() for p in policy.parameters() if p.requires_grad) - -def rollout(env_creator, env_kwargs, policy_cls, rnn_cls, agent_creator, agent_kwargs, - backend, render_mode='auto', model_path=None, device='cuda'): - - if render_mode != 'auto': - env_kwargs['render_mode'] = render_mode - - # We are just using Serial vecenv to give a consistent - # single-agent/multi-agent API for evaluation - env = pufferlib.vector.make(env_creator, env_kwargs=env_kwargs, backend=backend) - - agent = agent_creator(env, policy_cls, rnn_cls, agent_kwargs).to(device) - if model_path is not None: - agent.load_state_dict(torch.load(model_path, map_location=device, weights_only=False)) - - ob, info = env.reset() - driver = env.driver_env - os.system('clear') - - state = pufferlib.namespace( - lstm_h=None, - lstm_c=None, - diayn_z=torch.arange(env.num_agents, dtype=torch.long, device=device) % 4 - ) - - num_agents = env.observation_space.shape[0] - if hasattr(agent, 'recurrent'): - shape = (num_agents, agent.hidden_size) - state.lstm_h = torch.zeros(shape).to(device) - state.lstm_c = torch.zeros(shape).to(device) - - frames = [] - tick = 0 - value = [0] - intrinsic = [0] - 
intrinsic_mean = None - intrinsic_std = None - while tick <= 200000: - if tick > 1000 and tick % 1 == 0: - #render = driver.render(overlay=float(intrinsic[0])) - render = driver.render() - if driver.render_mode == 'ansi': - print('\033[0;0H' + render + '\n') - time.sleep(0.05) - elif driver.render_mode == 'rgb_array': - frames.append(render) - import cv2 - render = cv2.cvtColor(render, cv2.COLOR_RGB2BGR) - cv2.imshow('frame', render) - cv2.waitKey(1) - time.sleep(1/24) - elif driver.render_mode in ('human', 'raylib') and render is not None: - frames.append(render) - - with torch.no_grad(): - ob = torch.as_tensor(ob).to(device) - logits, value = agent(ob, state) - action, logprob, _ = pufferlib.pytorch.sample_logits(logits, is_continuous=agent.is_continuous) - action = action.cpu().numpy().reshape(env.action_space.shape) - - ob, reward = env.step(action)[:2] - reward = reward.mean() - if tick % 128 == 0: - print(f'Reward: {reward:.4f}, Tick: {tick}') - tick += 1 - - # Save frames as gif - if frames: - import imageio - os.makedirs('../docker', exist_ok=True) or imageio.mimsave('../docker/eval.gif', frames, fps=15, loop=0) + return dist_sum(value, device) / torch.distributed.get_world_size() class Profile: - def __init__(self, keys, frequency=1): - self.stack = [] + def __init__(self, frequency=5): + self.profiles = defaultdict(lambda: defaultdict(float)) self.frequency = frequency - self.profiles = {k: - pufferlib.namespace( - start = 0, - buffer = 0, - delta = 0, - elapsed = 0, - calls = 0, - ) for k in keys - } + self.stack = [] + + def __iter__(self): + return iter(self.profiles.items()) def __getattr__(self, name): return self.profiles[name] @@ -890,33 +689,29 @@ def __call__(self, name, epoch, nest=False): torch.cuda.synchronize() tick = time.time() - if len(self.stack) != 0 and not nest: self.pop(tick) self.stack.append(name) - self.profiles[name].start = tick + self.profiles[name]['start'] = tick def pop(self, end): profile = self.profiles[self.stack.pop()] - 
delta = end - profile.start - profile.buffer += delta - profile.elapsed += delta - profile.calls += 1 + delta = end - profile['start'] + profile['elapsed'] += delta + profile['delta'] += delta def end(self): torch.cuda.synchronize() end = time.time() - for i in range(len(self.stack)): self.pop(end) def clear(self): - for v in self.profiles.values(): - if v.buffer != 0: - v.delta = v.buffer - - v.buffer = 0 + for prof in self.profiles.values(): + if prof['delta'] > 0: + prof['buffer'] = prof['delta'] + prof['delta'] = 0 class Utilization(Thread): def __init__(self, delay=1, maxlen=20): @@ -925,9 +720,8 @@ def __init__(self, delay=1, maxlen=20): self.cpu_util = deque(maxlen=maxlen) self.gpu_util = deque(maxlen=maxlen) self.gpu_mem = deque(maxlen=maxlen) - - self.delay = delay self.stopped = False + self.delay = delay self.start() def run(self): @@ -942,159 +736,301 @@ def run(self): else: self.gpu_util.append(0) self.gpu_mem.append(0) + time.sleep(self.delay) def stop(self): self.stopped = True -ROUND_OPEN = rich.box.Box( - "╭──╮\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "│ │\n" - "╰──╯\n" -) - -c1 = '[cyan]' -c2 = '[white]' -b1 = '[bright_cyan]' -b2 = '[bright_white]' - -def abbreviate(num): - if num < 1e3: - return f'{b2}{num:.0f}' - elif num < 1e6: - return f'{b2}{num/1e3:.1f}{c2}k' - elif num < 1e9: - return f'{b2}{num/1e6:.1f}{c2}m' - elif num < 1e12: - return f'{b2}{num/1e9:.1f}{c2}b' - else: - return f'{b2}{num/1e12:.1f}{c2}t' - -def duration(seconds): - seconds = int(seconds) - h = seconds // 3600 - m = (seconds % 3600) // 60 - s = seconds % 60 - return f"{b2}{h}{c2}h {b2}{m}{c2}m {b2}{s}{c2}s" if h else f"{b2}{m}{c2}m {b2}{s}{c2}s" if m else f"{b2}{s}{c2}s" - -def fmt_perf(name, color, delta_ref, prof): - percent = 0 if delta_ref == 0 else int(100*prof.delta/delta_ref - 1e-5) - return f'{color}{name}', duration(prof.elapsed), f'{b2}{percent:2d}{c2}%' - -# TODO: Add env name to print_dashboard -def print_dashboard(data, clear=False, 
def init_wandb(args, id=None, resume=True, tag=None):
    """Start a Weights & Biases run and return the ``wandb`` module.

    Args:
        args: Config mapping; 'wandb_project' and 'wandb_group' are read and
            the whole mapping is uploaded as the run config.
        id: Run id to use; a fresh id is generated when falsy.
        resume: Passed through to ``wandb.init``.
        tag: Optional single tag for the run.
    """
    # Imported lazily so wandb is only required when it is actually used.
    import wandb
    wandb.init(
        id=id or wandb.util.generate_id(),
        project=args['wandb_project'],
        group=args['wandb_group'],
        allow_val_change=True,
        save_code=False,
        resume=resume,
        config=args,
        tags=[tag] if tag is not None else [],
    )
    return wandb

def init_neptune(args, id=None, resume=True, tag=None, mode="async"):
    """Start a Neptune run, falling back to offline mode on connection loss.

    Args:
        args: Config mapping; 'neptune_name' and 'neptune_project' are read.
        id: Accepted for signature parity with init_wandb; not used here.
        resume: Accepted for signature parity with init_wandb; not used here.
        tag: Optional single tag for the run.
        mode: Neptune connection mode passed to ``neptune.init_run``.

    Returns:
        The Neptune run object.
    """
    # Imported lazily so neptune is only required when it is actually used.
    import neptune
    import neptune.exceptions
    try:
        neptune_name = args['neptune_name']
        neptune_project = args['neptune_project']
        run = neptune.init_run(
            project=f"{neptune_name}/{neptune_project}",
            capture_hardware_metrics=False,
            capture_stdout=False,
            capture_stderr=False,
            capture_traceback=False,
            tags=[tag] if tag is not None else [],
            mode=mode,
        )
    except neptune.exceptions.NeptuneConnectionLostException:
        print("couldn't connect to neptune, logging in offline mode")
        return init_neptune(args, id, resume, tag, mode="offline")
    return run

# TODO: Do we need this?
def make_policy(env, policy_cls, rnn_cls, args):
    """Instantiate the policy (optionally wrapped by an RNN) on the train device.

    Args:
        env: Environment handed to the policy/RNN constructors.
        policy_cls: Policy class, called as ``policy_cls(env, **args['policy'])``.
        rnn_cls: Optional wrapper class, called as
            ``rnn_cls(env, policy, **args['rnn'])``; skipped when None.
        args: Config mapping with 'policy', 'rnn' and 'train' sections.
    """
    policy = policy_cls(env, **args['policy'])
    if rnn_cls is not None:
        policy = rnn_cls(env, policy, **args['rnn'])

    return policy.to(args['train']['device'])

# TODO: Is there a simpler interp
def downsample_linear(arr, m):
    """Resample a 1-D sequence to ``m`` points by linear interpolation.

    Both the original and the new sample positions are spread evenly over
    [0, 1], so the first and last values are preserved.
    """
    n = len(arr)
    x_old = np.linspace(0, 1, n)  # Original indices normalized
    x_new = np.linspace(0, 1, m)  # New indices normalized
    return np.interp(x_new, x_old, arr)

# TODO: All logs?
def experiment(vecenv, policy, args):
    """Train to ``total_timesteps``, run a final evaluation, and return all logs.

    Args:
        vecenv: Vectorized environment passed to CleanPuffeRL.
        policy: Policy to train.
        args: Config mapping; reads 'train', 'env_name', 'tag', 'neptune'
            and 'wandb'.

    Returns:
        List of the non-None log dicts produced during training plus the
        final ``mean_and_log`` result when it is not None.
    """
    # Read the env name from args instead of a module-level global so this
    # function also works when the module is imported rather than run as a
    # script (the script sets the global to this same value).
    train_config = dict(**args['train'], env=args['env_name'], tag=args['tag'])
    pufferl = CleanPuffeRL(train_config, vecenv, policy,
        neptune=args['neptune'], wandb=args['wandb'])

    all_logs = []
    while pufferl.global_step < train_config['total_timesteps']:
        pufferl.evaluate()
        logs = pufferl.train()
        if logs is not None:
            all_logs.append(logs)

    # Final evaluation: retry up to 10 times until evaluate() returns stats
    vecenv.async_reset(train_config['seed'])
    i = 0
    stats = {}
    while i < 10 and not stats:
        stats = pufferl.evaluate()
        i += 1

    logs = pufferl.mean_and_log()
    if logs is not None:
        all_logs.append(logs)

    pufferl.close()
    return all_logs

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=f':blowfish: PufferLib [bright_cyan]{pufferlib.__version__}[/]'
        ' demo options. Shows valid args for your env and policy',
        formatter_class=RichHelpFormatter, add_help=False)
    parser.add_argument('--env', '--environment', type=str,
        default='puffer_squared', help='Name of specific environment to run')
    parser.add_argument('--mode', type=str, default='train',
        choices='train eval sweep autotune profile'.split())
    parser.add_argument('--load-model-path', type=str, default=None,
        help='Path to a pretrained checkpoint')
    parser.add_argument('--load-id', type=str,
        default=None, help='Kickstart/eval from from a finished Wandb/Neptune run')
    parser.add_argument('--render-mode', type=str, default='auto',
        choices=['auto', 'human', 'ansi', 'rgb_array', 'raylib', 'None'])
    parser.add_argument('--save-frames', type=int, default=0)
    parser.add_argument('--gif-path', type=str, default='eval.gif')
    parser.add_argument('--fps', type=float, default=15)
    parser.add_argument('--max-runs', type=int, default=200, help='Max number of sweep runs')
    parser.add_argument('--wandb', action='store_true', help='Use wandb for logging')
    parser.add_argument('--wandb-project', type=str, default='pufferlib')
    parser.add_argument('--wandb-group', type=str, default='debug')
    parser.add_argument('--neptune', action='store_true', help='Use neptune for logging')
    parser.add_argument('--neptune-name', type=str, default='pufferai')
    parser.add_argument('--neptune-project', type=str, default='ablations')
    parser.add_argument('--local-rank', type=int, default=0, help='Used by torchrun for DDP')
    parser.add_argument('--tag', type=str, default=None, help='Tag for experiment')
    # First pass: only --env is needed here, to locate the matching config
    args = parser.parse_known_args()[0]

    # Load defaults and config
    for path in glob.glob('config/**/*.ini', recursive=True):
        p = configparser.ConfigParser()
        p.read(['config/default.ini', path])
        if args.env in p['base']['env_name'].split():
            break
    else:
        raise pufferlib.APIUsageError('No config for env_name {}'.format(args.env))

    # Dynamic help menu from config
    for section in p.sections():
        for key in p[section]:
            raw = p[section][key]
            try:
                value = ast.literal_eval(raw)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Not a Python literal: keep the raw string from the ini file
                value = raw

            fmt = f'--{key}' if section == 'base' else f'--{section}.{key}'
            parser.add_argument(fmt.replace('_', '-'), default=value)

    parser.add_argument('-h', '--help', default=argparse.SUPPRESS,
        action='help', help='Show this help message and exit')

    # Unpack to nested dict: 'train.device' -> args['train']['device']
    parsed = vars(parser.parse_args())
    env_name = parsed.pop('env')
    args = defaultdict(dict)
    for key, value in parsed.items():
        node = args
        for subkey in key.split('.'):
            parent = node
            node = node.setdefault(subkey, {})

        parent[subkey] = value

    # Dynamically import environment and policy
    import importlib
    package = args['package']
    module_name = 'pufferlib.ocean' if package == 'ocean' else f'pufferlib.environments.{package}'
    env_module = importlib.import_module(module_name)
    make_env = env_module.env_creator(env_name)
    policy_cls = getattr(env_module.torch, args['policy_name'])
    rnn_name = args['rnn_name']
    rnn_cls = None
    if rnn_name is not None:
        rnn_cls = getattr(env_module.torch, rnn_name)

    # Aggressively exit on ctrl+c
    import signal
    signal.signal(signal.SIGINT, lambda sig, frame: os._exit(0))

    # Assume TorchRun DDP is used if LOCAL_RANK is set
    # NOTE(review): rank and world_size are hard-coded, so every process
    # joins as rank 0 of a one-process group -- confirm this is intended
    # before launching with more than one process per node.
    if 'LOCAL_RANK' in os.environ:
        torch.distributed.init_process_group(backend='nccl', rank=0, world_size=1)

    if args['mode'] == 'autotune':
        pufferlib.vector.autotune(make_env, batch_size=args['train']['env_batch_size'])
        raise SystemExit(0)

    args['train']['use_rnn'] = rnn_cls is not None
    env_name = args['env_name']
    device = args['train']['device']

    if args['mode'] == 'sweep':
        if not args['wandb'] and not args['neptune']:
            raise pufferlib.APIUsageError('Sweeps require either wandb or neptune')

        method = args['sweep'].pop('method')
        try:
            sweep_cls = getattr(pufferlib.sweep, method)
        except AttributeError:
            raise pufferlib.APIUsageError(
                f'Invalid sweep method {method}. See pufferlib.sweep') from None

        sweep = sweep_cls(args['sweep'])
        target_key = f'environment/{args["sweep"]["metric"]}'
        total_timesteps = args['train']['total_timesteps']
        for i in range(args['max_runs']):
            # Fresh 32-bit seed per run, derived from the wall clock
            seed = time.time_ns() & 0xFFFFFFFF
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            sweep.suggest(args)

            vecenv = pufferlib.vector.make(make_env, env_kwargs=args['env'], **args['vec'])
            policy = make_policy(vecenv.driver_env, policy_cls, rnn_cls, args)
            all_logs = experiment(vecenv, policy, args)

            # Report 10 evenly spaced (score, cost, timestep) observations
            scores = downsample_linear([log[target_key] for log in all_logs], 10)
            costs = downsample_linear([log['uptime'] for log in all_logs], 10)
            timesteps = downsample_linear([log['agent_steps'] for log in all_logs], 10)

            for score, cost, timestep in zip(scores, costs, timesteps):
                args['train']['total_timesteps'] = timestep
                sweep.observe(args, score, cost)

            # Prevent logging final eval steps as training steps
            args['train']['total_timesteps'] = total_timesteps

        raise SystemExit(0)

    if args['mode'] == 'eval':
        args['vec'] = dict(backend='Serial', num_envs=1)

    vecenv = pufferlib.vector.make(make_env, env_kwargs=args['env'], **args['vec'])
    policy = make_policy(vecenv.driver_env, policy_cls, rnn_cls, args)

    load_id = args['load_id']
    if load_id is not None:
        if args['mode'] not in ('train', 'eval'):
            raise pufferlib.APIUsageError('load_id requires mode to be train or eval')

        if args['neptune']:
            import neptune
            neptune_name = args['neptune_name']
            neptune_project = args['neptune_project']
            run = neptune.init_run(
                project=f"{neptune_name}/{neptune_project}",
                with_id=load_id, mode="read-only")
            data_dir = 'artifacts'
            run["model"].download(destination=data_dir)
        elif args['wandb']:
            run = init_wandb(args, load_id, resume='must')
            artifact = run.use_artifact(f'{load_id}:latest')
            data_dir = artifact.download()
        else:
            raise pufferlib.APIUsageError('No run id provided for eval')

        policy.load_state_dict(torch.load(f'{data_dir}/{load_id}.pt', map_location=device))

    if args['load_model_path'] is not None:
        policy.load_state_dict(torch.load(
            args['load_model_path'], map_location=device))

    if args['mode'] == 'train':
        experiment(vecenv, policy, args)
    elif args['mode'] == 'eval':
        ob, info = vecenv.reset()
        driver = vecenv.driver_env
        num_agents = vecenv.observation_space.shape[0]

        state = {}
        if args['train']['use_rnn']:
            state = dict(
                lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device),
                lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device),
            )

        frames = []
        # Runs until interrupted; the SIGINT handler above exits the process
        while True:
            render = driver.render()
            if len(frames) < args['save_frames']:
                frames.append(render)

            # TODO: Frames from raylib
            if driver.render_mode == 'ansi':
                print('\033[0;0H' + render + '\n')
                time.sleep(1/args['fps'])
            elif driver.render_mode == 'rgb_array':
                import cv2
                render = cv2.cvtColor(render, cv2.COLOR_RGB2BGR)
                cv2.imshow('frame', render)
                cv2.waitKey(1)
                time.sleep(1/args['fps'])

            with torch.no_grad():
                ob = torch.as_tensor(ob).to(device)
                logits, value = policy(ob, state)
                action, logprob, _ = pufferlib.pytorch.sample_logits(logits)
                action = action.cpu().numpy().reshape(vecenv.action_space.shape)

            ob = vecenv.step(action)[0]

            if len(frames) > 0 and len(frames) == args['save_frames']:
                import imageio
                imageio.mimsave(args['gif_path'], frames, fps=args['fps'], loop=0)
                # Sentinel entry pushes len(frames) past save_frames so the
                # gif is written exactly once
                frames.append('Done')
elif args['mode'] == 'profile': + import torch + import torchvision.models as models + from torch.profiler import profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof: + with record_function("model_inference"): + for _ in range(10): + stats = pufferl.evaluate() + pufferl.train() + + print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10)) + prof.export_chrome_trace("trace.json") diff --git a/config/default.ini b/config/default.ini index aa8efdc02c..2816695409 100644 --- a/config/default.ini +++ b/config/default.ini @@ -1,26 +1,33 @@ [base] package = None env_name = None -vec = native policy_name = Policy rnn_name = None max_suggestion_cost = 3600 -[workspace] -name = pufferai -project = ablations +[vec] +backend = Multiprocessing +num_envs = 2 +num_workers = auto +batch_size = auto +zero_copy = True +seed = 42 [env] [policy] [rnn] [train] -seed = 0 +name = pufferai +project = ablations +run_id = None +run_tag = None + +seed = 42 torch_deterministic = True cpu_offload = False device = cuda optimizer = muon -scheduler = cosine anneal_lr = True precision = float32 total_timesteps = 10_000_000 @@ -28,78 +35,47 @@ learning_rate = 0.025 gamma = 0.995 gae_lambda = 0.85 update_epochs = 1 -norm_adv = True # Consider raising clip coef to 0.2 clip_coef = 0.1 -clip_vloss = True vf_coef = 2.0 vf_clip_coef = 0.1 max_grad_norm = 0.5 ent_coef = 0.01 -target_kl = None adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-12 -num_envs = 2 -num_workers = 2 -env_batch_size = 1 -zero_copy = True data_dir = experiments checkpoint_interval = 200 -batch_size = 524288 +batch_size = auto minibatch_size = 8192 -replay_factor = 0.0 + # Accumulate gradients above this size max_minibatch_size = 32768 bptt_horizon = 64 compile = False -compile_mode = reduce-overhead +compile_mode = max-autotune-no-cudagraphs compile_fullgraph = True -use_diayn = False -diayn_archive = 256 -diayn_loss_coef = 
0.000 -diayn_coef = 0.0 - -use_p3o = False -p3o_horizon = 128 -puf = 0.0 - -use_vtrace = False vtrace_rho_clip = 1.0 vtrace_c_clip = 1.0 -use_puff_advantage = True - prio_alpha = 0.6 prio_beta0 = 0.4 [sweep] -method = protein -name = sweep - -[sweep.metric] +method = Protein +metric = score goal = maximize -name = score -min = 0 -max = 1 -[sweep.env.num_envs] +[sweep.vec.num_envs] distribution = uniform_pow2 -min = 64 -max = 4096 -mean = 1024 +min = 1 +max = 8 +mean = 2 scale = auto -#scale = 0.5 - -#[sweep.policy.hidden_size] -#distribution = uniform_pow2 -#min = 32 -#max = 1024 -#mean = 128 -#scale = auto +# TODO: Elim from base [sweep.train.total_timesteps] distribution = log_normal min = 5e7 @@ -107,18 +83,18 @@ max = 1e10 mean = 1e8 scale = time -[sweep.train.batch_size] -distribution = uniform_pow2 -min = 32768 -max = 1048576 -mean = 262144 +[sweep.train.bptt_horizon] +distribution = int_uniform +min = 16 +max = 64 +mean = 64 scale = auto [sweep.train.minibatch_size] distribution = uniform_pow2 -min = 1024 -max = 32768 -mean = 8192 +min = 8192 +max = 131072 +mean = 32768 scale = auto [sweep.train.learning_rate] @@ -141,7 +117,6 @@ min = 0.8 mean = 0.98 max = 0.9999 scale = auto -#scale = 0.5 [sweep.train.gae_lambda] distribution = logit_normal @@ -149,7 +124,6 @@ min = 0.6 mean = 0.95 max = 0.995 scale = auto -#scale = 0.5 [sweep.train.update_epochs] distribution = int_uniform @@ -158,6 +132,20 @@ max = 4 mean = 1 scale = 1.0 +[sweep.train.clip_coef] +distribution = uniform +min = 0.01 +max = 1.0 +mean = 0.1 +scale = auto + +[sweep.train.vf_clip_coef] +distribution = uniform +min = 0.01 +max = 5.0 +mean = 0.1 +scale = auto + [sweep.train.vf_coef] distribution = uniform min = 0.0 @@ -172,20 +160,6 @@ mean = 1.0 max = 5.0 scale = auto -[sweep.train.bptt_horizon] -distribution = uniform_pow2 -min = 4 -max = 128 -mean = 16 -scale = auto - -#[sweep.train.puf] -#distribution = logit_normal -#min = 0.01 -#mean = 0.5 -#max = 0.99 -#scale = auto - 
[sweep.train.adam_beta1] distribution = logit_normal min = 0.5 @@ -201,36 +175,22 @@ max = 0.99999 scale = auto [sweep.train.adam_eps] -distribution = uniform -min = 0.00000000000001 -mean = 0.00000001 -max = 0.001 +distribution = log_normal +min = 1e-14 +mean = 1e-8 +max = 1e-4 scale = auto -#[sweep.train.horizon] -#distribution = uniform_pow2 -#min = 4 -#max = 128 -#mean = 32 -#scale = 0.25 - -#[sweep.train.diayn_archive] -#distribution = uniform_pow2 -#min = 2 -#max = 64 -#mean = 8 -#scale = auto - -#[sweep.train.diayn_loss_coef] -#distribution = uniform -#min = 0.0 -#max = 2.0 -#mean = 1.0 -#scale = auto - -#[sweep.train.diayn_coef] -#distribution = log_normal -#min = 0.0001 -#mean = 0.1 -#max = 0.99 -#scale = auto +[sweep.train.prio_alpha] +distribution = logit_normal +min = 0.1 +mean = 0.6 +max = 0.99 +scale = auto + +[sweep.train.prio_beta0] +distribution = logit_normal +min = 0.1 +mean = 0.4 +max = 0.99 +scale = auto diff --git a/config/metta.ini b/config/metta.ini index df284bf76f..9bfcbd20ce 100644 --- a/config/metta.ini +++ b/config/metta.ini @@ -3,17 +3,17 @@ package = metta env_name = metta policy_name = Policy rnn_name = Recurrent -vec = multiprocessing + +[vec] +num_envs = 128 +num_workers = 16 +batch_size = 64 [env] render_mode = auto -#num_envs = 128 [train] -total_timesteps = 5_000_000_000 -num_envs = 128 -num_workers = 16 -env_batch_size = 64 +total_timesteps = 100_000_000 learning_rate = 0.0013848535655657842 gamma = 0.9959746852829785 gae_lambda = 0.9283720217357007 @@ -45,17 +45,6 @@ adam_eps = 0.000249501214984291 #minibatch_size = 32768 #compile = False -[sweep] -method = protein -name = sweep - -[sweep.metric] -goal = maximize -name = score -min = 0 -max = 10 -scale = auto - #[sweep.train.total_timesteps] #distribution = log_normal #min = 2e7 diff --git a/config/ocean/blastar.ini b/config/ocean/blastar.ini index 227096a0c9..e545f019b8 100644 --- a/config/ocean/blastar.ini +++ b/config/ocean/blastar.ini @@ -14,8 +14,8 @@ gamma = 0.95 
learning_rate = 0.05 minibatch_size = 32768 -[sweep.metric] -name = environment/enemy_crossed_screen +[sweep] +metric = environment/enemy_crossed_screen goal = minimize [sweep.parameters.train.parameters.batch_size] diff --git a/config/ocean/breakout.ini b/config/ocean/breakout.ini index ebc0ac1660..25355a3e44 100644 --- a/config/ocean/breakout.ini +++ b/config/ocean/breakout.ini @@ -3,7 +3,9 @@ package = ocean env_name = puffer_breakout policy_name = Policy rnn_name = Recurrent -vec = multiprocessing + +[vec] +num_envs = 2 [env] num_envs = 4096 @@ -16,23 +18,35 @@ input_size = 128 hidden_size = 128 [train] -total_timesteps = 80_000_000 -learning_rate = 0.05 -minibatch_size = 32768 - -[sweep] -method = protein -name = sweep - -[sweep.metric] -goal = maximize -name = score -min = 0 -max = 864 - -#[sweep.train.total_timesteps] -#distribution = log_normal -#min = 2e7 -#max = 1e8 -#mean = 5e7 -#scale = auto +total_timesteps = 75_000_000 + +# Highly sensitive +adam_beta1 = 0.99 + +adam_beta2 = 0.9999 +adam_eps = 1e-14 +ent_coef = 0.025 +gae_lambda = 0.85 + +# Highly sensitive +gamma = 0.975 + +learning_rate = 0.01 +max_grad_norm = 1.5 +minibatch_size = 16384 + +prio_alpha = 0.0 +# Doesn't matter +prio_beta0 = 1.0 + +# Just can't be low +vf_coef = 1.3 + +# TODO: Try tuning clip coefs + +[sweep.train.total_timesteps] +distribution = log_normal +min = 2e7 +max = 5e8 +mean = 8e7 +scale = auto diff --git a/config/ocean/cartpole.ini b/config/ocean/cartpole.ini index 9a5674c6f0..6ecfb7db00 100644 --- a/config/ocean/cartpole.ini +++ b/config/ocean/cartpole.ini @@ -1,7 +1,6 @@ [base] package = ocean env_name = puffer_cartpole -vec = multiprocessing policy_name = Policy rnn_name = Recurrent @@ -16,13 +15,7 @@ minibatch_size = 32768 [sweep] method = protein -name = sweep - -[sweep.metric] -goal = maximize -name = episode_length -min = 0 -max = 205 +metric = episode_length [sweep.train.total_timesteps] distribution = log_normal diff --git a/config/ocean/enduro.ini 
b/config/ocean/enduro.ini index 4f6455b57a..f67089205f 100644 --- a/config/ocean/enduro.ini +++ b/config/ocean/enduro.ini @@ -15,15 +15,7 @@ minibatch_size = 32768 [sweep] -method = protein -name = sweep -max_score = None - -[sweep.metric] -goal = maximize -name = days_completed -min = 0 -max = None +metric = days_completed [sweep.train.total_timesteps] distribution = log_normal diff --git a/config/ocean/gpudrive.ini b/config/ocean/gpudrive.ini index 0235fca9c7..4391e1c7f1 100644 --- a/config/ocean/gpudrive.ini +++ b/config/ocean/gpudrive.ini @@ -3,41 +3,82 @@ package = ocean env_name = puffer_gpudrive policy_name = GPUDrive rnn_name = Recurrent -vec = native + +[vec] +num_workers = 16 +num_envs = 16 +batch_size = 8 +#backend = Serial [policy] input_size = 64 -hidden_size = 128 +hidden_size = 512 [rnn] -input_size = 64 -hidden_size = 128 +input_size = 512 +hidden_size = 512 [env] -num_envs = 75 -reward_vehicle_collision = 0 -reward_offroad_collision = 0 +num_envs = 72 +reward_vehicle_collision = 0.0 +reward_offroad_collision = 0.0 [train] total_timesteps = 150_000_000 -learning_rate = 0.005 -num_workers = 1 -num_envs = 1 -env_batch_size = 1 +#learning_rate = 0.005 anneal_lr = True +batch_size = 738192 +minibatch_size = 23296 +max_minibatch_size = 23296 +bptt_horizon = 91 +#adam_beta1 = 0.9225899639773112 +#adam_beta2 = 0.9 +#adam_eps = 0.0004030478187254784 +#ent_coef = 0.0020159472963835016 +#gae_lambda = 0.8829440612065992 +#gamma = 0.9872971455373439 +#learning_rate = 0.0003947934701844728 +#max_grad_norm = 0.5296288081133984 +#prio_alpha = 0.99 +#prio_beta0 = 0.48469847315324566 +#update_epochs = 2 +#vf_coef = 3.6777541336880786 +#checkpoint_interval = 1000 +adam_beta1 = 0.9852000972032763 +adam_beta2 = 0.9948751690861872 +adam_eps = 0.000002967099767264975 +clip_coef = 0.3153578071651496 +ent_coef = 0.000369784972524992 +gae_lambda = 0.9385892578563558 +gamma = 0.9864999317644947 +learning_rate = 0.0022659903674495338 +max_grad_norm = 1.942292174080673 
+prio_alpha = 0.9414003089586056 +prio_beta = 0.9429842108374631 +vf_clip_coef = 1.9533056765171148 +vf_coef = 3.2028923035616774 +[sweep.train.total_timesteps] +distribution = log_normal +min = 5e7 +max = 2e8 +mean = 1e8 +scale = time + [sweep.env.reward_vehicle_collision] distribution = uniform min = -1.0 max = -0.25 +max = 0.0 mean = -0.5 scale = auto - + [sweep.env.reward_offroad_collision] distribution = uniform min = -1.0 max = -0.25 +max = 0.0 mean = -0.5 scale = auto diff --git a/config/ocean/grid.ini b/config/ocean/grid.ini index 511ed0428c..1c0bb9544b 100644 --- a/config/ocean/grid.ini +++ b/config/ocean/grid.ini @@ -1,7 +1,6 @@ [base] package = ocean env_name = puffer_grid -vec = multiprocessing policy_name = Policy rnn_name = Recurrent @@ -13,54 +12,32 @@ input_size = 512 hidden_size = 512 [env] -max_size = 31 +max_size = 47 num_envs = 4096 num_maps = 8192 [train] -total_timesteps = 180_000_000 +total_timesteps = 250_000_000 +adam_beta1 = 0.9225899639773112 +adam_beta2 = 0.9 +adam_eps = 0.0004030478187254784 +anneal_lr = true +batch_size = 524288 +ent_coef = 0.0020159472963835016 +gae_lambda = 0.8829440612065992 +gamma = 0.9872971455373439 +learning_rate = 0.0003947934701844728 +max_grad_norm = 0.5296288081133984 +minibatch_size = 4096 +prio_alpha = 0.99 +prio_beta0 = 0.48469847315324566 +#update_epochs = 2 +vf_coef = 3.6777541336880786 checkpoint_interval = 1000 -gamma = 0.9944336976183826 -gae_lambda = 0.9474288929489364 -ent_coef = 0.00001 -learning_rate = 0.005 -minibatch_size = 32768 - -[sweep] -method = protein -name = sweep - -[sweep.metric] -goal = maximize -name = score -min = 0 -max = 1 [sweep.train.total_timesteps] distribution = log_normal min = 5e7 -max = 2e8 +max = 6e8 mean = 1e8 scale = auto - -[sweep.train.e3b_coef] -distribution = logit_normal -min = 0.0001 -max = 0.99 -mean = 0.001 -scale = auto - -[sweep.train.e3b_lambda] -distribution = log_normal -min = 0.01 -max = 10.0 -mean = 0.1 -scale = auto - -[sweep.train.e3b_norm] 
-distribution = log_normal -min = 0.0001 -max = 0.1 -mean = 0.001 -scale = auto - diff --git a/config/ocean/impulse_wars.ini b/config/ocean/impulse_wars.ini index bacbf0b228..9fe5eff614 100644 --- a/config/ocean/impulse_wars.ini +++ b/config/ocean/impulse_wars.ini @@ -41,16 +41,6 @@ compile_mode = reduce-overhead compile_fullgraph = False device = cuda -[sweep] -method = protein -name = sweep - -[sweep.metric] -goal = maximize -name = score -min = 0.0 -max = 1.0 - [sweep.env.num_envs] distribution = uniform_pow2 min = 16 diff --git a/config/ocean/moba.ini b/config/ocean/moba.ini index bb345d6282..cd9e621771 100644 --- a/config/ocean/moba.ini +++ b/config/ocean/moba.ini @@ -18,13 +18,13 @@ num_envs = 8 num_workers = 4 env_batch_size = 4 minibatch_size = 20_480 +max_minibatch_size = 20_480 batch_size = 409_600 bptt_horizon = 80 learning_rate = 0.05 [sweep.metric] -goal = maximize -name = radiant_towers_alive +metric = radiant_towers_alive [sweep.train.total_timesteps] distribution = log_normal diff --git a/config/ocean/nmmo3.ini b/config/ocean/nmmo3.ini index 4ebbb77fcc..f5a33a5ee2 100644 --- a/config/ocean/nmmo3.ini +++ b/config/ocean/nmmo3.ini @@ -1,40 +1,38 @@ [base] package = ocean env_name = puffer_nmmo3 -vec = multiprocessing policy_name = NMMO3 rnn_name = NMMO3LSTM +[vec] +num_workers = 8 +num_envs = 8 +batch_size = 4 + [env] reward_combat_level = 1.0 reward_prof_level = 1.0 reward_item_level = 1.0 reward_market = 0.0 reward_death = -1.0 -num_envs = 4 +num_envs = 1 [train] total_timesteps = 107000000000 checkpoint_interval = 1000 learning_rate = 0.0004573146765703167 -num_envs = 2 -num_workers = 2 -env_batch_size = 1 -update_epochs = 1 gamma = 0.7647543366891623 gae_lambda = 0.996005622445478 ent_coef = 0.01210084358004069 max_grad_norm = 0.6075578331947327 vf_coef = 0.3979089612467003 -# todo: run 500k, 64 horz -bptt_horizon = 32 -batch_size = 262144 +bptt_horizon = 64 +batch_size = 524288 minibatch_size = 32768 -compile = False +max_minibatch_size = 32768 
-[sweep.metric] -goal = maximize -name = min_comb_prof +[sweep] +metric = min_comb_prof [sweep.env.num_envs] distribution = uniform_pow2 diff --git a/config/ocean/pong.ini b/config/ocean/pong.ini index 7ddf13c044..bcacb4f9ef 100644 --- a/config/ocean/pong.ini +++ b/config/ocean/pong.ini @@ -3,25 +3,17 @@ package = ocean env_name = puffer_pong policy_name = Policy rnn_name = Recurrent -vec = multiprocessing + +[vec] +num_envs = 2 [env] num_envs = 4096 [train] -total_timesteps = 80_000_000 +total_timesteps = 500_000_000 learning_rate = 0.05 -minibatch_size = 32768 - -[sweep] -method = protein -name = sweep - -[sweep.metric] -goal = maximize -name = score -min = -21 -max = 21 +batch_size = auto [sweep.train.total_timesteps] distribution = log_normal diff --git a/config/ocean/snake.ini b/config/ocean/snake.ini index 2f6e6194ba..001d0df092 100644 --- a/config/ocean/snake.ini +++ b/config/ocean/snake.ini @@ -22,17 +22,6 @@ total_timesteps = 300_000_000 learning_rate = 0.05 minibatch_size = 32768 -[sweep] -method = protein -name = sweep -max_score = None - -[sweep.metric] -goal = maximize -name = score -min = 0 -max = None - [sweep.train.diayn_archive] distribution = uniform_pow2 min = 2 diff --git a/config/ocean/tower_climb.ini b/config/ocean/tower_climb.ini index fe7f210ce2..01b2059423 100644 --- a/config/ocean/tower_climb.ini +++ b/config/ocean/tower_climb.ini @@ -19,8 +19,7 @@ learning_rate = 0.05 minibatch_size = 32768 [sweep.metric] -goal = maximize -name = environment/levels_completed +metric = environment/levels_completed [sweep.parameters.train.parameters.total_timesteps] distribution = uniform diff --git a/config/ocean/tripletriad.ini b/config/ocean/tripletriad.ini index b0c2392e32..203555ae2a 100644 --- a/config/ocean/tripletriad.ini +++ b/config/ocean/tripletriad.ini @@ -14,12 +14,6 @@ gamma = 0.95 learning_rate = 0.05 minibatch_size = 32768 -[sweep.metric] -goal = maximize -name = score -min = 0 -max = 9.0 - [sweep.train.total_timesteps] distribution = 
log_normal min = 5e7 diff --git a/config/trade_sim.ini b/config/trade_sim.ini index f31f37fc5b..2a7f5b4054 100644 --- a/config/trade_sim.ini +++ b/config/trade_sim.ini @@ -3,29 +3,24 @@ package = trade_sim env_name = trade_sim policy_name = Policy rnn_name = Recurrent -vec = multiprocessing + +[vec] +backend = Multiprocessing +num_envs = 1024 +num_workers = 16 +batch_size = 512 #[env] #num_envs = 128 [train] total_timesteps = 100_000_000 -num_envs = 1024 -num_workers = 16 -env_batch_size = 512 gamma = 0.95 learning_rate = 0.05 minibatch_size = 32768 [sweep] -method = protein -name = sweep - -[sweep.metric] -goal = maximize -name = final_capital -min = 0 -max = 20000 +metric = final_capital [sweep.train.total_timesteps] distribution = log_normal diff --git a/demo.py b/demo.py deleted file mode 100644 index 8275d5e9bc..0000000000 --- a/demo.py +++ /dev/null @@ -1,381 +0,0 @@ -import configparser -import argparse -import shutil -import glob -import uuid -import ast -import os -import random -import time - -import numpy as np -import torch - -import pufferlib -import pufferlib.sweep -import pufferlib.utils -import pufferlib.vector - -from rich_argparse import RichHelpFormatter -from rich.console import Console -from rich.traceback import install -install(show_locals=False) # Rich tracebacks - -import signal # Aggressively exit on ctrl+c -signal.signal(signal.SIGINT, lambda sig, frame: os._exit(0)) - -import clean_pufferl - -def init_wandb(args, name, id=None, resume=True, tag=None): - import wandb - wandb.init( - id=id or wandb.util.generate_id(), - project=args['wandb_project'], - group=args['wandb_group'], - allow_val_change=True, - save_code=False, - resume=resume, - config=args, - name=name, - tags=[tag] if tag is not None else [], - ) - return wandb - -def init_neptune(args, name, id=None, resume=True, tag=None, mode="async"): - import neptune - import neptune.exceptions - try: - workspace = args['workspace'] - run = neptune.init_run( - 
project=f"{workspace['name']}/{workspace['project']}", - capture_hardware_metrics=False, - capture_stdout=False, - capture_stderr=False, - capture_traceback=False, - tags=[tag] if tag is not None else [], - mode=mode, - ) - except neptune.exceptions.NeptuneConnectionLostException: - print("couldn't connect to neptune, logging in offline mode") - return init_neptune(args, name, id, resume, tag, mode="offline") - return run - -def make_policy(env, policy_cls, rnn_cls, args): - policy = policy_cls(env, **args['policy'], - #batch_size=args['train']['batch_size'], - use_p3o=args['train']['use_p3o'], - p3o_horizon=args['train']['p3o_horizon'], - use_diayn=args['train']['use_diayn'], - diayn_skills=args['train']['diayn_archive'], - ) - args['rnn']['input_size'] = policy.hidden_size - args['rnn']['hidden_size'] = policy.hidden_size - if rnn_cls is not None: - policy = rnn_cls(env, policy, **args['rnn']) - - return policy.to(args['train']['device']) - -def sweep(args, env_name, make_env, policy_cls, rnn_cls): - method = args['sweep']['method'] - if method == 'random': - sweep = pufferlib.sweep.Random(args['sweep']) - elif method == 'pareto_genetic': - sweep = pufferlib.sweep.ParetoGenetic(args['sweep']) - elif method == 'protein': - sweep = pufferlib.sweep.Protein( - args['sweep'], - resample_frequency=0, - num_random_samples=50, # Should be number of params - max_suggestion_cost=args['max_suggestion_cost'], - min_score = args['sweep']['metric']['min'], - max_score = args['sweep']['metric']['max'], - ) - elif method == 'carbs': - sweep = pufferlib.sweep.Carbs( - args['sweep'], - resample_frequency=5, - num_random_samples=10, # Should be number of params - max_suggestion_cost=args['max_suggestion_cost'], - ) - else: - raise ValueError(f'Invalid sweep method {method} (random/pareto_genetic/protein)') - - target_metric = args['sweep']['metric']['name'] - for i in range(args['max_runs']): - seed = time.time_ns() & 0xFFFFFFFF - random.seed(seed) - np.random.seed(seed) - 
torch.manual_seed(seed) - - info = sweep.suggest(args) - if args['train']['minibatch_size'] >= args['train']['batch_size']: - sweep.observe(args, 0.0, 0.0) - continue - - scores, costs, timesteps, _, _ = train(args, make_env, policy_cls, rnn_cls, target_metric) - - # Hacky patch to prevent increasing total_timesteps when not swept - total_timesteps = args['train']['total_timesteps'] - for score, cost, timestep in zip(scores, costs, timesteps): - args['train']['total_timesteps'] = timestep - sweep.observe(args, score, cost) - - args['train']['total_timesteps'] = total_timesteps - - print('Score:', score, 'Cost:', cost, 'Timesteps:', timestep) - -def train(args, make_env, policy_cls, rnn_cls, target_metric, min_eval_points=100, - elos={'model_random.pt': 1000}, vecenv=None, wandb=None, neptune=None): - if args['vec'] == 'serial': - vec = pufferlib.vector.Serial - elif args['vec'] == 'multiprocessing': - vec = pufferlib.vector.Multiprocessing - elif args['vec'] == 'ray': - vec = pufferlib.vector.Ray - elif args['vec'] == 'native': - vec = pufferlib.environment.PufferEnv - else: - raise ValueError(f'Invalid --vec (serial/multiprocessing/ray/native).') - - env_name = args['env_name'] - if vecenv is None: - vecenv = pufferlib.vector.make( - make_env, - env_kwargs=args['env'], - num_envs=args['train']['num_envs'], - num_workers=args['train']['num_workers'], - batch_size=args['train']['env_batch_size'], - zero_copy=args['train']['zero_copy'], - overwork=args['vec_overwork'], - seed=args['train']['seed'], - backend=vec, - ) - - policy = make_policy(vecenv.driver_env, policy_cls, rnn_cls, args) - - if args['ddp']: - from torch.nn.parallel import DistributedDataParallel as DDP - orig_policy = policy - policy = DDP(policy, device_ids=[args['rank']]) - # TODO: Test this? isinstance? 
- if hasattr(orig_policy, 'lstm'): - policy.lstm = orig_policy.lstm - - neptune = None - wandb = None - if args['neptune']: - neptune = init_neptune(args, env_name, id=args['exp_id'], tag=args['tag']) - for k, v in pufferlib.utils.unroll_nested_dict(args): - neptune[k].append(v) - elif args['wandb']: - wandb = init_wandb(args, env_name, id=args['exp_id'], tag=args['tag']) - - train_config = pufferlib.namespace(**args['train'], env=env_name, - exp_id=args['exp_id'] or env_name + '-' + str(uuid.uuid4())[:8]) - data = clean_pufferl.create(train_config, vecenv, policy, wandb=wandb, neptune=neptune) - - timesteps = [] - scores = [] - costs = [] - target_key = f'environment/{target_metric}' - - vecenv.async_reset(train_config.seed) - while data.global_step < train_config.total_timesteps: - clean_pufferl.evaluate(data) - logs = clean_pufferl.train(data) - if logs is not None and target_key in logs: - timesteps.append(logs['agent_steps']) - scores.append(logs[target_key]) - #costs.append(data.profile.uptime) - - steps_evaluated = 0 - cost = time.time() - data.start_time - batch_size = args['train']['batch_size'] - while len(data.stats[target_metric]) < min_eval_points: - stats, _ = clean_pufferl.evaluate(data) - steps_evaluated += batch_size - - clean_pufferl.mean_and_log(data) - score = stats[target_metric] - print(f'Evaluated {steps_evaluated} steps. 
Score: {score}') - - scores.append(score) - costs.append(cost) - timesteps.append(data.global_step) - - def downsample_linear(arr, m): - n = len(arr) - x_old = np.linspace(0, 1, n) # Original indices normalized - x_new = np.linspace(0, 1, m) # New indices normalized - return np.interp(x_new, x_old, arr) - - scores = downsample_linear(scores, 10) - costs = downsample_linear(costs, 10) - timesteps = downsample_linear(timesteps, 10) - - if args['neptune']: - neptune['score'].append(score) - neptune['cost'].append(cost) - elif args['wandb']: - wandb.log({'score': score, 'cost': cost}) - - clean_pufferl.close(data) - return scores, costs, timesteps, elos, vecenv - -def train_ddp(rank, world_size, args, make_env, policy_cls, rnn_cls, target_metric): - import torch.distributed as dist - args['rank'] = rank - args['train']['device'] = f'cuda:{rank}' - dist.init_process_group(backend='nccl', rank=rank, world_size=world_size) - train(args, make_env, policy_cls, rnn_cls, target_metric) - dist.destroy_process_group() - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=f':blowfish: PufferLib [bright_cyan]{pufferlib.__version__}[/]' - ' demo options. Shows valid args for your env and policy', - formatter_class=RichHelpFormatter, add_help=False) - parser.add_argument('--env', '--environment', type=str, - default='puffer_squared', help='Name of specific environment to run') - parser.add_argument('--mode', type=str, default='train', - choices='train eval evaluate sweep autotune profile'.split()) - parser.add_argument('--vec-overwork', action='store_true', - help='Allow vectorization to use >1 worker/core. 
Not recommended.') - parser.add_argument('--eval-model-path', type=str, default=None, - help='Path to a pretrained checkpoint') - parser.add_argument('--baseline', action='store_true', - help='Load pretrained model from WandB if available') - parser.add_argument('--ddp', action='store_true', help='Distributed data parallel') - parser.add_argument('--render-mode', type=str, default='auto', - choices=['auto', 'human', 'ansi', 'rgb_array', 'raylib', 'None']) - parser.add_argument('--exp-id', '--exp-name', type=str, - default=None, help='Resume from experiment') - parser.add_argument('--data-path', type=str, default=None, - help='Used for testing hparam algorithms') - parser.add_argument('--track', action='store_true', help='Track on WandB') - parser.add_argument('--max-runs', type=int, default=200, help='Max number of sweep runs') - parser.add_argument('--wandb-project', type=str, default='pufferlib') - parser.add_argument('--wandb-group', type=str, default='debug') - parser.add_argument('--tag', type=str, default=None, help='Tag for experiment') - parser.add_argument('--wandb', action='store_true', help='Track on WandB') - parser.add_argument('--neptune', action='store_true', help='Track on Neptune') - #parser.add_argument('--wandb-project', type=str, default='pufferlib') - #parser.add_argument('--wandb-group', type=str, default='debug') - args = parser.parse_known_args()[0] - - file_paths = glob.glob('config/**/*.ini', recursive=True) - for path in file_paths: - p = configparser.ConfigParser() - p.read('config/default.ini') - - subconfig = os.path.join(*path.split('/')[:-1] + ['default.ini']) - if subconfig in file_paths: - p.read(subconfig) - - p.read(path) - if args.env in p['base']['env_name'].split(): - break - else: - raise Exception('No config for env_name {}'.format(args.env)) - - for section in p.sections(): - for key in p[section]: - if section == 'base': - argparse_key = f'--{key}'.replace('_', '-') - else: - argparse_key = 
f'--{section}.{key}'.replace('_', '-') - parser.add_argument(argparse_key, default=p[section][key]) - - # Late add help so you get a dynamic menu based on the env - parser.add_argument('-h', '--help', default=argparse.SUPPRESS, - action='help', help='Show this help message and exit') - - parsed = parser.parse_args().__dict__ - args = {'env': {}, 'policy': {}, 'rnn': {}} - env_name = parsed.pop('env') - for key, value in parsed.items(): - next = args - for subkey in key.split('.'): - if subkey not in next: - next[subkey] = {} - prev = next - next = next[subkey] - try: - prev[subkey] = ast.literal_eval(value) - except: - prev[subkey] = value - - package = args['package'] - module_name = f'pufferlib.environments.{package}' - if package == 'ocean': - module_name = 'pufferlib.ocean' - - import importlib - env_module = importlib.import_module(module_name) - - make_env = env_module.env_creator(env_name) - policy_cls = getattr(env_module.torch, args['policy_name']) - - rnn_name = args['rnn_name'] - rnn_cls = None - if rnn_name is not None: - rnn_cls = getattr(env_module.torch, args['rnn_name']) - - if args['baseline']: - assert args['mode'] in ('train', 'eval', 'evaluate') - args['track'] = True - version = '.'.join(pufferlib.__version__.split('.')[:2]) - args['exp_id'] = f'puf-{version}-{env_name}' - args['wandb_group'] = f'puf-{version}-baseline' - shutil.rmtree(f'experiments/{args["exp_id"]}', ignore_errors=True) - run = init_wandb(args, args['exp_id'], resume=False) - if args['mode'] in ('eval', 'evaluate'): - model_name = f'puf-{version}-{env_name}_model:latest' - artifact = run.use_artifact(model_name) - data_dir = artifact.download() - model_file = max(os.listdir(data_dir)) - args['eval_model_path'] = os.path.join(data_dir, model_file) - if args['mode'] == 'train' and args['ddp']: - import torch.multiprocessing as mp - world_size = 1 - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "29500" - target_metric = args['sweep']['metric']['name'] - 
mp.spawn(train_ddp, - args=(world_size, args, make_env, policy_cls, rnn_cls, target_metric), - nprocs=world_size, - join=True, - ) - elif args['mode'] == 'train': - target_metric = args['sweep']['metric']['name'] - train(args, make_env, policy_cls, rnn_cls, target_metric) - elif args['mode'] in ('eval', 'evaluate'): - vec = pufferlib.vector.Serial - if args['vec'] == 'native': vec = pufferlib.environment.PufferEnv - clean_pufferl.rollout( - make_env, - args['env'], - policy_cls=policy_cls, - rnn_cls=rnn_cls, - agent_creator=make_policy, - agent_kwargs=args, - backend=vec, - model_path=args['eval_model_path'], - render_mode=args['render_mode'], - device=args['train']['device'], - ) - elif args['mode'] == 'sweep': - assert args['wandb'] or args['neptune'], 'Sweeps require either wandb or neptune' - sweep(args, env_name, make_env, policy_cls, rnn_cls) - elif args['mode'] == 'autotune': - pufferlib.vector.autotune(make_env, batch_size=args['train']['env_batch_size']) - elif args['mode'] == 'profile': - import cProfile - target_metric = args['sweep']['metric']['name'] - cProfile.run('train(args, make_env, policy_cls, rnn_cls, target_metric)', 'stats.profile') - import pstats - from pstats import SortKey - p = pstats.Stats('stats.profile') - p.sort_stats(SortKey.TIME).print_stats(10) - breakpoint() - pass diff --git a/pufferlib.cpp b/pufferlib.cpp index 6c8aab7fe5..4260234779 100644 --- a/pufferlib.cpp +++ b/pufferlib.cpp @@ -1,80 +1,97 @@ -#include "shared.cpp" +#include +#include +#include +#include +#include -// [num_steps, horizon] -void gae(float* values, float* rewards, float* dones, float* advantages, - float gamma, float gae_lambda, int num_steps, int horizon){ - for (int offset = 0; offset < num_steps*horizon; offset+=horizon) { - gae_row(values + offset, rewards + offset, dones + offset, - advantages + offset, gamma, gae_lambda, horizon); - } +extern "C" { + /* Creates a dummy empty _C module that can be imported from Python. 
+ The import from Python will load the .so consisting of this file + in this extension, so that the TORCH_LIBRARY static initializers + below are run. */ + PyObject* PyInit__C(void) + { + static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_C", /* name of module */ + NULL, /* module documentation, may be NULL */ + -1, /* size of per-interpreter state of the module, + or -1 if the module keeps state in global variables. */ + NULL, /* methods */ + }; + return PyModule_Create(&module_def); + } } -torch::Tensor compute_gae(torch::Tensor values, torch::Tensor rewards, - torch::Tensor dones, float gamma, float gae_lambda) { - int num_steps = values.size(0); - int horizon = values.size(1); - torch::Tensor advantages = gae_check(values, rewards, dones, num_steps, horizon); - gae(values.data_ptr(), rewards.data_ptr(), - dones.data_ptr(), advantages.data_ptr(), - gamma, gae_lambda, num_steps, horizon - ); - return advantages; +namespace pufferlib { + +static const int max_horizon = 256; +void puff_advantage_row(float* values, float* rewards, float* dones, + float* importance, float* advantages, float gamma, float lambda, + float rho_clip, float c_clip, int horizon) { + float lastpufferlam = 0; + for (int t = horizon-2; t >= 0; t--) { + int t_next = t + 1; + float nextnonterminal = 1.0 - dones[t_next]; + float rho_t = fminf(importance[t], rho_clip); + float c_t = fminf(importance[t], c_clip); + float delta = rho_t*(rewards[t_next] + gamma*values[t_next]*nextnonterminal - values[t]); + lastpufferlam = delta + gamma*lambda*c_t*lastpufferlam*nextnonterminal; + advantages[t] = lastpufferlam; + } } -// [num_steps, horizon] -void vtrace(float* values, float* rewards, float* dones, float* importance, - float* vs, float* advantages, float gamma, float rho_clip, float c_clip, - int num_steps, const int horizon){ - for (int offset = 0; offset < num_steps*horizon; offset+=horizon) { - vtrace_row(values + offset, rewards + offset, - dones + offset, importance + offset, - 
vs + offset, advantages + offset, - gamma, rho_clip, c_clip, horizon - ); +void vtrace_check(torch::Tensor values, torch::Tensor rewards, + torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages, + int num_steps, int horizon) { + + // Validate input tensors + torch::Device device = values.device(); + for (const torch::Tensor& t : {values, rewards, dones, importance, advantages}) { + TORCH_CHECK(t.dim() == 2, "Tensor must be 2D"); + TORCH_CHECK(t.device() == device, "All tensors must be on same device"); + TORCH_CHECK(t.size(0) == num_steps, "First dimension must match num_steps"); + TORCH_CHECK(t.size(1) == horizon, "Second dimension must match horizon"); + TORCH_CHECK(t.dtype() == torch::kFloat32, "All tensors must be float32"); + assert(horizon <= max_horizon); + if (!t.is_contiguous()) { + t.contiguous(); + } } } + // [num_steps, horizon] void puff_advantage(float* values, float* rewards, float* dones, float* importance, - float* vs, float* advantages, float gamma, float lambda, float rho_clip, float c_clip, + float* advantages, float gamma, float lambda, float rho_clip, float c_clip, int num_steps, const int horizon){ for (int offset = 0; offset < num_steps*horizon; offset+=horizon) { puff_advantage_row(values + offset, rewards + offset, - dones + offset, importance + offset, - vs + offset, advantages + offset, + dones + offset, importance + offset, advantages + offset, gamma, lambda, rho_clip, c_clip, horizon ); } } -void compute_vtrace(torch::Tensor values, torch::Tensor rewards, - torch::Tensor dones, torch::Tensor importance, torch::Tensor vs, torch::Tensor advantages, - float gamma, float rho_clip, float c_clip) { - int num_steps = values.size(0); - int horizon = values.size(1); - vtrace_check(values, rewards, dones, importance, vs, advantages, num_steps, horizon); - vtrace(values.data_ptr(), rewards.data_ptr(), - dones.data_ptr(), importance.data_ptr(), - vs.data_ptr(), advantages.data_ptr(), - gamma, rho_clip, c_clip, num_steps, horizon 
- ); -} -void compute_puff_advantage(torch::Tensor values, torch::Tensor rewards, - torch::Tensor dones, torch::Tensor importance, torch::Tensor vs, torch::Tensor advantages, - float gamma, float lambda, float rho_clip, float c_clip) { +void compute_puff_advantage_cpu(torch::Tensor values, torch::Tensor rewards, + torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages, + double gamma, double lambda, double rho_clip, double c_clip) { int num_steps = values.size(0); int horizon = values.size(1); - vtrace_check(values, rewards, dones, importance, vs, advantages, num_steps, horizon); + vtrace_check(values, rewards, dones, importance, advantages, num_steps, horizon); puff_advantage(values.data_ptr(), rewards.data_ptr(), - dones.data_ptr(), importance.data_ptr(), - vs.data_ptr(), advantages.data_ptr(), + dones.data_ptr(), importance.data_ptr(), advantages.data_ptr(), gamma, lambda, rho_clip, c_clip, num_steps, horizon ); } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("compute_gae", &compute_gae, "Compute GAE with C"); - m.def("compute_vtrace", &compute_vtrace, "Compute VTrace with C"); - m.def("compute_puff_advantage", &compute_puff_advantage, "Compute PuffAdvantage with C"); +TORCH_LIBRARY(pufferlib, m) { + m.def("compute_puff_advantage(Tensor(a!) values, Tensor(b!) rewards, Tensor(c!) dones, Tensor(d!) importance, Tensor(e!) 
advantages, float gamma, float lambda, float rho_clip, float c_clip) -> ()"); + } + +TORCH_LIBRARY_IMPL(pufferlib, CPU, m) { + m.impl("compute_puff_advantage", &compute_puff_advantage_cpu); +} + } diff --git a/pufferlib.cu b/pufferlib.cu deleted file mode 100644 index 6cf490496c..0000000000 --- a/pufferlib.cu +++ /dev/null @@ -1,284 +0,0 @@ -#include "shared.cpp" - -__global__ void p3o_kernel( - float* reward_block, // [num_steps, horizon] - float* reward_mask, // [num_steps, horizon] - float* values_mean, // [num_steps, horizon] - float* values_std, // [num_steps, horizon] - float* buf, // [num_steps, horizon] - float* dones, // [num_steps] - float* rewards, // [num_steps] - float* advantages, // [num_steps] - int* bounds, // [num_steps] - int num_steps, - float r_std, - float puf, - int horizon -) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_steps) return; - - int k = 0; - for (int j = 0; j < horizon-1; j++) { - int t = i + j; - if (t >= num_steps - 1) { - break; - } - if (dones[t+1]) { - k++; - break; - } - k++; - } - - float gamma_max = 0.0f; - float n = 0.0f; - for (int j = k-1; j >= 0; j--) { - int idx = i * horizon + j; - n++; - - float vstd = values_std[idx]; - if (vstd == 0.0f) { - buf[idx] = 0.0f; - continue; - } - - float gamma = 1.0f / (vstd*vstd); - if (r_std != 0.0f) { - gamma -= puf/(r_std*r_std); - } - - if (gamma < 0.0f) { - gamma = 0.0f; - } - - if (gamma > gamma_max) { - gamma_max = gamma; - } - buf[idx] = gamma; - reward_mask[idx] = 1.0f; - } - - //float bootstrap = 0.0f; - //if (k == horizon-1) { - // bootstrap = buf[i*horizon + horizon - 1]*values_mean[i*horizon + horizon - 1]; - //} - - float R = 0.0f; - for (int j = 0; j <= k-1; j++) { - int t = i + j; - int idx = i * horizon + j; - float r = rewards[t+1]; - - float gamma = buf[idx]; - if (gamma_max > 0) { - gamma /= gamma_max; - } - - if (j >= 16 && values_std[idx] > 0.95*r_std) { - break; - } - - R += gamma * (r - values_mean[idx]); - reward_block[idx] = r; - buf[idx] 
= gamma; - } - - advantages[i] = R; - bounds[i] = k; -} - - -void compute_p3o(torch::Tensor reward_block, torch::Tensor reward_mask, - torch::Tensor values_mean, torch::Tensor values_std, torch::Tensor buf, - torch::Tensor dones, torch::Tensor rewards, torch::Tensor advantages, - torch::Tensor bounds, int num_steps, float vstd_max, float puf, - int horizon) { - - // TODO: Port from python - /* - assert all(t.is_cuda for t in [reward_block, reward_mask, values_mean, values_std, - buf, dones, rewards, advantages, bounds]), "All tensors must be on GPU" - - # Ensure contiguous memory - tensors = [reward_block, reward_mask, values_mean, values_std, buf, dones, rewards, advantages, bounds] - for t in tensors: - t.contiguous() - assert t.is_cuda - - num_steps = rewards.shape[0] - - # Precompute vstd_min and vstd_max - #vstd_max = values_std.max().item() - #vstd_min = values_std.min().item() - - # Launch kernel - threads_per_block = 256 - assert num_steps % threads_per_block == 0 - blocks = (num_steps + threads_per_block - 1) // threads_per_block - */ - - // Launch the kernel - int threads_per_block = 256; - int blocks = (num_steps + threads_per_block - 1) / threads_per_block; - - p3o_kernel<<>>( - reward_block.data_ptr(), - reward_mask.data_ptr(), - values_mean.data_ptr(), - values_std.data_ptr(), - buf.data_ptr(), - dones.data_ptr(), - rewards.data_ptr(), - advantages.data_ptr(), - bounds.data_ptr(), - num_steps, - vstd_max, - puf, - horizon - ); - - // Check for CUDA errors - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } - return; -} - -// [num_steps, horizon] -__global__ void gae_kernel(float* values, float* rewards, float* dones, - float* advantages, float gamma, float gae_lambda, int num_steps, int horizon) { - int row = blockIdx.x*blockDim.x + threadIdx.x; - int offset = row*horizon; - gae_row(values + offset, rewards + offset, dones + offset, - advantages + offset, gamma, gae_lambda, 
horizon); -} - -torch::Tensor compute_gae(torch::Tensor values, torch::Tensor rewards, - torch::Tensor dones, float gamma, float gae_lambda) { - int num_steps = values.size(0); - int horizon = values.size(1); - torch::Tensor advantages = gae_check(values, rewards, dones, num_steps, horizon); - TORCH_CHECK(values.is_cuda(), "All tensors must be on GPU"); - - int threads_per_block = 256; - int blocks = (num_steps + threads_per_block - 1) / threads_per_block; - assert(num_steps % threads_per_block == 0); - - gae_kernel<<>>( - values.data_ptr(), - rewards.data_ptr(), - dones.data_ptr(), - advantages.data_ptr(), - gamma, - gae_lambda, - num_steps, - horizon - ); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } - - return advantages; -} - - // [num_steps, horizon] -__global__ void vtrace_kernel(float* values, float* rewards, float* dones, float* importance, - float* vs, float* advantages, float gamma, float rho_clip, float c_clip, int num_steps, int horizon) { - int row = blockIdx.x*blockDim.x + threadIdx.x; - int offset = row*horizon; - vtrace_row(values + offset, rewards + offset, dones + offset, - importance + offset, vs + offset, advantages + offset, gamma, rho_clip, c_clip, horizon); -} - -void compute_vtrace(torch::Tensor values, torch::Tensor rewards, - torch::Tensor dones, torch::Tensor importance, torch::Tensor vs, torch::Tensor advantages, - float gamma, float rho_clip, float c_clip) { - int num_steps = values.size(0); - int horizon = values.size(1); - vtrace_check(values, rewards, dones, importance, vs, advantages, num_steps, horizon); - TORCH_CHECK(values.is_cuda(), "All tensors must be on GPU"); - assert(horizon <= max_horizon); - - int threads_per_block = 128; - int blocks = (num_steps + threads_per_block - 1) / threads_per_block; - assert(num_steps % threads_per_block == 0); - - vtrace_kernel<<>>( - values.data_ptr(), - rewards.data_ptr(), - dones.data_ptr(), - 
importance.data_ptr(), - vs.data_ptr(), - advantages.data_ptr(), - gamma, - rho_clip, - c_clip, - num_steps, - horizon - ); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } -} - - // [num_steps, horizon] -__global__ void puff_advantage_kernel(float* values, float* rewards, float* dones, float* importance, - float* vs, float* advantages, float gamma, float lambda, - float rho_clip, float c_clip, int num_steps, int horizon) { - int row = blockIdx.x*blockDim.x + threadIdx.x; - int offset = row*horizon; - puff_advantage_row(values + offset, rewards + offset, dones + offset, - importance + offset, vs + offset, advantages + offset, gamma, lambda, rho_clip, c_clip, horizon); -} - -void compute_puff_advantage(torch::Tensor values, torch::Tensor rewards, - torch::Tensor dones, torch::Tensor importance, torch::Tensor vs, torch::Tensor advantages, - float gamma, float lambda, float rho_clip, float c_clip) { - int num_steps = values.size(0); - int horizon = values.size(1); - vtrace_check(values, rewards, dones, importance, vs, advantages, num_steps, horizon); - TORCH_CHECK(values.is_cuda(), "All tensors must be on GPU"); - assert(horizon <= max_horizon); - - int threads_per_block = 256; - if (threads_per_block > num_steps) { - threads_per_block = 2*(num_steps/2); - } - int blocks = (num_steps + threads_per_block - 1) / threads_per_block; - assert(num_steps % threads_per_block == 0); - - puff_advantage_kernel<<>>( - values.data_ptr(), - rewards.data_ptr(), - dones.data_ptr(), - importance.data_ptr(), - vs.data_ptr(), - advantages.data_ptr(), - gamma, - lambda, - rho_clip, - c_clip, - num_steps, - horizon - ); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } -} - - -// Pybind11 module definition -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("compute_p3o", &compute_p3o, "Compute p3o advantages with CUDA"); - 
m.def("compute_gae", &compute_gae, "Compute GAE with CUDA"); - m.def("compute_vtrace", &compute_vtrace, "Compute VTrace with CUDA"); - m.def("compute_puff_advantage", &compute_puff_advantage, "Compute PuffAdvantage with CUDA"); -} diff --git a/pufferlib/__init__.py b/pufferlib/__init__.py index c501c01945..cbdc7a463b 100644 --- a/pufferlib/__init__.py +++ b/pufferlib/__init__.py @@ -1,5 +1,4 @@ -from pufferlib import version -__version__ = version.__version__ +__version__ = '2.0.6' import os import sys @@ -23,6 +22,5 @@ sys.stdout = original_stdout sys.stderr = original_stderr -from pufferlib.namespace import namespace, dataclass, Namespace +from pufferlib.pufferlib import * from pufferlib import environments -from pufferlib.environment import PufferEnv diff --git a/pufferlib/cleanrl.py b/pufferlib/cleanrl.py deleted file mode 100644 index 9292a1d7c4..0000000000 --- a/pufferlib/cleanrl.py +++ /dev/null @@ -1,51 +0,0 @@ -from pdb import set_trace as T - -class Policy(torch.nn.Module): - '''Wrap a non-recurrent PyTorch model for use with CleanRL''' - def __init__(self, policy): - super().__init__() - self.policy = policy - self.is_continuous = hasattr(policy, 'is_continuous') and policy.is_continuous - self.hidden_size = policy.hidden_size - - def get_value(self, x, state=None): - _, value = self.policy(x) - return value - - def get_action_and_value(self, x, action=None): - logits, value, e3b, intrinsic_reward = self.policy(x, e3b=e3b) - action, logprob, entropy = sample_logits(logits, action, self.is_continuous) - return action, logprob, entropy, value, e3b, intrinsic_reward - - def forward(self, x, action=None, e3b=None): - return self.get_action_and_value(x, action, e3b) - - -class RecurrentPolicy(torch.nn.Module): - '''Wrap a recurrent PyTorch model for use with CleanRL''' - def __init__(self, policy): - super().__init__() - self.policy = policy - self.is_continuous = hasattr(policy.policy, 'is_continuous') and policy.policy.is_continuous - self.hidden_size = 
policy.hidden_size - - @property - def lstm(self): - if hasattr(self.policy, 'recurrent'): - return self.policy.recurrent - elif hasattr(self.policy, 'lstm'): - return self.policy.lstm - else: - raise ValueError('Policy must have a subnetwork named lstm or recurrent') - - def get_value(self, x, state=None): - _, value, _ = self.policy(x, state) - - def get_action_and_value(self, x, state=None, action=None, e3b=None): - #logits, value, state, e3b, intrinsic_reward = self.policy(x, state, e3b=e3b) - logits, value_mean, value_logstd, state = self.policy(x, state, e3b=e3b) - action, logprob, entropy = sample_logits(logits, action, self.is_continuous) - return action, logprob, entropy, value_mean, value_logstd, state#, e3b, intrinsic_reward - - def forward(self, x, state=None, action=None, e3b=None): - return self.get_action_and_value(x, state, action, e3b) diff --git a/pufferlib/emulation.py b/pufferlib/emulation.py index a1aba05e98..844104fc68 100644 --- a/pufferlib/emulation.py +++ b/pufferlib/emulation.py @@ -8,11 +8,7 @@ import pufferlib import pufferlib.spaces -from pufferlib import utils, exceptions -from pufferlib.environment import set_buffers from pufferlib.spaces import Discrete, Tuple, Dict -import pufferlib.environment - def emulate(struct, sample): if isinstance(sample, dict): @@ -58,6 +54,7 @@ def nativize(arr, space, struct_dtype): struct = np.asarray(arr).view(struct_dtype)[0] return _nativize(struct, space) +# TODO: Uncomment? ''' try: from pufferlib.extensions import emulate, nativize @@ -65,6 +62,20 @@ def nativize(arr, space, struct_dtype): warnings.warn('PufferLib Cython extensions not installed. 
Using slow Python versions') ''' +def get_dtype_bounds(dtype): + if dtype == bool: + return 0, 1 + elif np.issubdtype(dtype, np.integer): + return np.iinfo(dtype).min, np.iinfo(dtype).max + elif np.issubdtype(dtype, np.unsignedinteger): + return np.iinfo(dtype).min, np.iinfo(dtype).max + elif np.issubdtype(dtype, np.floating): + # Gym fails on float64 + return np.finfo(np.float32).min, np.finfo(np.float32).max + else: + raise ValueError(f"Unsupported dtype: {dtype}") + + def dtype_from_space(space): if isinstance(space, pufferlib.spaces.Tuple): dtype = [] @@ -110,7 +121,7 @@ def emulate_observation_space(space): else: dtype = np.dtype(np.uint8) - mmin, mmax = utils._get_dtype_bounds(dtype) + mmin, mmax = get_dtype_bounds(dtype) numel = emulated_dtype.itemsize // dtype.itemsize emulated_space = gymnasium.spaces.Box(low=mmin, high=mmax, shape=(numel,), dtype=dtype) return emulated_space, emulated_dtype @@ -128,7 +139,7 @@ def emulate_action_space(space): class GymnasiumPufferEnv(gymnasium.Env): - def __init__(self, env=None, env_creator=None, env_args=[], env_kwargs={}, buf=None): + def __init__(self, env=None, env_creator=None, env_args=[], env_kwargs={}, buf=None, seed=0): self.env = make_object(env, env_creator, env_args, env_kwargs) self.initialized = False @@ -147,14 +158,14 @@ def __init__(self, env=None, env_creator=None, env_args=[], env_kwargs={}, buf=N self.is_obs_emulated = self.single_observation_space is not self.env.observation_space self.is_atn_emulated = self.single_action_space is not self.env.action_space - self.emulated = pufferlib.namespace( - observation_dtype = self.observation_space.dtype, - emulated_observation_dtype = self.obs_dtype, + self.emulated = dict( + observation_dtype=self.observation_space.dtype, + emulated_observation_dtype=self.obs_dtype, ) self.render_modes = 'human rgb_array'.split() - set_buffers(self, buf) + pufferlib.set_buffers(self, buf) if isinstance(self.env.observation_space, pufferlib.spaces.Box): self.obs_struct = 
self.observations else: @@ -191,9 +202,9 @@ def reset(self, seed=None): def step(self, action): '''Execute an action and return (observation, reward, done, info)''' if not self.initialized: - raise exceptions.APIUsageError('step() called before reset()') + raise pufferlib.APIUsageError('step() called before reset()') if self.done: - raise exceptions.APIUsageError('step() called after environment is done') + raise pufferlib.APIUsageError('step() called after environment is done') # Unpack actions from multidiscrete into the original action space if self.is_atn_emulated: @@ -249,14 +260,14 @@ def __init__(self, env=None, env_creator=None, env_args=[], buf=None, env_kwargs emulate_action_space(self.env_single_action_space)) self.is_obs_emulated = self.single_observation_space is not self.env_single_observation_space self.is_atn_emulated = self.single_action_space is not self.env_single_action_space - self.emulated = pufferlib.namespace( + self.emulated = dict( observation_dtype = self.single_observation_space.dtype, emulated_observation_dtype = self.obs_dtype, ) self.num_agents = len(self.possible_agents) - set_buffers(self, buf) + pufferlib.set_buffers(self, buf) if isinstance(self.env_single_observation_space, pufferlib.spaces.Box): self.obs_struct = self.observations else: @@ -281,14 +292,14 @@ def done(self): def observation_space(self, agent): '''Returns the observation space for a single agent''' if agent not in self.possible_agents: - raise pufferlib.exceptions.InvalidAgentError(agent, self.possible_agents) + raise pufferlib.InvalidAgentError(agent, self.possible_agents) return self.single_observation_space def action_space(self, agent): '''Returns the action space for a single agent''' if agent not in self.possible_agents: - raise pufferlib.exceptions.InvalidAgentError(agent, self.possible_agents) + raise pufferlib.InvalidAgentError(agent, self.possible_agents) return self.single_action_space @@ -329,13 +340,13 @@ def reset(self, seed=None): def step(self, 
actions): '''Step the environment and return (observations, rewards, dones, infos)''' if not self.initialized: - raise exceptions.APIUsageError('step() called before reset()') + raise pufferlib.APIUsageError('step() called before reset()') if self.done: - raise exceptions.APIUsageError('step() called after environment is done') + raise pufferlib.APIUsageError('step() called after environment is done') if isinstance(actions, np.ndarray): if not self.is_action_checked and len(actions) != self.num_agents: - raise exceptions.APIUsageError( + raise pufferlib.APIUsageError( f'Actions specified as len {len(actions)} but environment has {self.num_agents} agents') actions = {agent: actions[i] for i, agent in enumerate(self.possible_agents)} @@ -344,7 +355,7 @@ def step(self, actions): if not self.is_action_checked: for agent in actions: if agent not in self.possible_agents: - raise exceptions.InvalidAgentError(agent, self.possible_agents) + raise pufferlib.InvalidAgentError(agent, self.possible_agents) self.is_action_checked = check_space( next(iter(actions.values())), @@ -355,7 +366,7 @@ def step(self, actions): unpacked_actions = {} for agent, atn in actions.items(): if agent not in self.possible_agents: - raise exceptions.InvalidAgentError(agent, self.agents) + raise pufferlib.InvalidAgentError(agent, self.agents) if agent not in self.agents: continue @@ -435,11 +446,11 @@ def check_space(data, space): try: contains = space.contains(data) except: - raise exceptions.APIUsageError( + raise pufferlib.APIUsageError( f'Error checking space {space} with sample :\n{data}') if not contains: - raise exceptions.APIUsageError( + raise pufferlib.APIUsageError( f'Data:\n{data}\n not in space:\n{space}') return True @@ -462,9 +473,9 @@ def _seed_and_reset(env, seed): return obs, info -class GymnaxPufferEnv(pufferlib.environment.PufferEnv): +class GymnaxPufferEnv(pufferlib.PufferEnv): def __init__(self, env, env_params, num_envs=1, buf=None): - from gymnax.environments.spaces import 
gymnax_space_to_gym_space + from gymnax.spaces import gymnax_space_to_gym_space gymnax_obs_space = env.observation_space(env_params) self.single_observation_space = gymnax_space_to_gym_space(gymnax_obs_space) diff --git a/pufferlib/environment.py b/pufferlib/environment.py deleted file mode 100644 index bce092fbc6..0000000000 --- a/pufferlib/environment.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np - -from pufferlib.exceptions import APIUsageError -import pufferlib.spaces - -ERROR = ''' -Environment missing required attribute {}. The most common cause is -calling super() before you have assigned the attribute. -''' - -def set_buffers(env, buf=None): - if buf is None: - obs_space = env.single_observation_space - env.observations = np.zeros((env.num_agents, *obs_space.shape), dtype=obs_space.dtype) - env.rewards = np.zeros(env.num_agents, dtype=np.float32) - env.terminals = np.zeros(env.num_agents, dtype=bool) - env.truncations = np.zeros(env.num_agents, dtype=bool) - env.masks = np.ones(env.num_agents, dtype=bool) - - # TODO: Major kerfuffle on inferring action space dtype. This needs some asserts? 
- atn_space = env.single_action_space - if isinstance(env.single_action_space, pufferlib.spaces.Box): - env.actions = np.zeros((env.num_agents, *atn_space.shape), dtype=atn_space.dtype) - else: - env.actions = np.zeros((env.num_agents, *atn_space.shape), dtype=np.int32) - else: - env.observations = buf.observations - env.rewards = buf.rewards - env.terminals = buf.terminals - env.truncations = buf.truncations - env.masks = buf.masks - env.actions = buf.actions - -class PufferEnv: - def __init__(self, buf=None): - if not hasattr(self, 'single_observation_space'): - raise APIUsageError(ERROR.format('single_observation_space')) - if not hasattr(self, 'single_action_space'): - raise APIUsageError(ERROR.format('single_action_space')) - if not hasattr(self, 'num_agents'): - raise APIUsageError(ERROR.format('num_agents')) - - if hasattr(self, 'observation_space'): - raise APIUsageError('PufferEnvs must define single_observation_space, not observation_space') - if hasattr(self, 'action_space'): - raise APIUsageError('PufferEnvs must define single_action_space, not action_space') - if not isinstance(self.single_observation_space, pufferlib.spaces.Box): - raise APIUsageError('Native observation_space must be a Box') - if (not isinstance(self.single_action_space, pufferlib.spaces.Discrete) - and not isinstance(self.single_action_space, pufferlib.spaces.MultiDiscrete) - and not isinstance(self.single_action_space, pufferlib.spaces.Box)): - raise APIUsageError('Native action_space must be a Discrete, MultiDiscrete, or Box') - - set_buffers(self, buf) - - self.action_space = pufferlib.spaces.joint_space(self.single_action_space, self.num_agents) - self.observation_space = pufferlib.spaces.joint_space(self.single_observation_space, self.num_agents) - self.agent_ids = np.arange(self.num_agents) - - @property - def emulated(self): - '''Native envs do not use emulation''' - return False - - @property - def done(self): - '''Native envs handle resets internally''' - return False - - 
@property - def driver_env(self): - '''For compatibility with Multiprocessing''' - return self - - def reset(self, seed=None): - raise NotImplementedError - - def step(self, actions): - raise NotImplementedError - - def close(self): - raise NotImplementedError - - def async_reset(self, seed=None): - _, self.infos = self.reset(seed) - assert isinstance(self.infos, list), 'PufferEnvs must return info as a list of dicts' - - def send(self, actions): - _, _, _, _, self.infos = self.step(actions) - assert isinstance(self.infos, list), 'PufferEnvs must return info as a list of dicts' - - def recv(self): - return (self.observations, self.rewards, self.terminals, - self.truncations, self.infos, self.agent_ids, self.masks) diff --git a/pufferlib/environments/metta/environment.py b/pufferlib/environments/metta/environment.py index af57437e72..c4ef967c26 100644 --- a/pufferlib/environments/metta/environment.py +++ b/pufferlib/environments/metta/environment.py @@ -15,7 +15,11 @@ class MettaPuff(pufferlib.PufferEnv): def __init__(self, config, render_mode='human', buf=None, seed=0): self.render_mode = render_mode import mettagrid.mettagrid_env - self.env = mettagrid.mettagrid_env.make_env_from_cfg(config, render_mode, buf=buf) + from omegaconf import OmegaConf + cfg = OmegaConf.load(config) + + from mettagrid.mettagrid_env import MettaGridEnv + self.env = MettaGridEnv(cfg, render_mode=render_mode, buf=buf) if render_mode == 'human': from mettagrid.gym_wrapper import RaylibRendererWrapper @@ -26,12 +30,6 @@ def __init__(self, config, render_mode='human', buf=None, seed=0): self.num_agents = self.env.num_agents super().__init__(buf) - #cfg = self.env._env_cfg - #cfg.eval.env = config_from_path(cfg.eval.env, cfg.eval.env_overrides) - #from mettagrid.renderer.raylib.raylib_renderer import MettaGridRaylibRenderer - #self.env._renderer = MettaGridRaylibRenderer(self.env._c_env, self.env._env_cfg['game']) - - def step(self, actions): obs, rew, term, trunc, info = 
self.env.step(actions) diff --git a/pufferlib/environments/trade_sim/environment.py b/pufferlib/environments/trade_sim/environment.py index af47073e42..0c245c0b25 100644 --- a/pufferlib/environments/trade_sim/environment.py +++ b/pufferlib/environments/trade_sim/environment.py @@ -8,7 +8,7 @@ def env_creator(name='metta'): return functools.partial(make, name) -def make(name, config_path='../nof1-trading-sim/config/experiment_config_3.yaml', render_mode='human', buf=None, seed=1): +def make(name, config_path='../nof1-trading-sim/config/experiment_cv.yaml', render_mode='human', buf=None, seed=1): '''Crafter creation function''' from nof1.utils.config_manager import ConfigManager from nof1.data_ingestion.historical_data_reader import HistoricalDataReader diff --git a/pufferlib/exceptions.py b/pufferlib/exceptions.py deleted file mode 100644 index ec8d2cc844..0000000000 --- a/pufferlib/exceptions.py +++ /dev/null @@ -1,20 +0,0 @@ -class EnvironmentSetupError(RuntimeError): - def __init__(self, e, package): - super().__init__(self.message) - -class APIUsageError(RuntimeError): - """Exception raised when the API is used incorrectly.""" - - def __init__(self, message="API usage error."): - self.message = message - super().__init__(self.message) - -class InvalidAgentError(ValueError): - """Exception raised when an invalid agent key is used.""" - - def __init__(self, agent_id, agents): - message = ( - f'Invalid agent/team ({agent_id}) specified. ' - f'Valid values:\n{agents}' - ) - super().__init__(message) diff --git a/pufferlib/models.py b/pufferlib/models.py index 067c37c6e5..0b76fe310e 100644 --- a/pufferlib/models.py +++ b/pufferlib/models.py @@ -21,7 +21,7 @@ class Default(nn.Module): the recurrent cell into encode_observations and put everything after into decode_actions. 
''' - def __init__(self, env, hidden_size=128, use_p3o=False, p3o_horizon=32, use_diayn=False, diayn_skills=128): + def __init__(self, env, hidden_size=128): super().__init__() self.hidden_size = hidden_size self.is_multidiscrete = isinstance(env.single_action_space, @@ -38,7 +38,6 @@ def __init__(self, env, hidden_size=128, use_p3o=False, p3o_horizon=32, use_diay input_size = int(sum(np.prod(v.shape) for v in env.env.observation_space.values())) self.encoder = nn.Linear(input_size, self.hidden_size) else: - #self.encoder = nn.Linear(np.prod(env.single_observation_space.shape), hidden_size) self.encoder = torch.nn.Sequential( nn.Linear(np.prod(env.single_observation_space.shape), hidden_size), nn.GELU(), @@ -58,32 +57,11 @@ def __init__(self, env, hidden_size=128, use_p3o=False, p3o_horizon=32, use_diay self.decoder_logstd = nn.Parameter(torch.zeros( 1, env.single_action_space.shape[0])) - if use_diayn: - self.diayn_discriminator = nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - nn.ReLU(), - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, diayn_skills)), - ) - - self.use_p3o = use_p3o - self.p3o_horizon = p3o_horizon - if use_p3o: - self.value_mean = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, p3o_horizon), std=1) - self.value_logstd = nn.Parameter(torch.zeros(1, p3o_horizon)) - - #param = np.log10(np.arange(1, N+1)) - #param = 1 - np.exp(-np.sqrt(np.arange(N))) - #self.value_logstd = nn.Parameter(torch.tensor(param).view(1, N)) - #self.value_logstd = pufferlib.pytorch.layer_init( - # nn.Linear(hidden_size, 32), std=0.01) - else: - self.value = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, 1), std=1) + self.value = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, 1), std=1) def forward(self, observations, state=None): hidden = self.encode_observations(observations, state=state) - state.hidden = hidden logits, values = self.decode_actions(hidden) return logits, values @@ -114,24 +92,16 @@ def 
decode_actions(self, hidden): else: logits = self.decoder(hidden) - if self.use_p3o: - mean=self.value_mean(hidden) - values = pufferlib.namespace( - mean=mean, - std=torch.exp(torch.clamp(self.value_logstd, -10, 10)).expand_as(mean), - ) - else: - values = self.value(hidden) - + values = self.value(hidden) return logits, values -class LSTMWrapper(nn.LSTM): +class LSTMWrapper(nn.Module): def __init__(self, env, policy, input_size=128, hidden_size=128): '''Wraps your policy with an LSTM without letting you shoot yourself in the foot with bad transpose and shape operations. This saves much pain. Requires that your policy define encode_observations and decode_actions. See the Default policy for an example.''' - super().__init__(input_size, hidden_size) + super().__init__() self.obs_shape = env.single_observation_space.shape self.policy = policy @@ -147,11 +117,13 @@ def __init__(self, env, policy, input_size=128, hidden_size=128): elif "weight" in name: nn.init.orthogonal_(param, 1.0) + self.lstm = nn.LSTM(input_size, hidden_size) + self.cell = torch.nn.LSTMCell(input_size, hidden_size) - self.cell.weight_ih = self.weight_ih_l0 - self.cell.weight_hh = self.weight_hh_l0 - self.cell.bias_ih = self.bias_ih_l0 - self.cell.bias_hh = self.bias_hh_l0 + self.cell.weight_ih = self.lstm.weight_ih_l0 + self.cell.weight_hh = self.lstm.weight_hh_l0 + self.cell.bias_ih = self.lstm.bias_ih_l0 + self.cell.bias_hh = self.lstm.bias_hh_l0 #self.pre_layernorm = nn.LayerNorm(hidden_size) #self.post_layernorm = nn.LayerNorm(hidden_size) @@ -159,8 +131,8 @@ def __init__(self, env, policy, input_size=128, hidden_size=128): def forward(self, observations, state): '''Forward function for inference. 
3x faster than using LSTM directly''' hidden = self.policy.encode_observations(observations, state=state) - h = state.lstm_h - c = state.lstm_c + h = state['lstm_h'] + c = state['lstm_c'] # TODO: Don't break compile if h is not None: @@ -172,17 +144,17 @@ def forward(self, observations, state): #hidden = self.pre_layernorm(hidden) hidden, c = self.cell(hidden, lstm_state) #hidden = self.post_layernorm(hidden) - state.hidden = hidden - state.lstm_h = hidden - state.lstm_c = c + state['hidden'] = hidden + state['lstm_h'] = hidden + state['lstm_c'] = c logits, values = self.policy.decode_actions(hidden) return logits, values def forward_train(self, observations, state): '''Forward function for training. Uses LSTM for fast time-batching''' x = observations - lstm_h = state.lstm_h - lstm_c = state.lstm_c + lstm_h = state['lstm_h'] + lstm_c = state['lstm_c'] x_shape, space_shape = x.shape, self.obs_shape x_n, space_n = len(x_shape), len(space_shape) @@ -210,7 +182,7 @@ def forward_train(self, observations, state): hidden = hidden.transpose(0, 1) #hidden = self.pre_layernorm(hidden) - hidden, (lstm_h, lstm_c) = super().forward(hidden, lstm_state) + hidden, (lstm_h, lstm_c) = self.lstm.forward(hidden, lstm_state) #hidden = self.post_layernorm(hidden) hidden = hidden.transpose(0, 1) @@ -218,9 +190,9 @@ def forward_train(self, observations, state): logits, values = self.policy.decode_actions(flat_hidden) values = values.reshape(B, TT) #state.batch_logits = logits.reshape(B, TT, -1) - state.hidden = hidden - state.lstm_h = lstm_h.detach() - state.lstm_c = lstm_c.detach() + state['hidden'] = hidden + state['lstm_h'] = lstm_h.detach() + state['lstm_c'] = lstm_c.detach() return logits, values class Convolutional(nn.Module): diff --git a/pufferlib/namespace.py b/pufferlib/namespace.py deleted file mode 100644 index a6ecfe3529..0000000000 --- a/pufferlib/namespace.py +++ /dev/null @@ -1,60 +0,0 @@ -from pdb import set_trace as T -from types import SimpleNamespace -from 
collections.abc import Mapping - -def __getitem__(self, key): - return self.__dict__[key] - -def __setitem__(self, key, value): - self.__dict__[key] = value - -def keys(self): - return self.__dict__.keys() - -def values(self): - return self.__dict__.values() - -def items(self): - return self.__dict__.items() - -def __iter__(self): - return iter(self.__dict__) - -def __len__(self): - return len(self.__dict__) - -class Namespace(SimpleNamespace, Mapping): - __getitem__ = __getitem__ - __setitem__ = __setitem__ - __iter__ = __iter__ - __len__ = __len__ - keys = keys - values = values - items = items - -def dataclass(cls): - # Safely get annotations - annotations = getattr(cls, '__annotations__', {}) - - # Combine both annotated and non-annotated fields - all_fields = {**{k: None for k in annotations.keys()}, **cls.__dict__} - all_fields = {k: v for k, v in all_fields.items() if not callable(v) and not k.startswith('__')} - - def __init__(self, **kwargs): - for field, default_value in all_fields.items(): - setattr(self, field, kwargs.get(field, default_value)) - - cls.__init__ = __init__ - setattr(cls, "__getitem__", __getitem__) - setattr(cls, "__setitem__", __setitem__) - setattr(cls, "__iter__", __iter__) - setattr(cls, "__len__", __len__) - setattr(cls, "keys", keys) - setattr(cls, "values", values) - setattr(cls, "items", items) - return cls - -def namespace(self=None, **kwargs): - if self is None: - return Namespace(**kwargs) - self.__dict__.update(kwargs) diff --git a/pufferlib/ocean/breakout/breakout.c b/pufferlib/ocean/breakout/breakout.c index 1785ebd215..a7c79ada53 100644 --- a/pufferlib/ocean/breakout/breakout.c +++ b/pufferlib/ocean/breakout/breakout.c @@ -2,7 +2,7 @@ #include "breakout.h" #include "puffernet.h" -int main() { +void demo() { Weights* weights = load_weights("resources/breakout_weights.bin", 147972); LinearLSTM* net = make_linearlstm(weights, 1, 119, 3); @@ -51,3 +51,38 @@ int main() { free_allocated(&env); close_client(env.client); } + +void 
test_performance(int timeout) { + Breakout env = { + .width = 512, + .height = 512, + .paddle_width = 20, + .paddle_height = 70, + .ball_width = 10, + .ball_height = 15, + .brick_width = 10, + .brick_height = 10, + .brick_rows = 5, + .brick_cols = 10, + .continuous = 0, + }; + allocate(&env); + c_reset(&env); + + int start = time(NULL); + int num_steps = 0; + while (time(NULL) - start < timeout) { + env.actions[0] = rand() % 3; + c_step(&env); + num_steps++; + } + + int end = time(NULL); + float sps = num_steps / (end - start); + printf("Test Environment SPS: %f\n", sps); + free_allocated(&env); +} + +int main() { + test_performance(10); +} diff --git a/pufferlib/ocean/breakout/breakout.h b/pufferlib/ocean/breakout/breakout.h index 6b89a1ea6e..c85e5564a0 100644 --- a/pufferlib/ocean/breakout/breakout.h +++ b/pufferlib/ocean/breakout/breakout.h @@ -19,18 +19,25 @@ #define BRICK_INDEX_BACKWALL_COLLISION -2 #define BRICK_INDEX_PADDLE_COLLISION -1 -typedef struct Log Log; -struct Log { +typedef struct Log { float perf; float score; float episode_return; float episode_length; float n; -}; +} Log; + +typedef struct Client { + float width; + float height; + float paddle_width; + float paddle_height; + float ball_width; + float ball_height; + Texture2D ball; +} Client; -typedef struct Client Client; -typedef struct Breakout Breakout; -struct Breakout { +typedef struct Breakout { Client* client; Log log; float* observations; @@ -68,7 +75,7 @@ struct Breakout { int frameskip; unsigned char hit_brick; int continuous; -}; +} Breakout; typedef struct CollisionInfo CollisionInfo; struct CollisionInfo { @@ -470,17 +477,6 @@ void c_step(Breakout* env) { Color BRICK_COLORS[6] = {RED, ORANGE, YELLOW, GREEN, SKYBLUE, BLUE}; -typedef struct Client Client; -struct Client { - float width; - float height; - float paddle_width; - float paddle_height; - float ball_width; - float ball_height; - Texture2D ball; -}; - static inline bool file_exists(const char* path) { return access(path, 
F_OK) != -1; } diff --git a/pufferlib/ocean/breakout/breakout.py b/pufferlib/ocean/breakout/breakout.py index 59300888c9..f482701a8c 100644 --- a/pufferlib/ocean/breakout/breakout.py +++ b/pufferlib/ocean/breakout/breakout.py @@ -46,7 +46,7 @@ def __init__(self, num_envs=1, render_mode=None, brick_cols=brick_cols, continuous=continuous ) - def reset(self, seed=None): + def reset(self, seed=0): binding.vec_reset(self.c_envs, seed) self.tick = 0 return self.observations, [] diff --git a/pufferlib/ocean/cartpole/cartpole.py b/pufferlib/ocean/cartpole/cartpole.py index 9b3eecca3c..62e3ba6d3d 100644 --- a/pufferlib/ocean/cartpole/cartpole.py +++ b/pufferlib/ocean/cartpole/cartpole.py @@ -4,7 +4,7 @@ from pufferlib.ocean.cartpole import binding class Cartpole(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode='human', report_interval=1, continuous=False, buf=None, seed=0): + def __init__(self, num_envs=1, render_mode='human', report_interval=1, continuous=True, buf=None, seed=0): self.render_mode = render_mode self.num_agents = num_envs self.report_interval = report_interval @@ -18,17 +18,14 @@ def __init__(self, num_envs=1, render_mode='human', report_interval=1, continuou ) if self.continuous: self.single_action_space = gymnasium.spaces.Box( - low=-1.0, high=1.0, shape=(1,), dtype=np.float32 + low=-1.0, high=1.0, shape=(1,) ) else: self.single_action_space = gymnasium.spaces.Discrete(2) super().__init__(buf) - - self.actions = np.zeros(self.num_agents, dtype=np.float32) - self.terminals = np.zeros(self.num_agents, dtype=np.uint8) - self.truncations = np.zeros(self.num_agents, dtype=np.uint8) + self.actions = np.zeros(num_envs, dtype=np.float32) self.c_envs = binding.vec_init( self.observations, @@ -37,7 +34,8 @@ def __init__(self, num_envs=1, render_mode='human', report_interval=1, continuou self.terminals, self.truncations, num_envs, - int(self.continuous), + seed, + continuous=int(self.continuous), ) def reset(self, seed=None): @@ -98,4 +96,4 @@ def 
test_performance(timeout=10, atn_cache=8192, continuous=True): if __name__ == '__main__': test_performance() - \ No newline at end of file + diff --git a/pufferlib/ocean/env_binding.h b/pufferlib/ocean/env_binding.h index 2230c53692..12f54d4990 100644 --- a/pufferlib/ocean/env_binding.h +++ b/pufferlib/ocean/env_binding.h @@ -12,6 +12,13 @@ static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) { } #endif +static PyObject* my_get(PyObject* dict, Env* env); +#ifndef MY_GET +static PyObject* my_get(PyObject* dict, Env* env) { + return NULL; +} +#endif + static Env* unpack_env(PyObject* args) { PyObject* handle_obj = PyTuple_GetItem(args, 0); if (!PyObject_TypeCheck(handle_obj, &PyLong_Type)) { @@ -64,6 +71,10 @@ static PyObject* env_init(PyObject* self, PyObject* args, PyObject* kwargs) { return NULL; } env->actions = PyArray_DATA(actions); + if (PyArray_STRIDE(actions, 0) == sizeof(double)) { + PyErr_SetString(PyExc_ValueError, "Action tensor passed as float64 (pass np.float32 buffer)"); + return NULL; + } PyObject* rew = PyTuple_GetItem(args, 2); if (!PyObject_TypeCheck(rew, &PyArray_Type)) { @@ -142,18 +153,22 @@ static PyObject* env_init(PyObject* self, PyObject* args, PyObject* kwargs) { Py_DECREF(py_seed); PyObject* empty_args = PyTuple_New(0); - if (my_init(env, empty_args, kwargs)) { - //PyErr_SetString(PyExc_TypeError, "env_init failed"); - Py_DECREF(kwargs); + my_init(env, empty_args, kwargs); + Py_DECREF(kwargs); + if (PyErr_Occurred()) { return NULL; } - Py_DECREF(kwargs); return PyLong_FromVoidPtr(env); } // Python function to reset the environment static PyObject* env_reset(PyObject* self, PyObject* args) { + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "env_reset requires 2 arguments"); + return NULL; + } + Env* env = unpack_env(args); if (!env){ return NULL; @@ -162,9 +177,14 @@ static PyObject* env_reset(PyObject* self, PyObject* args) { Py_RETURN_NONE; } - // Python function to step the environment static 
PyObject* env_step(PyObject* self, PyObject* args) { + int num_args = PyTuple_Size(args); + if (num_args != 1) { + PyErr_SetString(PyExc_TypeError, "vec_render requires 1 argument"); + return NULL; + } + Env* env = unpack_env(args); if (!env){ return NULL; @@ -194,6 +214,19 @@ static PyObject* env_close(PyObject* self, PyObject* args) { Py_RETURN_NONE; } +static PyObject* env_get(PyObject* self, PyObject* args) { + Env* env = unpack_env(args); + if (!env){ + return NULL; + } + PyObject* dict = PyDict_New(); + my_get(dict, env); + if (PyErr_Occurred()) { + return NULL; + } + return dict; +} + typedef struct { Env** envs; int num_envs; @@ -208,7 +241,12 @@ static VecEnv* unpack_vecenv(PyObject* args) { VecEnv* vec = (VecEnv*)PyLong_AsVoidPtr(handle_obj); if (!vec) { - PyErr_SetString(PyExc_ValueError, "Invalid vec env handle"); + PyErr_SetString(PyExc_ValueError, "Missing or invalid vec env handle"); + return NULL; + } + + if (vec->num_envs <= 0) { + PyErr_SetString(PyExc_ValueError, "Missing or invalid vec env handle"); return NULL; } @@ -275,6 +313,10 @@ static PyObject* vec_init(PyObject* self, PyObject* args, PyObject* kwargs) { PyErr_SetString(PyExc_ValueError, "Actions must be contiguous"); return NULL; } + if (PyArray_STRIDE(actions, 0) == sizeof(double)) { + PyErr_SetString(PyExc_ValueError, "Action tensor passed as float64 (pass np.float32 buffer)"); + return NULL; + } PyObject* rew = PyTuple_GetItem(args, 2); if (!PyObject_TypeCheck(rew, &PyArray_Type)) { @@ -361,9 +403,9 @@ static PyObject* vec_init(PyObject* self, PyObject* args, PyObject* kwargs) { Py_DECREF(py_seed); PyObject* empty_args = PyTuple_New(0); - if (my_init(env, empty_args, kwargs)) { - PyErr_SetString(PyExc_TypeError, "env_init failed"); - Py_DECREF(kwargs); + my_init(env, empty_args, kwargs); + Py_DECREF(kwargs); + if (PyErr_Occurred()) { return NULL; } } @@ -407,6 +449,11 @@ static PyObject* vectorize(PyObject* self, PyObject* args) { } static PyObject* vec_reset(PyObject* self, PyObject* 
args) { + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "vec_reset requires 2 arguments"); + return NULL; + } + VecEnv* vec = unpack_vecenv(args); if (!vec) { return NULL; @@ -428,6 +475,12 @@ static PyObject* vec_reset(PyObject* self, PyObject* args) { } static PyObject* vec_step(PyObject* self, PyObject* arg) { + int num_args = PyTuple_Size(arg); + if (num_args != 1) { + PyErr_SetString(PyExc_TypeError, "vec_step requires 1 argument"); + return NULL; + } + VecEnv* vec = unpack_vecenv(arg); if (!vec) { return NULL; @@ -530,9 +583,10 @@ static PyObject* vec_close(PyObject* self, PyObject* args) { static double unpack(PyObject* kwargs, char* key) { PyObject* val = PyDict_GetItemString(kwargs, key); if (val == NULL) { - // If the key doesn't exist, don't set an error - this allows optional parameters - // Just return a default value that the caller can check for - return 0.0; + char error_msg[100]; + snprintf(error_msg, sizeof(error_msg), "Missing required keyword argument '%s'", key); + PyErr_SetString(PyExc_TypeError, error_msg); + return 1; } if (PyLong_Check(val)) { long out = PyLong_AsLong(val); @@ -561,6 +615,7 @@ static PyMethodDef methods[] = { {"env_step", env_step, METH_VARARGS, "Step the environment"}, {"env_render", env_render, METH_VARARGS, "Render the environment"}, {"env_close", env_close, METH_VARARGS, "Close the environment"}, + {"env_get", env_get, METH_VARARGS, "Get the environment state"}, {"vectorize", vectorize, METH_VARARGS, "Make a vector of environment handles"}, {"vec_init", (PyCFunction)vec_init, METH_VARARGS | METH_KEYWORDS, "Initialize a vector of environments"}, {"vec_reset", (PyCFunction)vec_reset, METH_VARARGS, "Reset the vector of environments"}, diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 34728c3087..3e2404aafe 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -1,5 +1,5 @@ +import importlib import pufferlib.emulation -import 
pufferlib.postprocess def lazy_import(module_path, attr): """ @@ -57,110 +57,101 @@ def make_continuous(discretize=False, buf=None, **kwargs): from . import sanity env = sanity.Continuous(discretize=discretize) if not discretize: - env = pufferlib.postprocess.ClipAction(env) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.ClipAction(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) def make_squared(distance_to_target=3, num_targets=1, buf=None, **kwargs): from . import sanity env = sanity.Squared(distance_to_target=distance_to_target, num_targets=num_targets, **kwargs) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) def make_bandit(num_actions=10, reward_scale=1, reward_noise=1, buf=None): from . import sanity env = sanity.Bandit(num_actions=num_actions, reward_scale=reward_scale, reward_noise=reward_noise) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) def make_memory(mem_length=2, mem_delay=2, buf=None, **kwargs): from . import sanity env = sanity.Memory(mem_length=mem_length, mem_delay=mem_delay) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) def make_password(password_length=5, buf=None, **kwargs): from . import sanity env = sanity.Password(password_length=password_length) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) def make_performance(delay_mean=0, delay_std=0, bandwidth=1, buf=None, **kwargs): from . 
import sanity env = sanity.Performance(delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) def make_performance_empiric(count_n=0, count_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity env = sanity.PerformanceEmpiric(count_n=count_n, count_std=count_std, bandwidth=bandwidth) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) def make_stochastic(p=0.7, horizon=100, buf=None, **kwargs): from . import sanity env = sanity.Stochastic(p=p, horizon=100) - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) def make_spaces(buf=None, **kwargs): from . import sanity env = sanity.Spaces() - env = pufferlib.postprocess.EpisodeStats(env) + env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) def make_multiagent(buf=None, **kwargs): from . 
import sanity env = sanity.Multiagent() - env = pufferlib.postprocess.MultiagentEpisodeStats(env) + env = pufferlib.MultiagentEpisodeStats(env) return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) -MAKE_FNS = { - 'breakout': lambda: lazy_import('pufferlib.ocean.breakout.breakout', 'Breakout'), - 'blastar': lambda: lazy_import('pufferlib.ocean.blastar.blastar', 'Blastar'), - 'pong': lambda: lazy_import('pufferlib.ocean.pong.pong', 'Pong'), - 'enduro': lambda: lazy_import('pufferlib.ocean.enduro.enduro', 'Enduro'), - 'cartpole': lambda: lazy_import('pufferlib.ocean.cartpole.cartpole', 'Cartpole'), - 'moba': lambda: lazy_import('pufferlib.ocean.moba.moba', 'Moba'), - 'nmmo3': lambda: lazy_import('pufferlib.ocean.nmmo3.nmmo3', 'NMMO3'), - 'snake': lambda: lazy_import('pufferlib.ocean.snake.snake', 'Snake'), - 'squared': lambda: lazy_import('pufferlib.ocean.squared.squared', 'Squared'), - 'pysquared': lambda: lazy_import('pufferlib.ocean.squared.pysquared', 'PySquared'), - 'connect4': lambda: lazy_import('pufferlib.ocean.connect4.connect4', 'Connect4'), - 'tripletriad': lambda: lazy_import('pufferlib.ocean.tripletriad.tripletriad', 'TripleTriad'), - 'tactical': lambda: lazy_import('pufferlib.ocean.tactical.tactical', 'Tactical'), - 'go': lambda: lazy_import('pufferlib.ocean.go.go', 'Go'), - 'rware': lambda: lazy_import('pufferlib.ocean.rware.rware', 'Rware'), - 'trash_pickup': lambda: lazy_import('pufferlib.ocean.trash_pickup.trash_pickup', 'TrashPickupEnv'), - 'tower_climb': lambda: lazy_import('pufferlib.ocean.tower_climb.tower_climb', 'TowerClimb'), - 'grid': lambda: lazy_import('pufferlib.ocean.grid.grid', 'Grid'), - 'cpr': lambda: lazy_import('pufferlib.ocean.cpr.cpr', 'PyCPR'), - 'impulse_wars': lambda: lazy_import('pufferlib.ocean.impulse_wars.impulse_wars', 'ImpulseWars'), - 'gpudrive': lambda: lazy_import('pufferlib.ocean.gpudrive.gpudrive', 'GPUDrive'), - #'rocket_lander': rocket_lander.RocketLander, - 'foraging': make_foraging, - 'predator_prey': 
make_predator_prey, - 'group': make_group, - 'puffer': make_puffer, - 'continuous': make_continuous, - 'bandit': make_bandit, - 'memory': make_memory, - 'password': make_password, - 'stochastic': make_stochastic, - 'multiagent': make_multiagent, +MAKE_FUNCTIONS = { + 'breakout': 'Breakout', + 'blastar': 'Blastar', + 'pong': 'Pong', + 'enduro': 'Enduro', + 'cartpole': 'Cartpole', + 'moba': 'Moba', + 'nmmo3': 'NMMO3', + 'snake': 'Snake', + 'squared': 'Squared', + 'pysquared': 'PySquared', + 'connect4': 'Connect4', + 'tripletriad': 'TripleTriad', + 'tactical': 'Tactical', + 'go': 'Go', + 'rware': 'Rware', + 'trash_pickup': 'TrashPickupEnv', + 'tower_climb': 'TowerClimb', + 'grid': 'Grid', + 'cpr': 'PyCPR', + 'impulse_wars': 'ImpulseWars', + 'gpudrive': 'GPUDrive', 'spaces': make_spaces, - 'performance': make_performance, - 'performance_empiric': make_performance_empiric, + 'multiagent': make_multiagent, } -# Alias puffer_ to all names -MAKE_FNS = {**MAKE_FNS, **{'puffer_' + k: v for k, v in MAKE_FNS.items()}} - def env_creator(name='squared', *args, **kwargs): - if name in MAKE_FNS: - return MAKE_FNS[name](*args, **kwargs) - else: - raise ValueError(f'Invalid environment name: {name}') + if 'puffer_' not in name: + raise pufferlib.exceptions.APIUsageError(f'Invalid environment name: {name}') + + # TODO: Robust sanity / ocean imports + name = name.replace('puffer_', '') + try: + module = importlib.import_module(f'pufferlib.ocean.{name}.{name}') + return getattr(module, MAKE_FUNCTIONS[name]) + except ModuleNotFoundError: + return MAKE_FUNCTIONS[name] diff --git a/pufferlib/ocean/gpudrive/binding.c b/pufferlib/ocean/gpudrive/binding.c new file mode 100644 index 0000000000..a8999d90f3 --- /dev/null +++ b/pufferlib/ocean/gpudrive/binding.c @@ -0,0 +1,62 @@ +#include "gpudrive.h" +#define Env GPUDrive +#define MY_SHARED +#include "../env_binding.h" + +static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) { + int num_envs = unpack(kwargs, "num_envs"); 
+ GPUDrive* temp_envs = calloc(num_envs, sizeof(GPUDrive)); + PyObject* agent_offsets = PyList_New(num_envs+1); + int total_count = 0; + // getting agent counts and offsets + for(int i = 0;i< num_envs;i++) { + char map_file[100]; + sprintf(map_file, "resources/gpudrive/binaries/map_%03d.bin", i); + temp_envs[i].entities = load_map_binary(map_file, &temp_envs[i]); + set_active_agents(&temp_envs[i]); + PyObject* num = PyLong_FromLong(total_count); + PyList_SetItem(agent_offsets, i, num); + //Py_DECREF(num); + total_count += temp_envs[i].active_agent_count; + } + PyObject* num = PyLong_FromLong(total_count); + PyList_SetItem(agent_offsets, num_envs, num); + //Py_DECREF(num); + /* + for(int i = 0;ihuman_agent_idx = unpack(kwargs, "human_agent_idx"); + env->reward_vehicle_collision = unpack(kwargs, "reward_vehicle_collision"); + env->reward_offroad_collision = unpack(kwargs, "reward_offroad_collision"); + int env_id = unpack(kwargs, "env_id"); + + char map_file[100]; + sprintf(map_file, "resources/gpudrive/binaries/map_%03d.bin", env_id); + env->map_name = map_file; + init(env); + return 0; +} + +static int my_log(PyObject* dict, Log* log) { + assign_to_dict(dict, "perf", log->perf); + assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "offroad_rate", log->offroad_rate); + assign_to_dict(dict, "collision_rate", log->collision_rate); + assign_to_dict(dict, "dnf_rate", log->dnf_rate); + assign_to_dict(dict, "n", log->n); + return 0; +} diff --git a/pufferlib/ocean/gpudrive/gpudrive.c b/pufferlib/ocean/gpudrive/gpudrive.c index 78c5731bf7..123074e677 100644 --- a/pufferlib/ocean/gpudrive/gpudrive.c +++ b/pufferlib/ocean/gpudrive/gpudrive.c @@ -102,17 +102,18 @@ void demo() { .human_agent_idx = 0, .reward_vehicle_collision = -0.1f, .reward_offroad_collision = -0.1f, - .map_name = "resources/gpudrive/binaries/map_063.bin" + .map_name = 
"resources/gpudrive/binaries/map_000.bin" }; allocate(&env); c_reset(&env); - Client* client = make_client(&env); + c_render(&env); + //Client* client = make_client(&env); printf("Human controlling agent index: %d\n", env.active_agent_indices[env.human_agent_idx]); int accel_delta = 1; int steer_delta = 1; while (!WindowShouldClose()) { // Handle camera controls - handle_camera_controls(client); + handle_camera_controls(env.client); int (*actions)[2] = (int(*)[2])env.actions; // // Reset all agent actions at the beginning of each frame // for(int i = 0; i < env.active_agent_count; i++) { @@ -160,10 +161,10 @@ void demo() { // Handle human input for the controlled agent // handle_human_input(&env); c_step(&env); - c_render(client, &env); + c_render(&env); } - close_client(client); + close_client(env.client); free_allocated(&env); } diff --git a/pufferlib/ocean/gpudrive/gpudrive.h b/pufferlib/ocean/gpudrive/gpudrive.h index e01445a15b..bb0153a1bd 100644 --- a/pufferlib/ocean/gpudrive/gpudrive.h +++ b/pufferlib/ocean/gpudrive/gpudrive.h @@ -81,65 +81,22 @@ static const int collision_offsets[25][2] = { {-2, 1}, {-1, 1}, {0, 1}, {1, 1}, {2, 1}, // Fourth row {-2, 2}, {-1, 2}, {0, 2}, {1, 2}, {2, 2} // Bottom row }; -#define LOG_BUFFER_SIZE 1024 +typedef struct GPUDrive GPUDrive; +typedef struct Client Client; typedef struct Log Log; + struct Log { float episode_return; float episode_length; + float perf; float score; float offroad_rate; float collision_rate; float dnf_rate; + float n; }; - -typedef struct LogBuffer LogBuffer; -struct LogBuffer { - Log* logs; - int length; - int idx; -}; - -LogBuffer* allocate_logbuffer(int size) { - LogBuffer* logs = (LogBuffer*)calloc(1, sizeof(LogBuffer)); - logs->logs = (Log*)calloc(size, sizeof(Log)); - logs->length = size; - logs->idx = 0; - return logs; -} - -void free_logbuffer(LogBuffer* buffer) { - free(buffer->logs); - free(buffer); -} - -void add_log(LogBuffer* logs, Log* log) { - if (logs->idx == logs->length) { - return; - 
} - logs->logs[logs->idx] = *log; - logs->idx += 1; - //printf("Log: %f, %f,\n", log->episode_return, log->episode_length); -} - -Log aggregate_and_clear(LogBuffer* logs) { - Log log = {0}; - if (logs->idx == 0) { - return log; - } - for (int i = 0; i < logs->idx; i++) { - log.episode_return += logs->logs[i].episode_return / logs->idx; - log.episode_length += logs->logs[i].episode_length / logs->idx; - log.score += logs->logs[i].score / logs->idx; - log.offroad_rate += logs->logs[i].offroad_rate / logs->idx; - log.collision_rate += logs->logs[i].collision_rate / logs->idx; - log.dnf_rate += logs->logs[i].dnf_rate / logs->idx; - } - logs->idx = 0; - return log; -} - typedef struct Entity Entity; struct Entity { int type; @@ -196,14 +153,13 @@ float relative_distance_2d(float x1, float y1, float x2, float y2){ return distance; } -typedef struct GPUDrive GPUDrive; struct GPUDrive { + Client* client; float* observations; int* actions; float* rewards; - unsigned char* masks; - unsigned char* dones; - LogBuffer* log_buffer; + unsigned char* terminals; + Log log; Log* logs; int num_agents; int active_agent_count; @@ -233,14 +189,37 @@ struct GPUDrive { float reward_vehicle_collision; float reward_offroad_collision; char* map_name; +<<<<<<< HEAD + char* reached_goal_this_episode; +======= char* reached_goal_this_turn; +>>>>>>> cf2b09c9d525bf784c8dd5c03818450cabeae914 float world_mean_x; float world_mean_y; }; +void add_log(GPUDrive* env) { + for(int i = 0; i < env->active_agent_count; i++){ + if(env->reached_goal_this_episode[i]) { + env->log.score += 1.0f; + env->log.perf += 1.0f; + } + int offroad = env->logs[i].offroad_rate; + env->log.offroad_rate += offroad; + int collided = env->logs[i].collision_rate; + env->log.collision_rate += collided; + if(!offroad && !collided && !env->reached_goal_this_episode[i]){ + env->log.dnf_rate += 1.0f; + } + env->log.episode_length += env->logs[i].episode_length; + env->log.episode_return += env->logs[i].episode_return; + env->log.n 
+= 1; + } +} + Entity* load_map_binary(const char* filename, GPUDrive* env) { FILE* file = fopen(filename, "rb"); - printf("fileanme: %s\n", filename); + //printf("fileanme: %s\n", filename); if (!file) return NULL; fread(&env->num_objects, sizeof(int), 1, file); fread(&env->num_roads, sizeof(int), 1, file); @@ -304,6 +283,8 @@ Entity* load_map_binary(const char* filename, GPUDrive* env) { } void set_start_position(GPUDrive* env){ + //InitWindow(800, 600, "GPU Drive"); + //BeginDrawing(); for(int i = 0; i < env->num_entities; i++){ int is_active = 0; for(int j = 0; j < env->active_agent_count; j++){ @@ -316,6 +297,10 @@ void set_start_position(GPUDrive* env){ e->x = e->traj_x[0]; e->y = e->traj_y[0]; e->z = e->traj_z[0]; + //printf("Entity %d is at (%f, %f, %f)\n", i, e->x, e->y, e->z); + //if (e->type < 4) { + // DrawRectangle(200+2*e->x, 200+2*e->y, 2.0, 2.0, RED); + //} if(e->type >3 || e->type == 0){ continue; } @@ -331,6 +316,10 @@ void set_start_position(GPUDrive* env){ e->heading = e->traj_heading[0]; e->valid = e->traj_valid[0]; } + //EndDrawing(); + int x = 0; + + } void set_active_agents(GPUDrive* env){ @@ -342,7 +331,11 @@ void set_active_agents(GPUDrive* env){ int expert_static_car_indices[MAX_CARS]; env->active_agent_count = 1; active_agent_indices[0] = env->num_objects-1; +<<<<<<< HEAD + for(int i = 0; i < env->num_objects-1 && env->num_cars < MAX_CARS; i++){ +======= for(int i = 0; i < env->num_objects && env->num_cars < MAX_CARS; i++){ +>>>>>>> cf2b09c9d525bf784c8dd5c03818450cabeae914 if(env->entities[i].type != 1) continue; if(env->entities[i].traj_valid[0] != 1) continue; env->num_cars++; @@ -444,8 +437,6 @@ void init_grid_map(GPUDrive* env){ } } } - printf("top left: %f, %f\n", top_left_x, top_left_y); - printf("bottom right: %f, %f\n", bottom_right_x, bottom_right_y); env->map_corners = (float*)calloc(4, sizeof(float)); env->map_corners[0] = top_left_x; @@ -623,13 +614,16 @@ void init(GPUDrive* env){ // printf("num entities: %d\n", 
env->num_entities); env->dynamics_model = CLASSIC; set_means(env); +<<<<<<< HEAD +======= printf("world mean: %f, %f\n", env->world_mean_x, env->world_mean_y); +>>>>>>> cf2b09c9d525bf784c8dd5c03818450cabeae914 set_active_agents(env); set_start_position(env); // printf("Active agents: %d\n", env->active_agent_count); env->logs = (Log*)calloc(env->active_agent_count, sizeof(Log)); env->goal_reached = (char*)calloc(env->active_agent_count, sizeof(char)); - env->reached_goal_this_turn = (char*)calloc(env->active_agent_count, sizeof(char)); + env->reached_goal_this_episode = (char*)calloc(env->active_agent_count, sizeof(char)); init_grid_map(env); env->vision_range = 21; init_neighbor_offsets(env); @@ -646,7 +640,7 @@ void free_initialized(GPUDrive* env){ free(env->logs); free(env->fake_data); free(env->goal_reached); - free(env->reached_goal_this_turn); + free(env->reached_goal_this_episode); free(env->map_corners); free(env->grid_cells); free(env->neighbor_offsets); @@ -667,9 +661,7 @@ void allocate(GPUDrive* env){ env->observations = (float*)calloc(env->active_agent_count*max_obs, sizeof(float)); env->actions = (int*)calloc(env->active_agent_count*2, sizeof(int)); env->rewards = (float*)calloc(env->active_agent_count, sizeof(float)); - env->masks = (unsigned char*)calloc(env->active_agent_count, sizeof(unsigned char)); - env->dones = (unsigned char*)calloc(env->active_agent_count, sizeof(unsigned char)); - env->log_buffer = allocate_logbuffer(LOG_BUFFER_SIZE); + env->terminals= (unsigned char*)calloc(env->active_agent_count, sizeof(unsigned char)); // printf("allocated\n"); } @@ -677,9 +669,7 @@ void free_allocated(GPUDrive* env){ free(env->observations); free(env->actions); free(env->rewards); - free(env->masks); - free(env->dones); - free_logbuffer(env->log_buffer); + free(env->terminals); free_initialized(env); } @@ -921,9 +911,6 @@ void compute_observations(GPUDrive* env) { memset(env->observations, 0, max_obs*env->active_agent_count*sizeof(float)); float 
(*observations)[max_obs] = (float(*)[max_obs])env->observations; for(int i = 0; i < env->active_agent_count; i++) { - if(env->goal_reached[i] && !env->reached_goal_this_turn[i]){ - continue; - } float* obs = &observations[i][0]; Entity* ego_entity = &env->entities[env->active_agent_indices[i]]; if(ego_entity->type > 3) break; @@ -937,9 +924,12 @@ void compute_observations(GPUDrive* env) { // Rotate to ego vehicle's frame float rel_goal_x = goal_x*cos_heading + goal_y*sin_heading; float rel_goal_y = -goal_x*sin_heading + goal_y*cos_heading; - obs[0] = normalize_value(rel_goal_x, MIN_REL_GOAL_COORD, MAX_REL_GOAL_COORD); - obs[1] = normalize_value(rel_goal_y, MIN_REL_GOAL_COORD, MAX_REL_GOAL_COORD); - obs[2] = ego_speed / MAX_SPEED; + //obs[0] = normalize_value(rel_goal_x, MIN_REL_GOAL_COORD, MAX_REL_GOAL_COORD); + //obs[1] = normalize_value(rel_goal_y, MIN_REL_GOAL_COORD, MAX_REL_GOAL_COORD); + obs[0] = rel_goal_x/20.0f; + obs[1] = rel_goal_y/20.0f; + //obs[2] = ego_speed / MAX_SPEED; + obs[2] = ego_speed / 5.0f; obs[3] = ego_entity->width / MAX_VEH_WIDTH; obs[4] = ego_entity->length / MAX_VEH_LEN; obs[5] = (ego_entity->collision_state > 0) ? 
1 : 0; @@ -967,8 +957,8 @@ void compute_observations(GPUDrive* env) { float rel_x = dx*cos_heading + dy*sin_heading; float rel_y = -dx*sin_heading + dy*cos_heading; // Store observations with correct indexing - obs[obs_idx] = normalize_value(rel_x, MIN_REL_AGENT_POS, MAX_REL_AGENT_POS); - obs[obs_idx + 1] = normalize_value(rel_y, MIN_REL_AGENT_POS, MAX_REL_AGENT_POS); + obs[obs_idx] = rel_x / 20.0f; + obs[obs_idx + 1] = rel_y / 20.0f; obs[obs_idx + 2] = other_entity->width / MAX_VEH_WIDTH; obs[obs_idx + 3] = other_entity->length / MAX_VEH_LEN; // relative heading @@ -1017,8 +1007,8 @@ void compute_observations(GPUDrive* env) { // Compute sin and cos of relative angle directly without atan2f float cos_angle = dx_norm*cos_heading + dy_norm*sin_heading; float sin_angle = -dx_norm*sin_heading + dy_norm*cos_heading; - obs[obs_idx] = normalize_value(x_obs, MIN_RG_COORD, MAX_RG_COORD); - obs[obs_idx + 1] = normalize_value(y_obs, MIN_RG_COORD, MAX_RG_COORD); + obs[obs_idx] = x_obs / 20.0f; + obs[obs_idx + 1] = y_obs / 20.0f; obs[obs_idx + 2] = length / MAX_ROAD_SEGMENT_LENGTH; obs[obs_idx + 3] = width / MAX_ROAD_SCALE; obs[obs_idx + 4] = cos_angle / MAX_ORIENTATION_RAD; @@ -1041,33 +1031,23 @@ void c_reset(GPUDrive* env){ collision_check(env, agent_idx); } memset(env->goal_reached, 0, env->active_agent_count*sizeof(char)); - memset(env->masks, 1, env->active_agent_count*sizeof(char)); - memset(env->dones, 0, env->active_agent_count*sizeof(char)); + memset(env->reached_goal_this_episode, 0, env->active_agent_count*sizeof(char)); compute_observations(env); } +void respawn_agent(GPUDrive* env, int agent_idx){ + env->entities[agent_idx].x = env->entities[agent_idx].traj_x[0]; + env->entities[agent_idx].y = env->entities[agent_idx].traj_y[0]; + env->entities[agent_idx].heading = env->entities[agent_idx].traj_heading[0]; + env->entities[agent_idx].vx = env->entities[agent_idx].traj_vx[0]; + env->entities[agent_idx].vy = env->entities[agent_idx].traj_vy[0]; +} + void 
c_step(GPUDrive* env){ memset(env->rewards, 0, env->active_agent_count * sizeof(float)); - memset(env->reached_goal_this_turn, 0, env->active_agent_count * sizeof(char)); env->timestep++; if(env->timestep == 91){ - for(int i = 0; i < env->active_agent_count; i++){ - if(env->goal_reached[i] == 0){ - env->logs[i].score = 0.0f; - } - else { - env->logs[i].score = 1.0f; - env->logs[i].dnf_rate = 0.0f; - } - int offroad = env->logs[i].offroad_rate; - int collided = env->logs[i].collision_rate; - int goal_reached = env->goal_reached[i]; - if(!offroad && !collided && !goal_reached){ - env->logs[i].dnf_rate = 1.0f; - } - - add_log(env->log_buffer, &env->logs[i]); - } + add_log(env); c_reset(env); } // Move statix experts @@ -1081,13 +1061,20 @@ void c_step(GPUDrive* env){ env->logs[i].score = 0.0f; env->logs[i].episode_length += 1; int agent_idx = env->active_agent_indices[i]; + if(env->goal_reached[i] || env->entities[agent_idx].collision_state > 0){ + respawn_agent(env, agent_idx); + env->goal_reached[i] = 0; + } env->entities[agent_idx].collision_state = 0; +<<<<<<< HEAD +======= if(env->goal_reached[i]){ env->masks[i] = 0; env->entities[agent_idx].x = -10000; env->entities[agent_idx].y = -10000; continue; } +>>>>>>> cf2b09c9d525bf784c8dd5c03818450cabeae914 move_dynamics(env, i, agent_idx); // move_expert(env, env->actions, agent_idx); collision_check(env, agent_idx); @@ -1113,10 +1100,13 @@ void c_step(GPUDrive* env){ if(reached_goal && env->goal_reached[i] == 0){ env->rewards[i] += 1.0f; env->goal_reached[i] = 1; - env->reached_goal_this_turn[i] = 1; env->logs[i].episode_return += 1.0f; +<<<<<<< HEAD + env->reached_goal_this_episode[i] = 1; +======= env->dones[i] = 1; continue; +>>>>>>> cf2b09c9d525bf784c8dd5c03818450cabeae914 } } compute_observations(env); @@ -1382,8 +1372,16 @@ void draw_road_edge(GPUDrive* env, float start_x, float start_y, float end_x, fl DrawTriangle3D(b4, t4, b1, CURB_SIDE); DrawTriangle3D(t4, t1, b1, CURB_SIDE); } +<<<<<<< HEAD + +void 
c_render(GPUDrive* env) { + if (env->client == NULL) { + env->client = make_client(env); + } + Client* client = env->client; +======= +>>>>>>> cf2b09c9d525bf784c8dd5c03818450cabeae914 -void c_render(Client* client, GPUDrive* env) { BeginDrawing(); Color road = (Color){35, 35, 37, 255}; ClearBackground(road); diff --git a/pufferlib/ocean/gpudrive/gpudrive.py b/pufferlib/ocean/gpudrive/gpudrive.py index 36f87e716b..528496663e 100644 --- a/pufferlib/ocean/gpudrive/gpudrive.py +++ b/pufferlib/ocean/gpudrive/gpudrive.py @@ -4,7 +4,7 @@ import struct import pufferlib -from pufferlib.ocean.gpudrive.cy_gpudrive import CyGPUDrive, entity_dtype +from pufferlib.ocean.gpudrive import binding class GPUDrive(pufferlib.PufferEnv): def __init__(self, num_envs=1, render_mode=None, report_interval=1, @@ -19,46 +19,59 @@ def __init__(self, num_envs=1, render_mode=None, report_interval=1, self.num_agents = num_envs self.render_mode = render_mode self.report_interval = report_interval - print("Num envs: ", num_envs) self.num_obs = 6 + 63*7 + 200*7 self.single_observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(self.num_obs,), dtype=np.float32) self.single_action_space = gymnasium.spaces.MultiDiscrete([7, 13]) - - total_agents, agent_offsets =CyGPUDrive.get_total_agent_count( - num_envs, human_agent_idx, reward_vehicle_collision, reward_offroad_collision) - - self.num_agents = total_agents * 8 - print("Num agents: ", self.num_agents) + agent_offsets = binding.shared(num_envs=num_envs) + total_agents = agent_offsets[-1] + self.num_agents = total_agents super().__init__(buf=buf) - self.c_envs = CyGPUDrive(self.observations, self.actions, self.rewards, self.masks, - self.terminals, num_envs, human_agent_idx, reward_vehicle_collision, reward_offroad_collision, offsets = agent_offsets) + env_ids = [] + for i in range(num_envs): + cur = agent_offsets[i] + nxt = agent_offsets[i+1] + env_id = binding.env_init( + self.observations[cur:nxt], + self.actions[cur:nxt], + 
self.rewards[cur:nxt], + self.terminals[cur:nxt], + self.truncations[cur:nxt], + seed, + human_agent_idx=human_agent_idx, + reward_vehicle_collision=reward_vehicle_collision, + reward_offroad_collision=reward_offroad_collision, + env_id=i + ) + env_ids.append(env_id) + self.c_envs = binding.vectorize(*env_ids) - def reset(self, seed=None): - self.c_envs.reset() + def reset(self, seed=0): + binding.vec_reset(self.c_envs, seed) self.tick = 0 return self.observations, [] def step(self, actions): self.actions[:] = actions - self.c_envs.step() + binding.vec_step(self.c_envs) self.tick+=1 info = [] if self.tick % self.report_interval == 0: - log = self.c_envs.log() - if log['episode_length'] > 0: + log = binding.vec_log(self.c_envs) + if log: info.append(log) - info.append({'total_agents': self.num_agents}) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): - self.c_envs.render() + binding.vec_render(self.c_envs, 63) def close(self): - self.c_envs.close() + binding.vec_close(self.c_envs) + def calculate_area(p1, p2, p3): # Calculate the area of the triangle using the determinant method return 0.5 * abs((p1['x'] - p3['x']) * (p2['y'] - p1['y']) - (p1['x'] - p2['x']) * (p3['y'] - p1['y'])) @@ -204,6 +217,7 @@ def save_map_binary(map_data, output_file): f.write(struct.pack('f', float(goal_pos.get('y', 0.0)))) # Get y value f.write(struct.pack('f', float(goal_pos.get('z', 0.0)))) # Get z value f.write(struct.pack('i', road.get('mark_as_expert', 0))) + def load_map(map_name, binary_output=None): """Loads a JSON map and optionally saves it as binary""" with open(map_name, 'r') as f: @@ -211,9 +225,6 @@ def load_map(map_name, binary_output=None): if binary_output: save_map_binary(map_data, binary_output) - - entities = np.zeros(1, dtype=entity_dtype()) - return entities def process_all_maps(): """Process all maps and save them as binaries""" diff --git a/pufferlib/ocean/grid/grid.h b/pufferlib/ocean/grid/grid.h index 
161e725d0b..6c22b69406 100644 --- a/pufferlib/ocean/grid/grid.h +++ b/pufferlib/ocean/grid/grid.h @@ -496,7 +496,7 @@ void c_render(Grid* env) { float frac = 0.0; float overlay = 0.0; if (env->renderer == NULL) { - env->renderer = init_renderer(16, env->width, env->height); + env->renderer = init_renderer(16, env->max_size, env->max_size); } Renderer* renderer = env->renderer; diff --git a/pufferlib/ocean/pong/pong.c b/pufferlib/ocean/pong/pong.c index e3c104ab0c..fa1c6f5981 100644 --- a/pufferlib/ocean/pong/pong.c +++ b/pufferlib/ocean/pong/pong.c @@ -1,7 +1,8 @@ +#include #include "pong.h" #include "puffernet.h" -int main() { +void demo() { Weights* weights = load_weights("resources/pong_weights.bin", 133764); LinearLSTM* net = make_linearlstm(weights, 1, 8, 3); @@ -56,3 +57,40 @@ int main() { close_client(env.client); } +void test_performance(int timeout) { + Pong env = { + .width = 500, + .height = 640, + .paddle_width = 20, + .paddle_height = 70, + .ball_width = 32, + .ball_height = 32, + .paddle_speed = 8, + .ball_initial_speed_x = 10, + .ball_initial_speed_y = 1, + .ball_speed_y_increment = 3, + .ball_max_speed_y = 13, + .max_score = 21, + .frameskip = 1, + .continuous = 0, + }; + allocate(&env); + c_reset(&env); + + int start = time(NULL); + int num_steps = 0; + while (time(NULL) - start < timeout) { + env.actions[0] = rand() % 3; + c_step(&env); + num_steps++; + } + + int end = time(NULL); + float sps = num_steps / (end - start); + printf("Test Environment SPS: %f\n", sps); + free_allocated(&env); +} + +int main() { + test_performance(10); +} diff --git a/pufferlib/ocean/pong/pong.py b/pufferlib/ocean/pong/pong.py index affcd86cd2..1a37a693be 100644 --- a/pufferlib/ocean/pong/pong.py +++ b/pufferlib/ocean/pong/pong.py @@ -53,7 +53,7 @@ def __init__(self, num_envs=1, render_mode=None, max_score=max_score, frameskip=frameskip, continuous=continuous ) - def reset(self, seed=None): + def reset(self, seed=0): binding.vec_reset(self.c_envs, seed) self.tick = 0 
return self.observations, [] @@ -80,7 +80,6 @@ def render(self): def close(self): binding.vec_close(self.c_envs) -from pufferlib.ocean.pong.cy_pong import CyPong #from cy_pong import CyPong class CythonPong(pufferlib.PufferEnv): def __init__(self, num_envs=1, render_mode=None, @@ -158,4 +157,3 @@ def test_performance(cls, timeout=10, atn_cache=1024): if __name__ == '__main__': test_performance(Pong) - test_performance(CythonPong) diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index feeedecf32..ef05533b69 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -29,16 +29,17 @@ def __init__(self, env, hidden_size=512, output_size=512, **kwargs): #self.dtype = pufferlib.pytorch.nativize_dtype(env.emulated) self.num_actions = env.single_action_space.n self.factors = np.array([4, 4, 17, 5, 3, 5, 5, 5, 7, 4]) - self.offsets = torch.tensor([0] + list(np.cumsum(self.factors)[:-1])).cuda().view(1, -1, 1, 1) + offsets = torch.tensor([0] + list(np.cumsum(self.factors)[:-1])).view(1, -1, 1, 1) + self.register_buffer('offsets', offsets) self.cum_facs = np.cumsum(self.factors) self.multihot_dim = self.factors.sum() self.is_continuous = False self.map_2d = nn.Sequential( - pufferlib.pytorch.layer_init(nn.Conv2d(self.multihot_dim, 256, 5, stride=3)), + pufferlib.pytorch.layer_init(nn.Conv2d(self.multihot_dim, 128, 5, stride=3)), nn.ReLU(), - pufferlib.pytorch.layer_init(nn.Conv2d(256, 256, 3, stride=1)), + pufferlib.pytorch.layer_init(nn.Conv2d(128, 128, 3, stride=1)), nn.Flatten(), ) @@ -47,7 +48,7 @@ def __init__(self, env, hidden_size=512, output_size=512, **kwargs): nn.Flatten(), ) self.proj = nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(2073, hidden_size)), + pufferlib.pytorch.layer_init(nn.Linear(1817, hidden_size)), nn.ReLU(), ) @@ -56,10 +57,6 @@ def __init__(self, env, hidden_size=512, output_size=512, **kwargs): nn.Linear(output_size, self.num_actions), std=0.01) self.value_fn = 
pufferlib.pytorch.layer_init(nn.Linear(output_size, 1), std=1) - # Pre-allocate allows compilation - map_buf = torch.zeros(32768, self.multihot_dim, 11, 15, dtype=torch.float32) - self.register_buffer('map_buf', map_buf) - def forward(self, x, state=None): hidden = self.encode_observations(x) actions, value = self.decode_actions(hidden) @@ -75,15 +72,14 @@ def encode_observations(self, observations, state=None): ob_reward = observations[:, -10:] batch = ob_map.shape[0] - map_buf = self.map_buf[:batch] - map_buf.zero_() + map_buf = torch.zeros(batch, 59, 11, 15, dtype=torch.float32, device=observations.device) codes = ob_map.permute(0, 3, 1, 2) + self.offsets map_buf.scatter_(1, codes, 1) ob_map = self.map_2d(map_buf) player_discrete = self.player_discrete_encoder(ob_player.int()) - obs = torch.cat([ob_map, player_discrete, ob_player.float(), ob_reward], dim=1) + obs = torch.cat([ob_map, player_discrete, ob_player.to(ob_map.dtype), ob_reward], dim=1) obs = self.proj(obs) return obs @@ -344,10 +340,6 @@ def forward_train(self, x, state=None): def encode_observations(self, observations, state=None): cnn_features = observations[:, :-26].view(-1, 11, 11, 4).long() - if cnn_features[:, :, :, 0].max() > 15: - print('Invalid map value:', cnn_features[:, :, :, 0].max()) - breakpoint() - exit(1) map_features = F.one_hot(cnn_features[:, :, :, 0], 16).permute(0, 3, 1, 2).float() extra_map_features = (cnn_features[:, :, :, -3:].float() / 255).permute(0, 3, 1, 2) cnn_features = torch.cat([map_features, extra_map_features], dim=1) diff --git a/pufferlib/policy_ranker.py b/pufferlib/policy_ranker.py deleted file mode 100644 index 8282f9fc9e..0000000000 --- a/pufferlib/policy_ranker.py +++ /dev/null @@ -1,104 +0,0 @@ -from pdb import set_trace as T -import numpy as np - -import sqlite3 - -ANCHOR_ELO = 1000.0 - - -def win_prob(elo1, elo2): - '''Calculate win probability such that a difference of - 50/100/150 elo corresponds to win probabilitit 68/95/99.7%''' - return 1 / (1 + 10 ** 
((elo2 - elo1) / 400)) - -def update_elos(elos: np.ndarray, scores: np.ndarray, k: float = 4.0): - '''Update elos based on the result of a game - - The parameter k controls the magnitude of the update. - A higher k means that the elo will change more after a game. - This means that elos will converge faster but less precisely. - In particular, low k cannot distinguish between players of - similar skill, while a high k will just take longer to converge. - - The default is tuned for normally distributed player skill - You should lower it if you have very similar players. - Raise it if you are evaluating a diverse skill pool. - ''' - num_players = len(elos) - assert num_players == len(scores) - - elo_update = [[] for _ in range(num_players)] - for i in range(num_players): - for j in range(i+1, num_players): - delta = scores[i] - scores[j] - - # Convert to elo scoring format - if delta > 0: - score_i = 1 - elif delta == 0: - score_i = 0.5 - else: - score_i = 0 - - # Calculate elo update for pairs - expected_i = win_prob(elos[i], elos[j]) - expected_j = 1 - expected_i - score_j = 1 - score_i - - elo_update[i].append(k * (score_i - expected_i)) - elo_update[j].append(k * (score_j - expected_j)) - - elo_update = [np.mean(e) for e in elo_update] - return [elo + update for elo, update in zip(elos, elo_update)] - -class Ranker: - def __init__(self, db_path): - self.conn = sqlite3.connect(db_path) - with self.conn: - self.conn.execute(""" - CREATE TABLE IF NOT EXISTS ratings ( - policy TEXT PRIMARY KEY, - elo REAL - ); - """) - - def __repr__(self): - if len(self.ratings) == 0: - return '' - - sorted_dict = sorted(self.ratings.items(), key=lambda x: x[1], reverse=True) - return '\n'.join([ - f' - Policy: {name}, Elo: {elo:.3f}' - for name, elo in sorted_dict - ]) - - @property - def ratings(self): - with self.conn: - cursor = self.conn.execute("SELECT * FROM ratings;") - - return {row[0]: row[1] for row in cursor.fetchall()} - - def update(self, scores: dict): - if 
len(scores) < 2: - return - - # Load all elos from DB - elos = self.ratings - - flat_scores = [] - flat_elos = [] - for policy in scores.keys(): - flat_scores.append(scores[policy]) - if policy in elos: - flat_elos.append(elos[policy]) - else: - flat_elos.append(ANCHOR_ELO) - - flat_elos = update_elos(flat_elos, flat_scores) - elos = zip(scores.keys(), flat_elos) - with self.conn: - self.conn.executemany(""" - INSERT OR REPLACE INTO ratings (policy, elo) - VALUES (?, ?); - """, elos) diff --git a/pufferlib/policy_store.py b/pufferlib/policy_store.py deleted file mode 100644 index 7bbd96ad1a..0000000000 --- a/pufferlib/policy_store.py +++ /dev/null @@ -1,26 +0,0 @@ -from pdb import set_trace as T -import os -import torch - - -def get_policy_names(path: str) -> list: - # Assumeing that all pt files other than trainer_state.pt in the path are policy files - names = [] - for file in os.listdir(path): - if file.endswith(".pt") and file != 'trainer_state.pt': - names.append(file[:-3]) - return sorted(names) - -class PolicyStore: - def __init__(self, path: str): - self.path = path - - def policy_names(self) -> list: - return get_policy_names(self.path) - - def get_policy(self, name: str) -> torch.nn.Module: - path = os.path.join(self.path, name + '.pt') - try: - return torch.load(path) - except: - return torch.load(path, map_location=torch.device('cpu')) diff --git a/pufferlib/postprocess.py b/pufferlib/postprocess.py deleted file mode 100644 index cb311cc145..0000000000 --- a/pufferlib/postprocess.py +++ /dev/null @@ -1,219 +0,0 @@ -from pdb import set_trace as T -import numpy as np -import gymnasium - -import pufferlib.utils - -class ResizeObservation(gymnasium.Wrapper): - '''Fixed downscaling wrapper. Do NOT use gym.wrappers.ResizeObservation - It uses a laughably slow OpenCV resize. 
-50% on Atari just from that.''' - def __init__(self, env, downscale=2): - super().__init__(env) - self.downscale = downscale - y_size, x_size = env.observation_space.shape - assert y_size % downscale == 0 and x_size % downscale == 0 - y_size = env.observation_space.shape[0] // downscale - x_size = env.observation_space.shape[1] // downscale - self.observation_space = gymnasium.spaces.Box( - low=0, high=255, shape=(y_size, x_size), dtype=np.uint8) - - def reset(self, seed=None, options=None): - obs, info = self.env.reset(seed=seed, options=options) - return obs[::self.downscale, ::self.downscale], info - - def step(self, action): - obs, reward, terminal, truncated, info = self.env.step(action) - return obs[::self.downscale, ::self.downscale], reward, terminal, truncated, info - -class ClipAction(gymnasium.Wrapper): - '''Wrapper for Gymnasium environments that clips actions''' - def __init__(self, env): - self.env = env - assert isinstance(env.action_space, gymnasium.spaces.Box) - dtype_info = np.finfo(env.action_space.dtype) - self.action_space = gymnasium.spaces.Box( - low=dtype_info.min, - high=dtype_info.max, - shape=env.action_space.shape, - dtype=env.action_space.dtype, - ) - - def step(self, action): - action = np.clip(action, self.env.action_space.low, self.env.action_space.high) - return self.env.step(action) - - -class EpisodeStats(gymnasium.Wrapper): - '''Wrapper for Gymnasium environments that stores - episodic returns and lengths in infos''' - def __init__(self, env): - self.env = env - self.observation_space = env.observation_space - self.action_space = env.action_space - self.reset() - - def reset(self, seed=None, options=None): - self.info = dict(episode_return=[], episode_length=0) - # TODO: options - return self.env.reset(seed=seed)#, options=options) - - def step(self, action): - observation, reward, terminated, truncated, info = super().step(action) - - for k, v in pufferlib.utils.unroll_nested_dict(info): - if k not in self.info: - self.info[k] 
= [] - - self.info[k].append(v) - - self.info['episode_return'].append(reward) - self.info['episode_length'] += 1 - - info = {} - if terminated or truncated: - for k, v in self.info.items(): - try: - info[k] = sum(v) - continue - except TypeError: - pass - - if isinstance(v, str): - info[k] = v - continue - - try: - x = int(v) # probably a value - info[k] = v - continue - except TypeError: - pass - - return observation, reward, terminated, truncated, info - -class PettingZooWrapper: - '''PettingZoo does not provide a ParallelEnv wrapper. This code is adapted from - their AEC wrapper, to prevent unneeded conversions to/from AEC''' - def __init__(self, env): - self.env = env - - def __getattr__(self, name): - '''Returns an attribute with ``name``, unless ``name`` starts with an underscore.''' - if name.startswith('_') and name != '_cumulative_rewards': - raise AttributeError(f'accessing private attribute "{name}" is prohibited') - return getattr(self.env, name) - - @property - def unwrapped(self): - return self.env.unwrapped - - def close(self): - self.env.close() - - def render(self): - return self.env.render() - - def reset(self, seed=None, options=None): - try: - return self.env.reset(seed=seed, options=options) - except TypeError: - return self.env.reset(seed=seed) - - def observe(self, agent): - return self.env.observe(agent) - - def state(self): - return self.env.state() - - def step(self, action): - return self.env.step(action) - - def observation_space(self, agent): - return self.env.observation_space(agent) - - def action_space(self, agent): - return self.env.action_space(agent) - - def __str__(self) -> str: - '''Returns a name which looks like: "max_observation".''' - return f'{type(self).__name__}<{str(self.env)}>' - -class MeanOverAgents(PettingZooWrapper): - '''Averages over agent infos''' - def _mean(self, infos): - list_infos = {} - for agent, info in infos.items(): - for k, v in info.items(): - if k not in list_infos: - list_infos[k] = [] - - 
list_infos[k].append(v) - - mean_infos = {} - for k, v in list_infos.items(): - try: - mean_infos[k] = np.mean(v) - except: - pass - - return mean_infos - - def reset(self, seed=None, options=None): - observations, infos = super().reset(seed, options) - infos = self._mean(infos) - return observations, infos - - def step(self, actions): - observations, rewards, terminations, truncations, infos = super().step(actions) - infos = self._mean(infos) - return observations, rewards, terminations, truncations, infos - -class MultiagentEpisodeStats(PettingZooWrapper): - '''Wrapper for PettingZoo environments that stores - episodic returns and lengths in infos''' - def reset(self, seed=None, options=None): - observations, infos = super().reset(seed=seed, options=options) - self.infos = { - agent: dict(episode_return=[], episode_length=0) - for agent in self.possible_agents - } - return observations, infos - - def step(self, actions): - observations, rewards, terminations, truncations, infos = super().step(actions) - - all_infos = {} - for agent in infos: - agent_info = self.infos[agent] - for k, v in pufferlib.utils.unroll_nested_dict(infos[agent]): - if k not in agent_info: - agent_info[k] = [] - - agent_info[k].append(v) - - # Saved to self. 
TODO: Clean up - agent_info['episode_return'].append(rewards[agent]) - agent_info['episode_length'] += 1 - - agent_info = {} - all_infos[agent] = agent_info - if terminations[agent] or truncations[agent]: - for k, v in self.infos[agent].items(): - try: - agent_info[k] = sum(v) - continue - except TypeError: - pass - - if isinstance(v, str): - agent_info[k] = v - continue - - try: - x = int(v) # probably a value - agent_info[k] = v - continue - except TypeError: - pass - - return observations, rewards, terminations, truncations, all_infos diff --git a/pufferlib/pufferlib.cu b/pufferlib/pufferlib.cu new file mode 100644 index 0000000000..c979fcf790 --- /dev/null +++ b/pufferlib/pufferlib.cu @@ -0,0 +1,94 @@ +#include +#include +#include + +namespace pufferlib { + +static const int max_horizon = 256; +__host__ __device__ void puff_advantage_row_cuda(float* values, float* rewards, float* dones, + float* importance, float* advantages, float gamma, float lambda, + float rho_clip, float c_clip, int horizon) { + float lastpufferlam = 0; + for (int t = horizon-2; t >= 0; t--) { + int t_next = t + 1; + float nextnonterminal = 1.0 - dones[t_next]; + float rho_t = fminf(importance[t], rho_clip); + float c_t = fminf(importance[t], c_clip); + // TODO: t_next works and t doesn't. 
Check original formula + float delta = rho_t*(rewards[t_next] + gamma*values[t_next]*nextnonterminal - values[t]); + lastpufferlam = delta + gamma*lambda*c_t*lastpufferlam*nextnonterminal; + advantages[t] = lastpufferlam; + } +} + +void vtrace_check_cuda(torch::Tensor values, torch::Tensor rewards, + torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages, + int num_steps, int horizon) { + + // Validate input tensors + torch::Device device = values.device(); + for (const torch::Tensor& t : {values, rewards, dones, importance, advantages}) { + TORCH_CHECK(t.dim() == 2, "Tensor must be 2D"); + TORCH_CHECK(t.device() == device, "All tensors must be on same device"); + TORCH_CHECK(t.size(0) == num_steps, "First dimension must match num_steps"); + TORCH_CHECK(t.size(1) == horizon, "Second dimension must match horizon"); + TORCH_CHECK(t.dtype() == torch::kFloat32, "All tensors must be float32"); + assert(horizon <= max_horizon); + if (!t.is_contiguous()) { + t.contiguous(); + } + } +} + + + // [num_steps, horizon] +__global__ void puff_advantage_kernel(float* values, float* rewards, + float* dones, float* importance, float* advantages, float gamma, + float lambda, float rho_clip, float c_clip, int num_steps, int horizon) { + int row = blockIdx.x*blockDim.x + threadIdx.x; + int offset = row*horizon; + puff_advantage_row_cuda(values + offset, rewards + offset, dones + offset, + importance + offset, advantages + offset, gamma, lambda, rho_clip, c_clip, horizon); +} + +void compute_puff_advantage_cuda(torch::Tensor values, torch::Tensor rewards, + torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages, + double gamma, double lambda, double rho_clip, double c_clip) { + int num_steps = values.size(0); + int horizon = values.size(1); + vtrace_check_cuda(values, rewards, dones, importance, advantages, num_steps, horizon); + TORCH_CHECK(values.is_cuda(), "All tensors must be on GPU"); + assert(horizon <= max_horizon); + + int threads_per_block = 
256; + if (threads_per_block > num_steps) { + threads_per_block = 2*(num_steps/2); + } + int blocks = (num_steps + threads_per_block - 1) / threads_per_block; + assert(num_steps % threads_per_block == 0); + + puff_advantage_kernel<<>>( + values.data_ptr(), + rewards.data_ptr(), + dones.data_ptr(), + importance.data_ptr(), + advantages.data_ptr(), + gamma, + lambda, + rho_clip, + c_clip, + num_steps, + horizon + ); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +TORCH_LIBRARY_IMPL(pufferlib, CUDA, m) { + m.impl("compute_puff_advantage", &compute_puff_advantage_cuda); +} + +} diff --git a/pufferlib/pufferlib.py b/pufferlib/pufferlib.py new file mode 100644 index 0000000000..e3f95643ba --- /dev/null +++ b/pufferlib/pufferlib.py @@ -0,0 +1,455 @@ +import os +import sys +import warnings + +from contextlib import redirect_stdout, redirect_stderr, contextmanager +from types import SimpleNamespace +from collections.abc import Mapping +from io import StringIO +from functools import wraps + +import numpy as np +import gymnasium + +import pufferlib.spaces + +ENV_ERROR = ''' +Environment missing required attribute {}. The most common cause is +calling super() before you have assigned the attribute. +''' + + +def set_buffers(env, buf=None): + if buf is None: + obs_space = env.single_observation_space + env.observations = np.zeros((env.num_agents, *obs_space.shape), dtype=obs_space.dtype) + env.rewards = np.zeros(env.num_agents, dtype=np.float32) + env.terminals = np.zeros(env.num_agents, dtype=bool) + env.truncations = np.zeros(env.num_agents, dtype=bool) + env.masks = np.ones(env.num_agents, dtype=bool) + + # TODO: Major kerfuffle on inferring action space dtype. This needs some asserts? 
+ atn_space = pufferlib.spaces.joint_space(env.single_action_space, env.num_agents) + if isinstance(env.single_action_space, pufferlib.spaces.Box): + env.actions = np.zeros(atn_space.shape, dtype=atn_space.dtype) + else: + env.actions = np.zeros(atn_space.shape, dtype=np.int32) + else: + env.observations = buf['observations'] + env.rewards = buf['rewards'] + env.terminals = buf['terminals'] + env.truncations = buf['truncations'] + env.masks = buf['masks'] + env.actions = buf['actions'] + +class PufferEnv: + def __init__(self, buf=None): + if not hasattr(self, 'single_observation_space'): + raise APIUsageError(ENV_ERROR.format('single_observation_space')) + if not hasattr(self, 'single_action_space'): + raise APIUsageError(ENV_ERROR.format('single_action_space')) + if not hasattr(self, 'num_agents'): + raise APIUsageError(ENV_ERROR.format('num_agents')) + if self.num_agents < 1: + raise APIUsageError('num_agents must be >= 1') + + if hasattr(self, 'observation_space'): + raise APIUsageError('PufferEnvs must define single_observation_space, not observation_space') + if hasattr(self, 'action_space'): + raise APIUsageError('PufferEnvs must define single_action_space, not action_space') + if not isinstance(self.single_observation_space, pufferlib.spaces.Box): + raise APIUsageError('Native observation_space must be a Box') + if (not isinstance(self.single_action_space, pufferlib.spaces.Discrete) + and not isinstance(self.single_action_space, pufferlib.spaces.MultiDiscrete) + and not isinstance(self.single_action_space, pufferlib.spaces.Box)): + raise APIUsageError('Native action_space must be a Discrete, MultiDiscrete, or Box') + + set_buffers(self, buf) + + self.action_space = pufferlib.spaces.joint_space(self.single_action_space, self.num_agents) + self.observation_space = pufferlib.spaces.joint_space(self.single_observation_space, self.num_agents) + self.agent_ids = np.arange(self.num_agents) + + @property + def agent_per_batch(self): + return self.num_agents + + 
@property + def emulated(self): + '''Native envs do not use emulation''' + return False + + @property + def done(self): + '''Native envs handle resets internally''' + return False + + @property + def driver_env(self): + '''For compatibility with Multiprocessing''' + return self + + def reset(self, seed=None): + raise NotImplementedError + + def step(self, actions): + raise NotImplementedError + + def close(self): + raise NotImplementedError + + def async_reset(self, seed=None): + _, self.infos = self.reset(seed) + assert isinstance(self.infos, list), 'PufferEnvs must return info as a list of dicts' + + def send(self, actions): + _, _, _, _, self.infos = self.step(actions) + assert isinstance(self.infos, list), 'PufferEnvs must return info as a list of dicts' + + def recv(self): + return (self.observations, self.rewards, self.terminals, + self.truncations, self.infos, self.agent_ids, self.masks) +### Postprocessing +class ResizeObservation(gymnasium.Wrapper): + '''Fixed downscaling wrapper. Do NOT use gym.wrappers.ResizeObservation + It uses a laughably slow OpenCV resize. 
-50% on Atari just from that.''' + def __init__(self, env, downscale=2): + super().__init__(env) + self.downscale = downscale + y_size, x_size = env.observation_space.shape + assert y_size % downscale == 0 and x_size % downscale == 0 + y_size = env.observation_space.shape[0] // downscale + x_size = env.observation_space.shape[1] // downscale + self.observation_space = gymnasium.spaces.Box( + low=0, high=255, shape=(y_size, x_size), dtype=np.uint8) + + def reset(self, seed=None, options=None): + obs, info = self.env.reset(seed=seed, options=options) + return obs[::self.downscale, ::self.downscale], info + + def step(self, action): + obs, reward, terminal, truncated, info = self.env.step(action) + return obs[::self.downscale, ::self.downscale], reward, terminal, truncated, info + +class ClipAction(gymnasium.Wrapper): + '''Wrapper for Gymnasium environments that clips actions''' + def __init__(self, env): + self.env = env + assert isinstance(env.action_space, gymnasium.spaces.Box) + dtype_info = np.finfo(env.action_space.dtype) + self.action_space = gymnasium.spaces.Box( + low=dtype_info.min, + high=dtype_info.max, + shape=env.action_space.shape, + dtype=env.action_space.dtype, + ) + + def step(self, action): + action = np.clip(action, self.env.action_space.low, self.env.action_space.high) + return self.env.step(action) + + +class EpisodeStats(gymnasium.Wrapper): + '''Wrapper for Gymnasium environments that stores + episodic returns and lengths in infos''' + def __init__(self, env): + self.env = env + self.observation_space = env.observation_space + self.action_space = env.action_space + self.reset() + + def reset(self, seed=None, options=None): + self.info = dict(episode_return=[], episode_length=0) + # TODO: options + return self.env.reset(seed=seed)#, options=options) + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + + for k, v in unroll_nested_dict(info): + if k not in self.info: + self.info[k] = [] + + 
self.info[k].append(v) + + self.info['episode_return'].append(reward) + self.info['episode_length'] += 1 + + info = {} + if terminated or truncated: + for k, v in self.info.items(): + try: + info[k] = sum(v) + continue + except TypeError: + pass + + if isinstance(v, str): + info[k] = v + continue + + try: + x = int(v) # probably a value + info[k] = v + continue + except TypeError: + pass + + return observation, reward, terminated, truncated, info + +class PettingZooWrapper: + '''PettingZoo does not provide a ParallelEnv wrapper. This code is adapted from + their AEC wrapper, to prevent unneeded conversions to/from AEC''' + def __init__(self, env): + self.env = env + + def __getattr__(self, name): + '''Returns an attribute with ``name``, unless ``name`` starts with an underscore.''' + if name.startswith('_') and name != '_cumulative_rewards': + raise AttributeError(f'accessing private attribute "{name}" is prohibited') + return getattr(self.env, name) + + @property + def unwrapped(self): + return self.env.unwrapped + + def close(self): + self.env.close() + + def render(self): + return self.env.render() + + def reset(self, seed=None, options=None): + try: + return self.env.reset(seed=seed, options=options) + except TypeError: + return self.env.reset(seed=seed) + + def observe(self, agent): + return self.env.observe(agent) + + def state(self): + return self.env.state() + + def step(self, action): + return self.env.step(action) + + def observation_space(self, agent): + return self.env.observation_space(agent) + + def action_space(self, agent): + return self.env.action_space(agent) + + def __str__(self) -> str: + '''Returns a name which looks like: "max_observation".''' + return f'{type(self).__name__}<{str(self.env)}>' + +class MeanOverAgents(PettingZooWrapper): + '''Averages over agent infos''' + def _mean(self, infos): + list_infos = {} + for agent, info in infos.items(): + for k, v in info.items(): + if k not in list_infos: + list_infos[k] = [] + + 
list_infos[k].append(v) + + mean_infos = {} + for k, v in list_infos.items(): + try: + mean_infos[k] = np.mean(v) + except: + pass + + return mean_infos + + def reset(self, seed=None, options=None): + observations, infos = super().reset(seed, options) + infos = self._mean(infos) + return observations, infos + + def step(self, actions): + observations, rewards, terminations, truncations, infos = super().step(actions) + infos = self._mean(infos) + return observations, rewards, terminations, truncations, infos + +class MultiagentEpisodeStats(PettingZooWrapper): + '''Wrapper for PettingZoo environments that stores + episodic returns and lengths in infos''' + def reset(self, seed=None, options=None): + observations, infos = super().reset(seed=seed, options=options) + self.infos = { + agent: dict(episode_return=[], episode_length=0) + for agent in self.possible_agents + } + return observations, infos + + def step(self, actions): + observations, rewards, terminations, truncations, infos = super().step(actions) + + all_infos = {} + for agent in infos: + agent_info = self.infos[agent] + for k, v in unroll_nested_dict(infos[agent]): + if k not in agent_info: + agent_info[k] = [] + + agent_info[k].append(v) + + # Saved to self. 
TODO: Clean up + agent_info['episode_return'].append(rewards[agent]) + agent_info['episode_length'] += 1 + + agent_info = {} + all_infos[agent] = agent_info + if terminations[agent] or truncations[agent]: + for k, v in self.infos[agent].items(): + try: + agent_info[k] = sum(v) + continue + except TypeError: + pass + + if isinstance(v, str): + agent_info[k] = v + continue + + try: + x = int(v) # probably a value + agent_info[k] = v + continue + except TypeError: + pass + + return observations, rewards, terminations, truncations, all_infos +### Exceptions +class EnvironmentSetupError(RuntimeError): + def __init__(self, e, package): + super().__init__(self.message) + +class APIUsageError(RuntimeError): + """Exception raised when the API is used incorrectly.""" + + def __init__(self, message="API usage error."): + self.message = message + super().__init__(self.message) + +class InvalidAgentError(ValueError): + """Exception raised when an invalid agent key is used.""" + + def __init__(self, agent_id, agents): + message = ( + f'Invalid agent/team ({agent_id}) specified. 
' + f'Valid values:\n{agents}' + ) + super().__init__(message) + +class GymToGymnasium: + def __init__(self, env): + self.env = env + self.observation_space = env.observation_space + self.action_space = env.action_space + self.render = env.render + self.metadata = env.metadata + + def reset(self, seed=None, options=None): + if seed is not None: + ob = self.env.reset(seed=seed) + else: + ob = self.env.reset() + return ob, {} + + def step(self, action): + observation, reward, done, info = self.env.step(action) + return observation, reward, done, False, info + + def close(self): + self.env.close() + +### Wrappers +class PettingZooTruncatedWrapper: + def __init__(self, env): + self.env = env + self.observation_space = env.observation_space + self.action_space = env.action_space + self.render = env.render + + @property + def render_mode(self): + return self.env.render_mode + + @property + def possible_agents(self): + return self.env.possible_agents + + @property + def agents(self): + return self.env.agents + + def reset(self, seed=None): + if seed is not None: + ob, info = self.env.reset(seed=seed) + else: + ob, info = self.env.reset() + info = {k: {} for k in ob} + return ob, info + + def step(self, actions): + observations, rewards, terminals, truncations, infos = self.env.step(actions) + return observations, rewards, terminals, truncations, infos + + def close(self): + self.env.close() + +### Misc +def unroll_nested_dict(d): + if not isinstance(d, dict): + return d + + for k, v in d.items(): + if isinstance(v, dict): + for k2, v2 in unroll_nested_dict(v): + yield f"{k}/{k2}", v2 + else: + yield k, v + +def silence_warnings(original_func, category=DeprecationWarning): + @wraps(original_func) + def wrapper(*args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=category) + return original_func(*args, **kwargs) + return wrapper + +class Suppress(): + def __init__(self): + self.f = StringIO() + self.null_1 = os.open(os.devnull, 
os.O_WRONLY | os.O_TRUNC | os.O_CREAT) + self.null_2 = os.open(os.devnull, os.O_WRONLY | os.O_TRUNC | os.O_CREAT) + + def __enter__(self): + # Suppress C library outputs + self.orig_stdout = os.dup(1) + self.orig_stderr = os.dup(2) + os.dup2(self.null_1, 1) + os.dup2(self.null_2, 2) + + # Suppress Python outputs + self._stdout_redirector = redirect_stdout(self.f) + self._stderr_redirector = redirect_stderr(self.f) + self._stdout_redirector.__enter__() + self._stderr_redirector.__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + # Enable C library outputs + os.dup2(self.orig_stdout, 1) + os.dup2(self.orig_stderr, 2) + os.close(self.orig_stdout) + os.close(self.orig_stderr) + os.close(self.null_1) + os.close(self.null_2) + + # Enable Python outputs + self._stdout_redirector.__exit__(exc_type, exc_val, exc_tb) + self._stderr_redirector.__exit__(exc_type, exc_val, exc_tb) diff --git a/pufferlib/pytorch.py b/pufferlib/pytorch.py index 4d4581cd70..fd95fd714d 100644 --- a/pufferlib/pytorch.py +++ b/pufferlib/pytorch.py @@ -47,7 +47,7 @@ # TODO: handle discrete obs # Spend some time trying to break this fn with differnt obs -def nativize_dtype(emulated: pufferlib.namespace) -> NativeDType: +def nativize_dtype(emulated) -> NativeDType: # sample dtype - the dtype of what we obtain from the environment (usually bytes) sample_dtype: np.dtype = emulated.observation_dtype # structured dtype - the gym.Space converted numpy dtype @@ -100,10 +100,7 @@ def _nativize_dtype(sample_dtype: np.dtype, return subviews, dtype, shape, start_offset, all_delta -def nativize_tensor( - observation: torch.Tensor, - native_dtype: NativeDType, -) -> torch.Tensor | dict[str, torch.Tensor]: +def nativize_tensor(observation: torch.Tensor, native_dtype: NativeDType): return _nativize_tensor(observation, native_dtype) @@ -124,9 +121,7 @@ def compilable_cast(u8, dtype): return u8.view(dtype) # breaking cast -def _nativize_tensor( - observation: torch.Tensor, native_dtype: NativeDType -) -> 
torch.Tensor | dict[str, torch.Tensor]: +def _nativize_tensor(observation: torch.Tensor, native_dtype: NativeDType): if isinstance(native_dtype, tuple): dtype, shape, offset, delta = native_dtype torch._check_is_size(offset) @@ -157,13 +152,11 @@ def nativize_observation(observation, emulated): ) -def flattened_tensor_size(native_dtype: tuple[torch.dtype, tuple[int], int, int]): +def flattened_tensor_size(native_dtype): return _flattened_tensor_size(native_dtype) -def _flattened_tensor_size( - native_dtype: tuple[torch.dtype, tuple[int], int, int], -) -> int: +def _flattened_tensor_size(native_dtype): if isinstance(native_dtype, tuple): return np.prod(native_dtype[1]) # shape else: @@ -277,11 +270,9 @@ def entropy_probs(logits, probs): p_log_p = logits * probs return -p_log_p.sum(-1) - -def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], - action=None, is_continuous=False): +def sample_logits(logits, action=None): is_discrete = isinstance(logits, torch.Tensor) - if is_continuous: + if isinstance(logits, torch.distributions.Normal): batch = logits.loc.shape[0] if action is None: action = logits.sample().view(batch, -1) @@ -291,6 +282,7 @@ def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], return action, log_probs, logits_entropy elif is_discrete: logits = logits.unsqueeze(0) + # TODO: Double check this else: #multi-discrete logits = torch.nn.utils.rnn.pad_sequence( [l.transpose(0,1) for l in logits], @@ -299,15 +291,15 @@ def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], ).permute(1,2,0) normalized_logits = logits - logits.logsumexp(dim=-1, keepdim=True) - probs = logits_to_probs(normalized_logits) + probs = logits_to_probs(logits) if action is None: + probs = torch.nan_to_num(probs, 1e-8, 1e-8, 1e-8) action = torch.multinomial(probs.reshape(-1, probs.shape[-1]), 1, replacement=True) action = action.reshape(probs.shape[:-1]) else: batch = logits[0].shape[0] action = action.view(batch, -1).T - probs = 
logits_to_probs(normalized_logits) assert len(logits) == len(action) logprob = log_prob(normalized_logits, action) @@ -317,6 +309,3 @@ def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]], return action.squeeze(0), logprob.squeeze(0), logits_entropy.squeeze(0) return action.T, logprob.sum(0), logits_entropy - - - diff --git a/shared.cpp b/pufferlib/shared.cpp similarity index 99% rename from shared.cpp rename to pufferlib/shared.cpp index 1216564779..791215b0af 100644 --- a/shared.cpp +++ b/pufferlib/shared.cpp @@ -6,7 +6,9 @@ #define __device__ #endif +const int max_horizon = 256; // [horizon] +/* __host__ __device__ void gae_row(float* values, float* rewards, float* dones, float* advantages, float gamma, float gae_lambda, int horizon) { float lastgaelam = 0; @@ -46,7 +48,6 @@ torch::Tensor gae_check(torch::Tensor values, torch::Tensor rewards, } // [horizon] -const int max_horizon = 256; __host__ __device__ void vtrace_row(float* values, float* rewards, float* dones, float* importance, float* vs, float* advantages, float gamma, float rho_clip, float c_clip, int horizon) { float accum = 0.0;//values[horizon-1]; // Is this correct? 
@@ -62,6 +63,7 @@ __host__ __device__ void vtrace_row(float* values, float* rewards, float* dones, vs[t] = accum + values[t]; } } +*/ __host__ __device__ void puff_advantage_row(float* values, float* rewards, float* dones, float* importance, float* vs, float* advantages, float gamma, float lambda, diff --git a/pufferlib/spaces.py b/pufferlib/spaces.py index b5bab9e6cc..178513c02c 100644 --- a/pufferlib/spaces.py +++ b/pufferlib/spaces.py @@ -17,9 +17,10 @@ def joint_space(space, n): high=np.repeat(space.nvec[None] - 1, n, axis=0), shape=(n, len(space)), dtype=space.dtype) elif isinstance(space, Box): - return gymnasium.spaces.Box( - low=np.repeat(space.low[None], n, axis=0), - high=np.repeat(space.high[None], n, axis=0), - shape=(n, *space.shape), dtype=space.dtype) + low = np.repeat(space.low[None], n, axis=0).squeeze() + high = np.repeat(space.high[None], n, axis=0).squeeze() + shape = [n, *[e for e in space.shape if e != 1]] + return gymnasium.spaces.Box(low=low, high=high, + shape=shape, dtype=space.dtype) else: raise ValueError(f'Unsupported space: {space}') diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py index 49c2494a8d..63e06b186d 100644 --- a/pufferlib/sweep.py +++ b/pufferlib/sweep.py @@ -6,7 +6,6 @@ from copy import deepcopy import pufferlib -import scipy.stats import torch import pyro @@ -115,7 +114,7 @@ def unnormalize(self, value): def _params_from_puffer_sweep(sweep_config): param_spaces = {} for name, param in sweep_config.items(): - if name in ('method', 'name', 'metric', 'max_score'): + if name in ('method', 'metric', 'goal'): continue assert isinstance(param, dict) @@ -152,12 +151,13 @@ def _params_from_puffer_sweep(sweep_config): class Hyperparameters: def __init__(self, config, verbose=True): self.spaces = _params_from_puffer_sweep(config) - self.flat_spaces = dict(pufferlib.utils.unroll_nested_dict(self.spaces)) + self.flat_spaces = dict(pufferlib.unroll_nested_dict(self.spaces)) self.num = len(self.flat_spaces) self.metric = 
config['metric'] - assert self.metric['goal'] in ['maximize', 'minimize'] - self.optimize_direction = 1 if self.metric['goal'] == 'maximize' else -1 + goal = config['goal'] + assert goal in ('maximize', 'minimize') + self.optimize_direction = 1 if goal == 'maximize' else -1 self.search_centers = np.array([ e.norm_mean for e in self.flat_spaces.values()]) @@ -191,7 +191,7 @@ def sample(self, n, mu=None, scale=1): return np.clip(samples, self.min_bounds, self.max_bounds) def from_dict(self, params): - flat_params = dict(pufferlib.utils.unroll_nested_dict(params)) + flat_params = dict(pufferlib.unroll_nested_dict(params)) values = [] for key, space in self.flat_spaces.items(): assert key in flat_params, f'Missing hyperparameter {key}' @@ -325,15 +325,17 @@ def create_gp(x_dim, scale_length=1.0): optimizer = torch.optim.Adam(model.parameters(), lr=0.0001) return model, optimizer +# TODO: Eval defaults class Protein: def __init__(self, sweep_config, - max_suggestion_cost = None, - resample_frequency = 5, - num_random_samples = 10, + max_suggestion_cost = 3600, + resample_frequency = 0, + num_random_samples = 50, global_search_scale = 1, random_suggestions = 1024, suggestions_per_pareto = 256, + seed_with_search_center = True, min_score = None, max_score = None, ): @@ -350,6 +352,7 @@ def __init__(self, self.global_search_scale = global_search_scale self.random_suggestions = random_suggestions self.suggestions_per_pareto = suggestions_per_pareto + self.seed_with_search_center = seed_with_search_center self.resample_frequency = resample_frequency self.max_suggestion_cost = max_suggestion_cost @@ -363,15 +366,11 @@ def __init__(self, def suggest(self, fill): # TODO: Clip random samples to bounds so we don't get bad high cost samples info = {} - #if self.suggestion_idx <= self.num_random_samples: - # suggestions = self.hyperparameters.sample(self.random_suggestions) - # best_idx = np.random.randint(0, self.random_suggestions) - # best = suggestions[best_idx] 
self.suggestion_idx += 1 - if len(self.success_observations) == 0: + if len(self.success_observations) == 0 and self.seed_with_search_center: best = self.hyperparameters.search_centers return self.hyperparameters.to_dict(best, fill), info - elif len(self.success_observations) < self.num_random_samples: + elif not self.seed_with_search_center and len(self.success_observations) < self.num_random_samples: suggestions = self.hyperparameters.sample(self.random_suggestions) self.suggestion = random.choice(suggestions) return self.hyperparameters.to_dict(self.suggestion, fill), info @@ -392,26 +391,21 @@ def suggest(self, fill): # Transformed scores min_score = self.min_score if min_score is None: - min_score = np.min(y) - np.min(np.abs(y)) + min_score = np.min(y) if np.min(y) < min_score - 1e-6: raise ValueError(f'Min score {min_score} is less than min score in data {np.min(y)}') max_score = self.max_score if max_score is None: - max_score = np.max(y) + np.max(np.abs(y)) + max_score = np.max(y) if np.max(y) > max_score + 1e-6: raise ValueError(f'Max score {max_score} is greater than max score in data {np.max(y)}') - # Linearize, exp transform, linearize - y_norm = (y - min_score) / (max_score - min_score) - #yt = -np.log(1 - y_norm + eps) - #yt_min = np.min(yt) - #yt_max = np.max(yt) - #yt_norm = (yt - yt_min) / (yt_max - yt_min) + # Linearize + y_norm = (y - min_score) / (np.abs(max_score - min_score) + 1e-6) - #self.gp_score.set_data(params, torch.from_numpy(yt_norm)) self.gp_score.set_data(params, torch.from_numpy(y_norm)) self.gp_score.train() gp.util.train(self.gp_score, self.score_opt) @@ -425,23 +419,20 @@ def suggest(self, fill): # Linear input norm creates clean 1 mean fn log_c_min = np.min(log_c) log_c_max = np.max(log_c) - log_c_norm = (log_c - log_c_min) / (log_c_max - log_c_min) + log_c_norm = (log_c - log_c_min) / (log_c_max - log_c_min + 1e-6) self.gp_cost.mean_function = lambda x: 1 self.gp_cost.set_data(params, torch.from_numpy(log_c_norm)) 
self.gp_cost.train() - gp.util.train(self.gp_cost, self.cost_opt) + try: + gp.util.train(self.gp_cost, self.cost_opt) + except: + breakpoint() self.gp_cost.eval() candidates, pareto_idxs = pareto_points(self.success_observations) pareto_costs = np.array([e['cost'] for e in candidates]) - #cost_dists = np.abs(np.log(pareto_costs[:, None]) - np.log(pareto_costs[None, :])) - ###cost_dists = np.abs(pareto_costs[:, None] - pareto_costs[None, :]) - #cost_dists += (np.max(pareto_costs) + 1)*np.eye(len(pareto_costs)) # mask self-distance - #idx = np.argmax(np.min(cost_dists, axis=1)) - #search_centers = candidates[idx]['input'] - ### Sample suggestions search_centers = np.stack([e['input'] for e in candidates]) suggestions = self.hyperparameters.sample( @@ -456,10 +447,7 @@ def suggest(self, fill): gp_y_norm = gp_y_norm.numpy() gp_log_c_norm = gp_log_c_norm.numpy() - # Unlinearize, inverse exp transform, unlinearize - #gp_yt = gp_yt_norm*(yt_max - yt_min) + yt_min - #gp_y_norm = -(np.exp(-gp_yt) - 1 - eps) - #gp_y = gp_y_norm*(max_score - min_score) + min_score + # Unlinearize gp_y = gp_y_norm*(max_score - min_score) + min_score gp_log_c = gp_log_c_norm*(log_c_max - log_c_min) + log_c_min @@ -467,94 +455,31 @@ def suggest(self, fill): gp_c_min = np.min(gp_c) gp_c_max = np.max(gp_c) - gp_c_norm = (gp_c - gp_c_min) / (gp_c_max - gp_c_min) + gp_c_norm = (gp_c - gp_c_min) / (gp_c_max - gp_c_min + 1e-6) pareto_y = y[pareto_idxs] - #pareto_yt = yt[pareto_idxs] - #pareto_yt_norm = yt_norm[pareto_idxs] pareto_c = c[pareto_idxs] pareto_log_c_norm = log_c_norm[pareto_idxs] max_c = np.max(c) min_c = np.min(c) - c_right = abs(pareto_log_c_norm[None, :] - gp_log_c_norm[:, None]) - - #pareto_c_norm = (pareto_c - min_c) / (max_c - min_c) - #gp_c_norm = (gp_c - min_c) / (max_c - min_c) - #c_right = np.abs(pareto_c_norm[None, :] - gp_c_norm[:, None]) - - #pareto_log_c_norm = (np.log(pareto_c) - log_c_min) / (log_c_max - log_c_min) - #c_right = np.abs(pareto_log_c_norm[None, :] - 
gp_log_c_norm[:, None]) - - sorted_dist = np.sort(c_right, axis=1) - #top_k = sorted_dist[:, :5] - #pareto_dist_weight = np.sum(top_k, axis=1) / top_k.shape[1] - - nearest_idx = np.argmin(c_right, axis=1) - nearest_pareto_dist = np.min(c_right, axis=1) - nearest_pareto_y = pareto_y[nearest_idx] - - #c_left = np.abs(gp_c[:, None] - pareto_c[None, :]) - #c_left[c_left < 0] = np.inf - #nearest_idx = np.argmin(c_left, axis=1) - #nearest_pareto_yt_norm = pareto_yt_norm[nearest_idx] - max_c_mask = gp_c < self.max_suggestion_cost - #suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( - # gp_yt_norm - nearest_pareto_yt_norm) * nearest_pareto_dist - - #suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( - # gp_yt_norm - nearest_pareto_yt_norm)# / gp_c - - #np.argwhere(gp_c > c) - cumsum_mask = c[None, :] <= np.clip(gp_c[:, None], min_c, max_c) - cumsum_mask = cumsum_mask * c[None, :] - cumsum = np.sum(cumsum_mask, axis=1) / np.sum(c) - target = gp_c_norm - weight = target - cumsum - - #if np.random.rand() < 0.5: - # score = gp_y_norm - #else: - # score = gp_y_norm * weight - #suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( - # score)# / gp_c - target = 1.25*np.random.rand() weight = 1 - abs(target - gp_log_c_norm) suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( - gp_y_norm*weight)# / gp_c - - #suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * ( - # gp_y_norm*nearest_pareto_dist)# / gp_c - - #exp_scores = np.exp(suggestion_scores) - #sum_exp_scores = np.sum(exp_scores) - #softmax_scores = exp_scores / sum_exp_scores - #idxs = np.arange(len(softmax_scores)) - #best_idx = np.random.choice(idxs, p=softmax_scores) - - # This works and uncovers approximate binary search when the GP is perfect - # Can't include cost in denom because it biases this case - # Instead, use conservative score and/or cost estimates - # Just need to figure out why the GP is 
overconfident + gp_y_norm*weight) best_idx = np.argmax(suggestion_scores) - #best_idx = np.argmax(gp_y_norm) info = dict( cost = gp_c[best_idx].item(), score = gp_y[best_idx].item(), - nearby = nearest_pareto_y[best_idx].item(), - dist = nearest_pareto_dist[best_idx].item(), rating = suggestion_scores[best_idx].item(), ) print('Predicted -- ', f'Score: {info["score"]:.3f}', - f'Nearby: {info["nearby"]:.3f}', - f'Dist: {info["dist"]:.3f}', f'Cost: {info["cost"]:.3f}', f'Rating: {info["rating"]:.3f}', ) @@ -699,13 +624,13 @@ def _carbs_params_from_puffer_sweep(sweep_config): class Carbs: def __init__(self, sweep_config: dict, - max_suggestion_cost: float = None, + max_suggestion_cost: float = 3600, resample_frequency: int = 5, num_random_samples: int = 10, ): param_spaces = _carbs_params_from_puffer_sweep(sweep_config) - flat_spaces = [e[1] for e in pufferlib.utils.unroll_nested_dict(param_spaces)] + flat_spaces = [e[1] for e in pufferlib.unroll_nested_dict(param_spaces)] for e in flat_spaces: print(e.name, e.space) diff --git a/pufferlib/utils.py b/pufferlib/utils.py deleted file mode 100644 index 1150c76311..0000000000 --- a/pufferlib/utils.py +++ /dev/null @@ -1,410 +0,0 @@ -from pdb import set_trace as T - -from collections import OrderedDict -from contextlib import nullcontext - -import numpy as np - -import time -import os -import sys -import pickle -import subprocess -from contextlib import redirect_stdout, redirect_stderr, contextmanager -from io import StringIO -import psutil - -import warnings -from functools import wraps - -import functools -import inspect -import importlib - -def validate_args(fn, kwargs): - fn_kwargs = get_init_args(fn) - for param, val in kwargs.items(): - if param not in fn_kwargs: - raise ValueError( - f'Invalid argument\n{param}\nto\n{fn}\n' - f'which takes \n{fn_kwargs}\n' - f'Double check your config' - ) - -def get_init_args(fn): - if fn is None: - return {} - - if isinstance(fn, functools.partial): - return fn.keywords - - sig = 
inspect.signature(fn) - kwargs = {} - for name, param in sig.parameters.items(): - if name in ['env', 'policy']: - # Hack to avoid duplicate kwargs - continue - if param.kind == inspect.Parameter.VAR_POSITIONAL: - continue - elif param.kind == inspect.Parameter.VAR_KEYWORD: - continue - else: - kwargs[name] = param.default if param.default is not inspect.Parameter.empty else None - return kwargs - - -def unroll_nested_dict(d): - if not isinstance(d, dict): - return d - - for k, v in d.items(): - if isinstance(v, dict): - for k2, v2 in unroll_nested_dict(v): - yield f"{k}/{k2}", v2 - else: - yield k, v - -def install_requirements(env): - '''Pip install dependencies for specified environment''' - pip_install_cmd = [sys.executable, "-m", "pip", "install", "-e" f".[{env}]"] - proc = subprocess.run(pip_install_cmd, capture_output=True, text=True) - if proc.returncode != 0: - raise RuntimeError(f"Error installing requirements: {proc.stderr}") - -def install_and_import(package): - '''Install and import a package''' - try: - module = importlib.import_module(package) - except ImportError: - install_requirements(package) - module = importlib.import_module(package) - - return module - -def silence_warnings(original_func, category=DeprecationWarning): - @wraps(original_func) - def wrapper(*args, **kwargs): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=category) - return original_func(*args, **kwargs) - return wrapper - -def check_env(env): - #assert issubclass(env_cls, gym.Env), "Not a gymnasium env (are you on old gym?)" - assert hasattr(env, 'possible_agents') - assert len(env.possible_agents) - obs_space = env.observation_space(env.possible_agents[0]) - atn_space = env.action_space(env.possible_agents[0]) - for e in env.possible_agents: - assert env.observation_space(e) == obs_space, 'All agents must have same obs space' - assert env.action_space(e) == atn_space, 'All agents must have same atn space' - -def make_zeros_like(data): - if 
isinstance(data, dict): - return {k: make_zeros_like(v) for k, v in data.items()} - elif isinstance(data, (list, tuple)): - return [make_zeros_like(v) for v in data] - elif isinstance(data, np.ndarray): - return np.zeros_like(data) - elif isinstance(data, (int, float)): - return 0 - else: - raise ValueError(f'Unsupported type: {type(data)}') - -def compare_arrays(array_1, array_2): - assert isinstance(array_1, np.ndarray) - assert isinstance(array_2, np.ndarray) - assert array_1.shape == array_2.shape - return np.allclose(array_1, array_2) - -def compare_dicts(dict_1, dict_2, idx): - assert isinstance(dict_1, (dict, OrderedDict)) - assert isinstance(dict_2, (dict, OrderedDict)) - - if not all(k in dict_2 for k in dict_1): - raise ValueError("Keys do not match between dictionaries.") - - for k, v in dict_1.items(): - if not compare_space_samples(v, dict_2[k], idx): - return False - - return True - -def compare_lists(list_1, list_2, idx): - assert isinstance(list_1, (list, tuple)) - assert isinstance(list_2, (list, tuple)) - - if len(list_1) != len(list_2): - raise ValueError("Lengths do not match between lists/tuples.") - - for v1, v2 in zip(list_1, list_2): - if not compare_space_samples(v1, v2, idx): - return False - - return True - -def compare_space_samples(sample_1, sample_2, sample_2_batch_idx=None): - '''Compare two samples from the same space - - Optionally, sample_2 may be a batch of samples from the same space - concatenated along the first dimension of the leaves. In this case, - sample_2_batch_idx specifies which sample to compare. 
- ''' - if isinstance(sample_1, (dict, OrderedDict)): - return compare_dicts(sample_1, sample_2, sample_2_batch_idx) - elif isinstance(sample_1, (list, tuple)): - return compare_lists(sample_1, sample_2, sample_2_batch_idx) - elif isinstance(sample_1, np.ndarray): - assert isinstance(sample_2, np.ndarray) - if sample_2_batch_idx is not None: - sample_2 = sample_2[sample_2_batch_idx] - return compare_arrays(sample_1, sample_2) - elif isinstance(sample_1, (int, float)): - if sample_2_batch_idx is not None: - sample_2 = sample_2[sample_2_batch_idx] - if isinstance(sample_2, np.ndarray): - assert sample_2.size == 1, "Cannot compare scalar to non-scalar." - sample_2 = sample_2[0] - return sample_1 == sample_2 - else: - raise ValueError(f"Unsupported type: {type(sample_1)}") - -def _get_dtype_bounds(dtype): - if dtype == bool: - return 0, 1 - elif np.issubdtype(dtype, np.integer): - return np.iinfo(dtype).min, np.iinfo(dtype).max - elif np.issubdtype(dtype, np.unsignedinteger): - return np.iinfo(dtype).min, np.iinfo(dtype).max - elif np.issubdtype(dtype, np.floating): - # Gym fails on float64 - return np.finfo(np.float32).min, np.finfo(np.float32).max - else: - raise ValueError(f"Unsupported dtype: {dtype}") - -def is_dict_space(space): - # Compatible with gym/gymnasium - return type(space).__name__ == 'Dict' - -def is_multiagent(env): - import pettingzoo - import gym - if inspect.isclass(env): - env_cls = env - else: - env_cls = type(env) - - if not issubclass(env_cls, pettingzoo.AECEnv) and not issubclass(env_cls, pettingzoo.ParallelEnv): - assert issubclass(env_cls, gym.Env), 'Environment must subclass pettingzoo.AECEnv/ParallelEnv or gym.Env' - return False - return True - -def current_datetime(): - return time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime()) - -def myprint(d): - stack = d.items() - while stack: - k, v = stack.pop() - if isinstance(v, dict): - stack.extend(v.iteritems()) - else: - print("%s: %s" % (k, v)) - -class RandomState: - def __init__(self, 
# TODO: 5% perf gain by doing cuda sync less frequently
class Profiler:
    '''Sampling profiler for elapsed time, call count and memory deltas.

    Use as a context manager, optionally calling the instance first to set
    the current epoch: ``with profiler(epoch): ...``. Measurements are taken
    only when the epoch is a multiple of ``frequency``; on every other epoch
    entering and leaving the context does nothing.

    Args:
        elapsed: Track wall-clock time (``elapsed`` is None when disabled).
        calls: Count sampled invocations (``calls`` is None when disabled).
        memory: Track the change in process RSS via psutil.
        pytorch_memory: Track the change in torch.cuda.memory_allocated().
        sync_cuda: Call torch.cuda.synchronize() around the measured region.
        frequency: Sample only epochs divisible by this value.
        amp_context: Context manager entered around the measured region.
    '''
    def __init__(self, elapsed=True, calls=True, memory=False,
            pytorch_memory=False, sync_cuda=True, frequency=10, amp_context=nullcontext()):
        self.elapsed = 0 if elapsed else None
        self.calls = 0 if calls else None
        self.memory = None
        self.pytorch_memory = None
        self.prev = 0
        self.delta = 0

        self.track_elapsed = elapsed
        self.track_calls = calls
        self.track_memory = memory
        self.track_pytorch_memory = pytorch_memory
        self.sync_cuda = sync_cuda
        self.frequency = frequency
        self.epoch = 0

        if memory:
            self.process = psutil.Process()

        if pytorch_memory or sync_cuda:
            # Imported lazily so CPU-only users that disable both options
            # never need torch.
            import torch
            self.torch = torch

        self.amp_context = amp_context

    def __call__(self, epoch):
        '''Record the current epoch and return self for use in a with-statement.'''
        self.epoch = epoch
        return self

    def __enter__(self):
        '''Start a measurement if the current epoch is sampled.'''
        if self.epoch % self.frequency != 0:
            return self

        if self.sync_cuda:
            self.torch.cuda.synchronize()
        # NOTE(review): amp_context is entered only on sampled epochs;
        # confirm that skipping it on the other epochs is intended.
        self.amp_context.__enter__()
        if self.track_elapsed:
            self.start_time = time.perf_counter()
        if self.track_memory:
            self.start_mem = self.process.memory_info().rss
        if self.track_pytorch_memory:
            self.start_torch_mem = self.torch.cuda.memory_allocated()
        return self

    def __exit__(self, *args):
        '''Finish a measurement; never suppresses exceptions from the body.'''
        if self.epoch % self.frequency != 0:
            # Returning a truthy value from __exit__ tells Python to swallow
            # the exception raised inside the with-body, so this early exit
            # must return None rather than self.
            return None

        self.amp_context.__exit__(None, None, None)
        if self.sync_cuda:
            self.torch.cuda.synchronize()
        if self.track_elapsed:
            self.end_time = time.perf_counter()
            # NOTE(review): delta accumulates across calls and the running
            # total is added to elapsed each time; presumably callers reset
            # delta between reads. Confirm against the call sites.
            self.delta += self.end_time - self.start_time
            self.elapsed += self.delta
        if self.track_calls:
            self.calls += 1
        if self.track_memory:
            self.end_mem = self.process.memory_info().rss
            self.memory = self.end_mem - self.start_mem
        if self.track_pytorch_memory:
            self.end_torch_mem = self.torch.cuda.memory_allocated()
            self.pytorch_memory = self.end_torch_mem - self.start_torch_mem
        return None

    def __repr__(self):
        parts = []
        if self.track_elapsed:
            parts.append(f'Elapsed: {self.elapsed:.4f} s')
        if self.track_calls:
            parts.append(f'Calls: {self.calls}')
        if self.track_memory:
            parts.append(f'Memory: {format_bytes(self.memory)}')
        if self.track_pytorch_memory:
            parts.append(f'PyTorch Memory: {format_bytes(self.pytorch_memory)}')
        return ", ".join(parts)

    # Aliases for use without context manager
    start = __enter__
    stop = __exit__
class Suppress():
    '''Context manager that silences stdout and stderr.

    File descriptors 1 and 2 are pointed at os.devnull so output written by
    C libraries disappears, and Python-level stdout/stderr are redirected
    into the in-memory buffer ``self.f``. Everything is restored on exit.

    The devnull descriptors are opened in __enter__ and closed in __exit__,
    so one instance can be entered repeatedly and an instance that is never
    entered holds no descriptors.
    '''
    def __init__(self):
        self.f = StringIO()
        self.null_1 = None
        self.null_2 = None

    def __enter__(self):
        # Opened here rather than in __init__: __exit__ closes them, and a
        # descriptor opened once in __init__ would be invalid on re-entry.
        self.null_1 = os.open(os.devnull, os.O_WRONLY | os.O_TRUNC | os.O_CREAT)
        self.null_2 = os.open(os.devnull, os.O_WRONLY | os.O_TRUNC | os.O_CREAT)

        # Suppress C library outputs
        self.orig_stdout = os.dup(1)
        self.orig_stderr = os.dup(2)
        os.dup2(self.null_1, 1)
        os.dup2(self.null_2, 2)

        # Suppress Python outputs
        self._stdout_redirector = redirect_stdout(self.f)
        self._stderr_redirector = redirect_stderr(self.f)
        self._stdout_redirector.__enter__()
        self._stderr_redirector.__enter__()

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Enable C library outputs
        os.dup2(self.orig_stdout, 1)
        os.dup2(self.orig_stderr, 2)
        os.close(self.orig_stdout)
        os.close(self.orig_stderr)
        os.close(self.null_1)
        os.close(self.null_2)
        self.null_1 = None
        self.null_2 = None

        # Enable Python outputs
        self._stdout_redirector.__exit__(exc_type, exc_val, exc_tb)
        self._stderr_redirector.__exit__(exc_type, exc_val, exc_tb)
np.asarray(actions) if not vecenv.initialized: vecenv.initialized = True if not vecenv.action_space.contains(actions): - raise APIUsageError('Actions do not match action space') + raise pufferlib.APIUsageError('Actions do not match action space') vecenv.flag = RECV return actions @@ -77,7 +74,7 @@ def __init__(self, env_creators, env_args, env_kwargs, num_envs, buf=None, seed= ptr = 0 for i in range(num_envs): end = ptr + self.driver_env.num_agents - buf_i = namespace( + buf_i = dict( observations=self.observations[ptr:end], rewards=self.rewards[ptr:end], terminals=self.terminals[ptr:end], @@ -102,7 +99,7 @@ def __init__(self, env_creators, env_args, env_kwargs, num_envs, buf=None, seed= def _avg_infos(self): infos = {} for e in self.infos: - for k, v in pufferlib.utils.unroll_nested_dict(e): + for k, v in pufferlib.unroll_nested_dict(e): if k not in infos: infos[k] = [] @@ -178,25 +175,25 @@ def _worker_process(env_creators, env_args, env_kwargs, obs_shape, obs_dtype, at # Environments read and write directly to shared memory shape = (num_workers, num_envs*num_agents) atn_arr = np.ndarray((*shape, *atn_shape), - dtype=atn_dtype, buffer=shm.actions)[worker_idx] - buf = namespace( + dtype=atn_dtype, buffer=shm['actions'])[worker_idx] + buf = dict( observations=np.ndarray((*shape, *obs_shape), - dtype=obs_dtype, buffer=shm.observations)[worker_idx], - rewards=np.ndarray(shape, dtype=np.float32, buffer=shm.rewards)[worker_idx], - terminals=np.ndarray(shape, dtype=bool, buffer=shm.terminals)[worker_idx], - truncations=np.ndarray(shape, dtype=bool, buffer=shm.truncateds)[worker_idx], - masks=np.ndarray(shape, dtype=bool, buffer=shm.masks)[worker_idx], + dtype=obs_dtype, buffer=shm['observations'])[worker_idx], + rewards=np.ndarray(shape, dtype=np.float32, buffer=shm['rewards'])[worker_idx], + terminals=np.ndarray(shape, dtype=bool, buffer=shm['terminals'])[worker_idx], + truncations=np.ndarray(shape, dtype=bool, buffer=shm['truncateds'])[worker_idx], + 
masks=np.ndarray(shape, dtype=bool, buffer=shm['masks'])[worker_idx], actions=atn_arr, ) - buf.masks[:] = True + buf['masks'][:] = True if is_native and num_envs == 1: envs = env_creators[0](*env_args[0], **env_kwargs[0], buf=buf, seed=seed) else: envs = Serial(env_creators, env_args, env_kwargs, num_envs, buf=buf, seed=seed*num_envs) - semaphores=np.ndarray(num_workers, dtype=np.uint8, buffer=shm.semaphores) - notify=np.ndarray(num_workers, dtype=bool, buffer=shm.notify) + semaphores=np.ndarray(num_workers, dtype=np.uint8, buffer=shm['semaphores']) + notify=np.ndarray(num_workers, dtype=bool, buffer=shm['notify']) start = time.time() while True: if notify[worker_idx]: @@ -249,7 +246,7 @@ def __init__(self, env_creators, env_args, env_kwargs, import psutil cpu_cores = psutil.cpu_count(logical=False) if num_workers > cpu_cores and not overwork: - raise APIUsageError(' '.join([ + raise pufferlib.APIUsageError(' '.join([ f'num_workers ({num_workers}) > hardware cores ({cpu_cores}) is disallowed by default.', 'PufferLib multiprocessing is heavily optimized for 1 process per hardware core.', 'If you really want to do this, set overwork=True (--vec-overwork in our demo.py).', @@ -258,7 +255,7 @@ def __init__(self, env_creators, env_args, env_kwargs, num_batches = num_envs / batch_size if zero_copy and num_batches != int(num_batches): # This is so you can have n equal buffers - raise APIUsageError( + raise pufferlib.APIUsageError( 'zero_copy: num_envs must be divisible by batch_size') self.num_environments = num_envs @@ -300,7 +297,7 @@ def __init__(self, env_creators, env_args, env_kwargs, from multiprocessing import RawArray, set_start_method # Mac breaks without setting fork... 
but setting it breaks sweeps on 2nd run #set_start_method('fork') - self.shm = namespace( + self.shm = dict( observations=RawArray(obs_ctype, num_agents * int(np.prod(obs_shape))), actions=RawArray(atn_ctype, num_agents * int(np.prod(atn_shape))), rewards=RawArray('f', num_agents), @@ -314,18 +311,18 @@ def __init__(self, env_creators, env_args, env_kwargs, self.obs_batch_shape = (self.agents_per_batch, *obs_shape) self.atn_batch_shape = (self.workers_per_batch, agents_per_worker, *atn_shape) self.actions = np.ndarray((*shape, *atn_shape), - dtype=atn_dtype, buffer=self.shm.actions) - self.buf = namespace( + dtype=atn_dtype, buffer=self.shm['actions']) + self.buf = dict( observations=np.ndarray((*shape, *obs_shape), - dtype=obs_dtype, buffer=self.shm.observations), - rewards=np.ndarray(shape, dtype=np.float32, buffer=self.shm.rewards), - terminals=np.ndarray(shape, dtype=bool, buffer=self.shm.terminals), - truncations=np.ndarray(shape, dtype=bool, buffer=self.shm.truncateds), - masks=np.ndarray(shape, dtype=bool, buffer=self.shm.masks), - semaphores=np.ndarray(num_workers, dtype=np.uint8, buffer=self.shm.semaphores), - notify=np.ndarray(num_workers, dtype=bool, buffer=self.shm.notify), + dtype=obs_dtype, buffer=self.shm['observations']), + rewards=np.ndarray(shape, dtype=np.float32, buffer=self.shm['rewards']), + terminals=np.ndarray(shape, dtype=bool, buffer=self.shm['terminals']), + truncations=np.ndarray(shape, dtype=bool, buffer=self.shm['truncateds']), + masks=np.ndarray(shape, dtype=bool, buffer=self.shm['masks']), + semaphores=np.ndarray(num_workers, dtype=np.uint8, buffer=self.shm['semaphores']), + notify=np.ndarray(num_workers, dtype=bool, buffer=self.shm['notify']), ) - self.buf.semaphores[:] = MAIN + self.buf['semaphores'][:] = MAIN from multiprocessing import Pipe, Process self.send_pipes, w_recv_pipes = zip(*[Pipe() for _ in range(num_workers)]) @@ -359,13 +356,13 @@ def recv(self): # Bandaid patch for new experience buffer desync if self.sync_traj: 
worker = self.waiting_workers[0] - sem = self.buf.semaphores[worker] + sem = self.buf['semaphores'][worker] if sem >= MAIN: self.waiting_workers.pop(0) self.ready_workers.append(worker) else: worker = self.waiting_workers.pop(0) - sem = self.buf.semaphores[worker] + sem = self.buf['semaphores'][worker] if sem >= MAIN: self.ready_workers.append(worker) else: @@ -427,10 +424,10 @@ def recv(self): self.w_slice = w_slice buf = self.buf - o = buf.observations[w_slice].reshape(self.obs_batch_shape) - r = buf.rewards[w_slice].ravel() - d = buf.terminals[w_slice].ravel() - t = buf.truncations[w_slice].ravel() + o = buf['observations'][w_slice].reshape(self.obs_batch_shape) + r = buf['rewards'][w_slice].ravel() + d = buf['terminals'][w_slice].ravel() + t = buf['truncations'][w_slice].ravel() infos = [] for i in s_range: @@ -439,7 +436,7 @@ def recv(self): self.infos[i] = [] agent_ids = self.agent_ids[w_slice].ravel() - m = buf.masks[w_slice].ravel() + m = buf['masks'][w_slice].ravel() self.batch_mask = m return o, r, d, t, infos, agent_ids, m @@ -450,7 +447,7 @@ def send(self, actions): idxs = self.w_slice self.actions[idxs] = actions - self.buf.semaphores[idxs] = STEP + self.buf['semaphores'][idxs] = STEP def async_reset(self, seed=0): self.flag = RECV @@ -462,42 +459,16 @@ def async_reset(self, seed=0): self.waiting_workers = list(range(self.num_workers)) self.infos = [[] for _ in range(self.num_workers)] - self.buf.semaphores[:] = RESET + self.buf['semaphores'][:] = RESET for i in range(self.num_workers): start = i*self.envs_per_worker end = (i+1)*self.envs_per_worker self.send_pipes[i].send(seed+i) def notify(self): - self.buf.notify[:] = True + self.buf['notify'][:] = True def close(self): - ''' - while self.waiting_workers: - worker = self.waiting_workers.pop(0) - sem = self.buf.semaphores[worker] - if sem >= MAIN: - self.ready_workers.append(worker) - if sem == INFO: - self.recv_pipes[worker].recv() - else: - self.waiting_workers.append(worker) - - 
self.buf.semaphores[:] = CLOSE - self.waiting_workers = list(range(self.num_workers)) - - while self.waiting_workers: - worker = self.waiting_workers.pop(0) - sem = self.buf.semaphores[worker] - if sem >= MAIN: - self.ready_workers.append(worker) - if sem == INFO: - self.recv_pipes[worker].recv() - - else: - self.waiting_workers.append(worker) - ''' - for p in self.processes: p.terminate() @@ -631,35 +602,50 @@ def close(self): def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=PufferEnv, num_envs=1, seed=0, **kwargs): if num_envs < 1: - raise APIUsageError('num_envs must be at least 1') + raise pufferlib.APIUsageError('num_envs must be at least 1') if num_envs != int(num_envs): - raise APIUsageError('num_envs must be an integer') + raise pufferlib.APIUsageError('num_envs must be an integer') + + if isinstance(backend, str): + try: + backend = getattr(pufferlib.vector, backend) + except: + raise pufferlib.APIUsageError(f'Invalid backend: {backend}') if backend == PufferEnv: env_args = env_args or [] env_kwargs = env_kwargs or {} vecenv = env_creator_or_creators(*env_args, **env_kwargs) if not isinstance(vecenv, PufferEnv): - raise APIUsageError('Native vectorization requires a native PufferEnv. Use Serial or Multiprocessing instead.') + raise pufferlib.APIUsageError('Native vectorization requires a native PufferEnv. Use Serial or Multiprocessing instead.') if num_envs != 1: - raise APIUsageError('Native vectorization is for PufferEnvs that handle all per-process vectorization internally. If you want to run multiple separate Python instances on a single process, use Serial or Multiprocessing instead') + raise pufferlib.APIUsageError('Native vectorization is for PufferEnvs that handle all per-process vectorization internally. 
If you want to run multiple separate Python instances on a single process, use Serial or Multiprocessing instead') return vecenv if 'num_workers' in kwargs: - num_workers = kwargs['num_workers'] + if kwargs['num_workers'] == 'auto': + kwargs['num_workers'] = num_envs + + # TODO: None? - envs_per_worker = num_envs / num_workers + envs_per_worker = num_envs / kwargs['num_workers'] if envs_per_worker != int(envs_per_worker): - raise APIUsageError('num_envs must be divisible by num_workers') + raise pufferlib.APIUsageError('num_envs must be divisible by num_workers') if 'batch_size' in kwargs: + if kwargs['batch_size'] == 'auto': + if num_envs == 1: + kwargs['batch_size'] = 1 + else: + kwargs['batch_size'] = num_envs // 2 + batch_size = kwargs['batch_size'] if batch_size is None: batch_size = num_envs if batch_size % envs_per_worker != 0: - raise APIUsageError( + raise pufferlib.APIUsageError( 'batch_size must be divisible by (num_envs / num_workers)') @@ -677,19 +663,19 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer env_creators = env_creator_or_creators if len(env_creators) != num_envs: - raise APIUsageError('env_creators must be a list of length num_envs') + raise pufferlib.APIUsageError('env_creators must be a list of length num_envs') if len(env_args) != num_envs: - raise APIUsageError('env_args must be a list of length num_envs') + raise pufferlib.APIUsageError('env_args must be a list of length num_envs') if len(env_kwargs) != num_envs: - raise APIUsageError('env_kwargs must be a list of length num_envs') + raise pufferlib.APIUsageError('env_kwargs must be a list of length num_envs') for i in range(num_envs): if not callable(env_creators[i]): - raise APIUsageError('env_creators must be a list of callables') + raise pufferlib.APIUsageError('env_creators must be a list of callables') if not isinstance(env_args[i], (list, tuple)): - raise APIUsageError('env_args must be a list of lists or tuples') - if not 
isinstance(env_kwargs[i], (dict, Namespace)): - raise APIUsageError('env_kwargs must be a list of dictionaries') + raise pufferlib.APIUsageError('env_args must be a list of lists or tuples') + if not isinstance(env_kwargs[i], dict): + raise pufferlib.APIUsageError('env_kwargs must be a list of dictionaries') # Keeps batch size consistent when debugging with Serial backend if backend is Serial and 'batch_size' in kwargs: @@ -701,7 +687,7 @@ def make(env_creator_or_creators, env_args=None, env_kwargs=None, backend=Puffer # Sanity check args for k in kwargs: if k not in ['num_workers', 'batch_size', 'zero_copy', 'overwork', 'backend']: - raise APIUsageError(f'Invalid argument: {k}') + raise pufferlib.APIUsageError(f'Invalid argument: {k}') # TODO: First step action space check @@ -714,28 +700,28 @@ def make_seeds(seed, num_envs): err = f'seed {seed} must be an integer or a list of integers' if isinstance(seed, (list, tuple)): if len(seed) != num_envs: - raise APIUsageError(err) + raise pufferlib.APIUsageError(err) return seed - raise APIUsageError(err) + raise pufferlib.APIUsageError(err) def check_envs(envs, driver): valid = (PufferEnv, GymnasiumPufferEnv, PettingZooPufferEnv) if not isinstance(driver, valid): - raise APIUsageError(f'env_creator must be {valid}') + raise pufferlib.APIUsageError(f'env_creator must be {valid}') driver_obs = driver.single_observation_space driver_atn = driver.single_action_space for env in envs: if not isinstance(env, valid): - raise APIUsageError(f'env_creators must be {valid}') + raise pufferlib.APIUsageError(f'env_creators must be {valid}') obs_space = env.single_observation_space if obs_space != driver_obs: - raise APIUsageError(f'\n{obs_space}\n{driver_obs} obs space mismatch') + raise pufferlib.APIUsageError(f'\n{obs_space}\n{driver_obs} obs space mismatch') atn_space = env.single_action_space if atn_space != driver_atn: - raise APIUsageError(f'\n{atn_space}\n{driver_atn} atn space mismatch') + raise 
class GymToGymnasium:
    '''Adapter exposing a Gymnasium-style reset/step API over a wrapped env.

    The wrapped env's reset() returns only an observation and its step()
    returns a 4-tuple (observation, reward, done, info). This wrapper
    returns (observation, {}) from reset() and a 5-tuple from step() whose
    truncation flag is always False.
    '''
    def __init__(self, env):
        self.env = env
        # Mirror the wrapped env's public attributes on the wrapper.
        for attr in ('observation_space', 'action_space', 'render', 'metadata'):
            setattr(self, attr, getattr(env, attr))

    def reset(self, seed=None, options=None):
        '''Reset the wrapped env; ``options`` is accepted but not forwarded.'''
        # Forward the seed only when the caller supplied one.
        reset_kwargs = {} if seed is None else {'seed': seed}
        first_ob = self.env.reset(**reset_kwargs)
        return first_ob, {}

    def step(self, action):
        '''Step the wrapped env and insert truncated=False before info.'''
        next_ob, rew, finished, extra = self.env.step(action)
        return next_ob, rew, finished, False, extra

    def close(self):
        '''Close the wrapped env.'''
        self.env.close()
return ob, info - - def step(self, actions): - observations, rewards, terminals, truncations, infos = self.env.step(actions) - return observations, rewards, terminals, truncations, infos - - def close(self): - self.env.close() diff --git a/pyproject.toml b/pyproject.toml index b4d35b4021..d1bc56ce11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,3 @@ [build-system] -requires = ["setuptools", "wheel", "Cython", "numpy"] +requires = ["setuptools", "wheel", "Cython", "numpy", "torch"] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index c43853440e..4b6a8414a7 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,7 @@ +#TODO: +# --no-build-isolation for 5090 +# Make c and torch compile at the same time + from setuptools import find_packages, find_namespace_packages, setup, Extension from Cython.Build import cythonize import numpy @@ -6,46 +10,143 @@ import zipfile import tarfile import platform +import shutil + +from setuptools.command.build_ext import build_ext +from torch.utils import cpp_extension +from torch.utils.cpp_extension import ( + CppExtension, + CUDAExtension, + BuildExtension, + CUDA_HOME, +) + -# python3 setup.py built_ext --inplace +import pufferlib +VERSION = pufferlib.__version__ + +# Build with DEBUG=1 to enable debug symbols +DEBUG = os.getenv("DEBUG", "0") == "1" + +# Put C env names here. PufferLib will look for +# pufferlib/ocean//binding.c +c_extensions_names = [ + 'gpudrive', + 'squared', + 'pong', + 'breakout', + 'enduro', + 'blastar', + 'grid', + 'nmmo3', + 'tactical', + 'go', + 'cartpole' +] -VERSION = '2.0.6' +# Put full paths to Cython extension here +# Note we are trying to move away from Cython, +# because our C envs are lighter weigh and +# easier to debug (you can run gdb --args python ...) 
def download_raylib(platform, url):
    '''Download and unpack a raylib release into the current directory.

    Does nothing when a path named ``platform`` already exists. Otherwise
    the archive at ``url`` is fetched, extracted, and deleted, and
    rlights.h is then downloaded into ``<platform>/include``.

    Args:
        platform: Release name; also the directory the archive unpacks to
            (e.g. 'raylib-5.5_linux_amd64').
        url: Download URL of the release archive. URLs ending in '.zip'
            are unpacked with zipfile; everything else is treated as a
            tarball.
    '''
    if os.path.exists(platform):
        return

    # The WebAssembly release ships as a .zip while the desktop releases are
    # .tar.gz; opening a zip with tarfile raises ReadError.
    is_zip = url.endswith('.zip')
    archive = platform + ('.zip' if is_zip else '.tar.gz')

    urllib.request.urlretrieve(url, archive)
    if is_zip:
        with zipfile.ZipFile(archive, 'r') as zip_ref:
            zip_ref.extractall()
    else:
        with tarfile.open(archive, 'r') as tar_ref:
            tar_ref.extractall()

    os.remove(archive)
    urllib.request.urlretrieve(RLIGHTS_URL, platform + '/include/rlights.h')
RAYLIB_WASM + '.zip') - with zipfile.ZipFile(RAYLIB_WASM + '.zip', 'r') as zip_ref: - zip_ref.extractall() +download_raylib(RAYLIB_WASM, RAYLIB_WASM_URL) - os.remove(RAYLIB_WASM + '.zip') - urllib.request.urlretrieve(RLIGHTS_URL, 'raylib-5.5_webassembly/include/rlights.h') +# Shared compile args for all platforms +extra_compile_args = [ + '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION', + '-DPLATFORM_DESKTOP', +] +extra_link_args = [ + '-fwrapv' +] +cxx_args = [ + '-fdiagnostics-color=always', +] +nvcc_args = [] + +if DEBUG: + extra_compile_args += [ + '-O0', + '-g', + '-fsanitize=address,undefined,bounds,pointer-overflow,leak', + ] + extra_link_args += [ + '-g', + ] + cxx_args += [ + '-O0', + '-g', + ] + nvcc_args += [ + '-O0', + '-g', + ] +else: + extra_compile_args += [ + '-O2', + ] + extra_link_args += [ + '-O2', + ] + cxx_args += [ + '-O3', + ] + nvcc_args += [ + '-O3', + ] + +system = platform.system() +if system == 'Linux': + extra_compile_args += [ + '-Wno-alloc-size-larger-than', + '-fmax-errors=3', + ] + extra_link_args += [ + '-Bsymbolic-functions', + ] + RAYLIB_LINUX = 'raylib-5.5_linux_amd64' + RAYLIB_LINUX_URL = RAYLIB_BASE + RAYLIB_LINUX + '.tar.gz' + download_raylib(RAYLIB_LINUX, RAYLIB_LINUX_URL) +elif system == 'Darwin': + extra_compile_args += [ + ] + extra_link_args += [ + '-framework', 'Cocoa', + '-framework', 'OpenGL', + '-framework', 'IOKit', + ] + RAYLIB_MACOS = 'raylib-5.5_macos' + RAYLIB_MACOS_URL = RAYLIB_BASE + RAYLIB_MACOS + '.tar.gz' + download_raylib(RAYLIB_MACOS, RAYLIB_MACOS_URL) +else: + raise ValueError(f'Unsupported system: {system}') # Default Gym/Gymnasium/PettingZoo versions # Gym: @@ -61,31 +162,6 @@ GYM_VERSION = '0.23' PETTINGZOO_VERSION = '1.24.1' -docs = [ - 'sphinx==5.0.0', - 'sphinx-rtd-theme==0.5.1', - 'sphinxcontrib-youtube==1.0.1', - 'sphinx-rtd-theme==0.5.1', - 'sphinx-design==0.4.1', - 'furo==2023.3.27', -] - -cleanrl = [ - 'stable_baselines3==2.1.0', - 'tensorboard==2.11.2', - 'torch', - 'tyro==0.8.6', - 
class BuildExt(build_ext):
    '''build_ext command that delegates to the build_torch and build_c commands.'''
    def run(self):
        # Run the torch extension build first, then the C/Cython build.
        for sub_command in ('build_torch', 'build_c'):
            self.run_command(sub_command)
class CBuildExt(build_ext):
    '''build_ext variant that builds every extension except pufferlib._C.'''
    def run(self):
        # Drop the torch extension, then run the normal build.
        non_torch = (ext for ext in self.extensions if ext.name != "pufferlib._C")
        self.extensions = list(non_torch)
        super().run()

class TorchBuildExt(cpp_extension.BuildExtension):
    '''Torch BuildExtension variant that builds only pufferlib._C.'''
    def run(self):
        # Keep only the torch extension, then run the torch-aware build.
        torch_only = (ext for ext in self.extensions if ext.name == "pufferlib._C")
        self.extensions = list(torch_only)
        super().run()
extra_compile_args=extra_compile_args,# + ['-fsanitize=address,undefined,bounds,pointer-overflow,leak'], - extra_link_args=extra_link_args,# + ['-fsanitize=address,undefined,bounds,pointer-overflow,leak', '-g'], - extra_objects=[f'{RAYLIB_NAME}/lib/libraylib.a'], + **extension_kwargs, + ) + for name in c_extensions_names +] + +cython_extensions = cythonize([ + Extension( + path.replace('/', '.'), + [path + '.pyx'], + **extension_kwargs, ) - for name in pure_c_extensions + for path in cython_extension_paths +]) + +# Check if CUDA compiler is available. You need cuda dev, not just runtime. +if shutil.which("nvcc"): + extension = CUDAExtension +else: + extension = CppExtension + +torch_extensions = [ + extension( + "pufferlib._C", + ["pufferlib.cpp", "pufferlib/pufferlib.cu"], + extra_compile_args = { + "cxx": cxx_args, + "nvcc": nvcc_args, + } + ), ] # Prevent Conda from injecting garbage compile flags @@ -341,7 +436,7 @@ for key, value in cfg_vars.items(): if value and '-fno-strict-overflow' in str(value): cfg_vars[key] = value.replace('-fno-strict-overflow', '') - + setup( name="pufferlib", description="PufferAI Library" @@ -362,6 +457,7 @@ f'gym<={GYM_VERSION}', f'gymnasium<={GYMNASIUM_VERSION}', f'pettingzoo<={PETTINGZOO_VERSION}', + 'torch', 'shimmy[gym-v21]', 'psutil==5.9.5', 'pynvml', @@ -375,25 +471,12 @@ 'common': common, **environments, }, - ext_modules = cythonize([ - "pufferlib/extensions.pyx", - "c_advantage.pyx", - "pufferlib/puffernet.pyx", - *extensions, - ], - compiler_directives={ - 'language_level': 3, - 'boundscheck': False, - 'initializedcheck': False, - 'wraparound': False, - 'cdivision': True, - 'nonecheck': False, - 'profile': False, + ext_modules = cython_extensions + c_extensions + torch_extensions, + cmdclass={ + "build_ext": BuildExt, + "build_torch": TorchBuildExt, + "build_c": CBuildExt, }, - #nthreads=6, - #annotate=True, - #compiler_directives={'profile': True},# annotate=True - ), include_dirs=[numpy.get_include(), RAYLIB_NAME + 
'/include'], python_requires=">=3.9", license="MIT", @@ -405,10 +488,10 @@ "Intended Audience :: Science/Research", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], ) #stable_baselines3 diff --git a/tests/test.py b/tests/test.py index 813bd8dbe1..1890c3c2ef 100644 --- a/tests/test.py +++ b/tests/test.py @@ -13,6 +13,100 @@ import warnings warnings.filterwarnings("ignore") +class RandomState: + def __init__(self, seed): + self.rng = np.random.RandomState(seed) + + def random(self): + return self.rng.random() + + def probabilistic_round(self, n): + frac, integer = np.modf(n) + if self.random() < frac: + return int(integer) + 1 + else: + return int(integer) + + def sample(self, ary, n): + n_rounded = self.probabilistic_round(n) + return self.rng.choice(ary, n_rounded, replace=False).tolist() + + def choice(self, ary): + return self.sample(ary, 1)[0] + + +# TODO: Fix this. Was in utils.py. 
# Only used for tests
def make_zeros_like(data):
    '''Return a zero-filled structure mirroring ``data``.

    Dicts are rebuilt key by key, lists and tuples both come back as lists,
    ndarrays go through ``np.zeros_like`` and Python ints/floats become ``0``.

    Raises:
        ValueError: if ``data`` (or any nested leaf) has another type.
    '''
    if isinstance(data, dict):
        return {k: make_zeros_like(v) for k, v in data.items()}
    if isinstance(data, (list, tuple)):
        return [make_zeros_like(v) for v in data]
    if isinstance(data, np.ndarray):
        return np.zeros_like(data)
    if isinstance(data, (int, float)):
        return 0
    raise ValueError(f'Unsupported type: {type(data)}')

def compare_arrays(array_1, array_2):
    '''Return whether two same-shaped ndarrays are element-wise close.

    Asserts that both arguments are ndarrays with identical shapes, then
    defers to ``np.allclose`` with its default tolerances.
    '''
    assert isinstance(array_1, np.ndarray)
    assert isinstance(array_2, np.ndarray)
    assert array_1.shape == array_2.shape
    return np.allclose(array_1, array_2)

def compare_dicts(dict_1, dict_2, idx):
    '''Compare two dict samples value by value.

    Args:
        dict_1: reference sample.
        dict_2: sample (or batch of samples) to compare against.
        idx: batch index forwarded to ``compare_space_samples``.

    Returns:
        True if every value of ``dict_1`` matches the value stored under
        the same key in ``dict_2``; False at the first mismatch.

    Raises:
        ValueError: if a key of ``dict_1`` is missing from ``dict_2``.
    '''
    # OrderedDict subclasses dict, so a plain dict check covers both.
    assert isinstance(dict_1, dict)
    assert isinstance(dict_2, dict)

    # Only dict_1's keys are required; extra keys in dict_2 are not checked.
    if not all(k in dict_2 for k in dict_1):
        raise ValueError("Keys do not match between dictionaries.")

    return all(
        compare_space_samples(v, dict_2[k], idx)
        for k, v in dict_1.items()
    )

def compare_lists(list_1, list_2, idx):
    '''Compare two list/tuple samples element by element.

    Args:
        list_1: reference sample.
        list_2: sample (or batch of samples) to compare against.
        idx: batch index forwarded to ``compare_space_samples``.

    Returns:
        True if every pair of elements matches; False at the first mismatch.

    Raises:
        ValueError: if the two sequences differ in length.
    '''
    assert isinstance(list_1, (list, tuple))
    assert isinstance(list_2, (list, tuple))

    if len(list_1) != len(list_2):
        raise ValueError("Lengths do not match between lists/tuples.")

    return all(
        compare_space_samples(v1, v2, idx)
        for v1, v2 in zip(list_1, list_2)
    )

def compare_space_samples(sample_1, sample_2, sample_2_batch_idx=None):
    '''Compare two samples from the same space

    Optionally, sample_2 may be a batch of samples from the same space
    concatenated along the first dimension of the leaves. In this case,
    sample_2_batch_idx specifies which sample to compare.

    Raises:
        ValueError: if sample_1 is not a dict, list, tuple, ndarray,
            int or float.
    '''
    if isinstance(sample_1, dict):
        return compare_dicts(sample_1, sample_2, sample_2_batch_idx)
    if isinstance(sample_1, (list, tuple)):
        return compare_lists(sample_1, sample_2, sample_2_batch_idx)
    if isinstance(sample_1, np.ndarray):
        assert isinstance(sample_2, np.ndarray)
        if sample_2_batch_idx is not None:
            sample_2 = sample_2[sample_2_batch_idx]
        return compare_arrays(sample_1, sample_2)
    if isinstance(sample_1, (int, float)):
        if sample_2_batch_idx is not None:
            sample_2 = sample_2[sample_2_batch_idx]
        if isinstance(sample_2, np.ndarray):
            assert sample_2.size == 1, "Cannot compare scalar to non-scalar."
            # .item() unwraps any single-element array, including 0-d ones
            # that positional indexing with [0] cannot handle.
            sample_2 = sample_2.item()
        return sample_1 == sample_2
    raise ValueError(f"Unsupported type: {type(sample_1)}")


# Unchanged diff context from tests/test.py; the remainder of this function
# lies outside this chunk.
def test_gymnasium_emulation(env_cls, steps=100):
    raw_env = env_cls()

# --- tests/test_env_binding.py (new file) ---
from pufferlib.ocean.breakout import breakout

kwargs = dict(
    frameskip=1,
    width=576,
    height=330,
    paddle_width=62,
    paddle_height=8,
    ball_width=32,
    ball_height=32,
    brick_width=32,
    brick_height=12,
    brick_rows=6,
    brick_cols=18,
    continuous=False,
)

def _expect_type_error(message, func, *args):
    '''Call ``func(*args)`` and require that it raises TypeError.

    Any other exception propagates unchanged. If the call returns normally,
    an ``Exception`` carrying ``message`` is raised.
    '''
    try:
        func(*args)
    except TypeError:
        return
    raise Exception(message)

def test_env_binding():
    '''Exercise the breakout C binding with valid and invalid call shapes.

    Valid calls go through env_init/vectorize and vec_init, each followed by
    vec_reset, vec_step and vec_close. Invalid calls (missing positional
    arguments or missing keyword arguments) must raise TypeError.
    '''
    reference = breakout.Breakout()
    binding = breakout.binding

    # Buffers of the reference env, passed positionally to the binding.
    buffers = (
        reference.observations,
        reference.actions,
        reference.rewards,
        reference.terminals,
        reference.truncations,
    )

    # Correct usage
    c_env = binding.env_init(*buffers, 0, **kwargs)
    c_envs = binding.vectorize(c_env)
    binding.vec_reset(c_envs, 0)
    binding.vec_step(c_envs)
    binding.vec_close(c_envs)

    # Correct vec usage. A second, identical vec_init call used to precede
    # this one; its handle was overwritten without being passed to vec_close.
    c_envs = binding.vec_init(*buffers, reference.num_agents, 0, **kwargs)
    binding.vec_reset(c_envs, 0)
    binding.vec_step(c_envs)
    binding.vec_close(c_envs)

    _expect_type_error(
        'init missing args. Should have thrown TypeError',
        binding.env_init,
    )

    # NOTE(review): this call passes reference.num_agents, which the
    # successful env_init call above does not, so the TypeError may stem
    # from the positional arguments rather than the missing kwargs.
    # Confirm the intended signature.
    _expect_type_error(
        'init missing kwarg. Should have thrown TypeError',
        binding.env_init, *buffers, reference.num_agents, 0,
    )

    _expect_type_error(
        'vec_init missing args. Should have thrown TypeError',
        binding.vec_init,
    )

    _expect_type_error(
        'vec_init missing kwarg. Should have thrown TypeError',
        binding.vec_init, *buffers, reference.num_agents, 0,
    )

    _expect_type_error(
        'vec_reset missing arg. Should have thrown TypeError',
        binding.vec_reset,
    )

    _expect_type_error(
        'vec_step missing arg. Should have thrown TypeError',
        binding.vec_step,
    )

if __name__ == '__main__':
    test_env_binding()