
Commit

Merge remote-tracking branch 'origin/master'
Sharad24 committed Aug 27, 2020
2 parents 0f4aa53 + a344720 commit 55f5ee0
Showing 37 changed files with 456 additions and 224 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
rev: 19.10b0
hooks:
- id: black
- language_version: python3.6
+ language_version: python3.7

- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.7
9 changes: 9 additions & 0 deletions README.md
@@ -115,3 +115,12 @@ trainer.plot(episode_rewards)
- Epsilon greedy with a neural network
- Bayesian Regression for posterior inference
- Bootstrapped Ensemble


#### Credits and Similar Libraries:
- [Gym](https://gym.openai.com/) - Environments
- [Ray](https://github.com/ray-project/ray)
- [OpenAI Baselines](https://github.com/openai/baselines) - Logger
- [Stable Baselines 3](https://github.com/DLR-RM/stable-baselines3): Stable Baselines aims to provide _baselines_ for Deep RL Algorithms. Part of our code (e.g. Rollout Storage) is inspired by Stable Baselines.
- [pytorch-a2c-ppo-acktr](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail)
- [Deep Contextual Bandits](https://github.com/tensorflow/models/tree/archive/research/deep_contextual_bandits)
24 changes: 19 additions & 5 deletions examples/deep_cb.py
@@ -57,16 +57,28 @@ def main(args):
type=int,
)
parser.add_argument(
"--train-epochs", help="Epochs to train for each update", default=20, type=int,
"--train-epochs",
help="Epochs to train for each update",
default=20,
type=int,
)
parser.add_argument(
"--log-every", help="Timesteps interval for logging", default=100, type=int,
"--log-every",
help="Timesteps interval for logging",
default=100,
type=int,
)
parser.add_argument(
"--logdir", help="Directory to store logs in", default="./logs/", type=str,
"--logdir",
help="Directory to store logs in",
default="./logs/",
type=str,
)
parser.add_argument(
"--ignore-init", help="Initial no. of step to ignore", default=10, type=int,
"--ignore-init",
help="Initial no. of step to ignore",
default=10,
type=int,
)
parser.add_argument(
"--init-train-epochs",
@@ -81,7 +93,9 @@
type=int,
)
parser.add_argument(
"--download", help="Download data for bandit", action="store_true",
"--download",
help="Download data for bandit",
action="store_true",
)

args = parser.parse_args()
80 changes: 80 additions & 0 deletions examples/hyperparameters/optuna/a2c_cartpole-v0.py
@@ -0,0 +1,80 @@
import numpy as np
import optuna
import torch

from genrl.agents.a2c.a2c import A2C
from genrl.environments.suite import VectorEnv
from genrl.trainers.onpolicy import OnPolicyTrainer

env = VectorEnv("CartPole-v0")


def tune_A2C(trial):
    # Define hyperparameters that are relevant for training
    # Choose a suggestion type and range (float/int and log/uniform)
    lr_value = trial.suggest_float("lr_value", 1e-5, 1e-2, log=True)
    lr_policy = trial.suggest_float("lr_policy", 1e-5, 1e-2, log=True)
    rollout_size = trial.suggest_int("rollout_size", 100, 10000, log=True)
    entropy_coeff = trial.suggest_float("entropy_coeff", 5e-4, 2e-1, log=True)

    agent = A2C(
        "mlp",
        env,
        lr_value=lr_value,
        lr_policy=lr_policy,
        rollout_size=rollout_size,
        entropy_coeff=entropy_coeff,
    )
    trainer = OnPolicyTrainer(
        agent, env, log_interval=10, epochs=100, evaluate_episodes=10,
    )
    trainer.train()

    episode, episode_reward = 0, np.zeros(trainer.env.n_envs)
    episode_rewards = []
    state = trainer.env.reset()
    while True:
        if trainer.off_policy:
            action = trainer.agent.select_action(state, deterministic=True)
        else:
            action, _, _ = trainer.agent.select_action(state)

        if isinstance(action, torch.Tensor):
            action = action.numpy()

        next_state, reward, done, _ = trainer.env.step(action)

        episode_reward += reward
        state = next_state
        if np.any(done):
            for i, di in enumerate(done):
                if di:
                    episode += 1
                    episode_rewards.append(episode_reward[i])
                    episode_reward[i] = 0
        if episode == trainer.evaluate_episodes:
            print(
                "Evaluated for {} episodes, Mean Reward: {}, Std Deviation for the Reward: {}".format(
                    trainer.evaluate_episodes,
                    np.mean(episode_rewards),
                    np.std(episode_rewards),
                )
            )
            break

    return np.mean(episode_rewards)


agent_name = "A2C"  # replace
study_name = "{}-3".format(agent_name)
study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    storage="sqlite:///{}.db".format(study_name),
    # load_if_exists=True
)
study.optimize(tune_A2C, n_trials=20)

print("Best Trial Results:")
for key, value in study.best_trial.__dict__.items():
    print("{} : {}".format(key, value))
13 changes: 13 additions & 0 deletions examples/hyperparameters/optuna/read.py
@@ -0,0 +1,13 @@
import optuna

path_name = "A2C-CartPole-v0-ep100"
study = optuna.create_study(
    study_name=path_name,
    direction="maximize",
    storage="sqlite:///{}.db".format(path_name),
    load_if_exists=True,
)

print("Best Trial Results:")
for key, value in study.best_trial.__dict__.items():
    print("{} : {}".format(key, value))
79 changes: 79 additions & 0 deletions examples/hyperparameters/optuna/td3_pendulum-v0.py
@@ -0,0 +1,79 @@
import numpy as np
import optuna
import torch

from genrl.agents.td3.td3 import TD3
from genrl.environments.suite import VectorEnv
from genrl.trainers.offpolicy import OffPolicyTrainer

env = VectorEnv("Pendulum-v0")


def objective(trial):
    lr_value = trial.suggest_float("lr_value", 1e-6, 1e-1, log=True)
    lr_policy = trial.suggest_float("lr_policy", 1e-6, 1e-1, log=True)
    replay_size = trial.suggest_int("replay_size", 1e2, 1e5, log=True)
    max_ep_len = trial.suggest_int("max_ep_len", 1e3, 50000, log=True)

    agent = TD3(
        "mlp", env, lr_value=lr_value, lr_policy=lr_policy, replay_size=replay_size
    )
    trainer = OffPolicyTrainer(
        agent,
        env,
        log_interval=5,
        epochs=100,
        max_timesteps=16500,
        evaluate_episodes=10,
        max_ep_len=max_ep_len,
    )
    trainer.train()

    episode = 0
    episode_rewards = []
    state = trainer.env.reset()

    while True:
        if trainer.off_policy:
            action = trainer.agent.select_action(state, deterministic=True)
        else:
            action, _, _ = trainer.agent.select_action(state)

        if isinstance(action, torch.Tensor):
            action = action.numpy()

        next_state, reward, done, _ = trainer.env.step(action)

        state = next_state

        if np.any(done):
            for i, di in enumerate(done):
                if di:
                    episode += 1
                    episode_rewards.append(trainer.env.episode_reward[i])
                    trainer.env.episode_reward[i] = 0

        if episode == trainer.evaluate_episodes:
            eval_reward = float(np.mean(episode_rewards))

            trial.report(eval_reward, int(episode))
            break

    return eval_reward


study = optuna.create_study(
    study_name="1",
    direction="maximize",
    storage="sqlite:///td3--pendulum-v0--replay_size-max_ep_len-lr_value-lr_policy.db",
    load_if_exists=True,
)
study.optimize(objective, n_trials=20)
df = study.trials_dataframe(attrs=("number", "value", "params"))
df.to_pickle("logs/optuna_logs.pkl")

print("Best trial: ")
for key, value in study.best_trial.__dict__.items():
    print("{}: {}".format(key, value))
print("Eval Reward: ", study.best_value)
print("Params: ", study.best_params)
3 changes: 2 additions & 1 deletion examples/run_cb.py
@@ -50,7 +50,8 @@ def run(args, agent, bandit, plot=True):
if plot:
fig, axs = plt.subplots(3, 2, figsize=(10, 10))
fig.suptitle(
f"{agent.__class__.__name__} on {bandit.__class__.__name__}", fontsize=14,
f"{agent.__class__.__name__} on {bandit.__class__.__name__}",
fontsize=14,
)
axs[0, 0].scatter(list(range(len(bandit.regret_hist))), results["regrets"])
axs[0, 0].set_title("Regret History")
3 changes: 1 addition & 2 deletions genrl/agents/bandits/contextual/common/bayesian.py
@@ -32,8 +32,7 @@ def __init__(self, in_features: int, out_features: int, bias: bool = True) -> No
self.reset_parameters()

def reset_parameters(self) -> None:
"""Resets weight and bias parameters of the layer.
"""
"""Resets weight and bias parameters of the layer."""
self.w_mu.data.normal_(0, 0.1)
self.w_sigma.data.normal_(0, 0.1)
self.b_mu.data.normal_(0, 0.1) if self.bias else None
3 changes: 1 addition & 2 deletions genrl/agents/deep/a2c/a2c.py
@@ -222,8 +222,7 @@ def get_logging_params(self) -> Dict[str, Any]:
return logs

def empty_logs(self):
"""Empties logs
"""
"""Empties logs"""
self.logs = {}
self.logs["policy_loss"] = []
self.logs["value_loss"] = []
6 changes: 2 additions & 4 deletions genrl/agents/deep/base/base.py
@@ -65,8 +65,7 @@ def __init__(
set_seeds(self.seed, self.env)

def _create_model(self) -> None:
"""Function to initialize all models of the agent
"""
"""Function to initialize all models of the agent"""
raise NotImplementedError

def select_action(
@@ -111,6 +110,5 @@ def get_logging_params(self) -> Dict[str, Any]:
raise NotImplementedError

def empty_logs(self):
"""Empties logs
"""
"""Empties logs"""
raise NotImplementedError
3 changes: 1 addition & 2 deletions genrl/agents/deep/base/offpolicy.py
@@ -58,8 +58,7 @@ def update_params_before_select_action(self, timestep: int) -> None:
pass

def update_params(self, update_interval: int) -> None:
"""Update parameters of the model
"""
"""Update parameters of the model"""
raise NotImplementedError

def update_target_model(self) -> None:
3 changes: 1 addition & 2 deletions genrl/agents/deep/base/onpolicy.py
@@ -45,8 +45,7 @@ def __init__(
raise NotImplementedError

def update_params(self) -> None:
"""Update parameters of the model
"""
"""Update parameters of the model"""
raise NotImplementedError

def collect_rewards(self, dones: List[bool], timestep: int):
3 changes: 1 addition & 2 deletions genrl/agents/deep/ddpg/ddpg.py
@@ -137,8 +137,7 @@ def get_logging_params(self) -> Dict[str, Any]:
return logs

def empty_logs(self):
"""Empties logs
"""
"""Empties logs"""
self.logs = {}
self.logs["policy_loss"] = []
self.logs["value_loss"] = []
3 changes: 1 addition & 2 deletions genrl/agents/deep/dqn/base.py
@@ -250,8 +250,7 @@ def get_logging_params(self) -> Dict[str, Any]:
return logs

def empty_logs(self) -> None:
"""Empties logs
"""
"""Empties logs"""
self.logs = {}
self.logs["value_loss"] = []
self.logs["epsilon"] = []
18 changes: 14 additions & 4 deletions genrl/agents/deep/dqn/utils.py
@@ -8,7 +8,10 @@


def ddqn_q_target(
- agent: DQN, next_states: torch.Tensor, rewards: torch.Tensor, dones: torch.Tensor,
+ agent: DQN,
+ next_states: torch.Tensor,
+ rewards: torch.Tensor,
+ dones: torch.Tensor,
) -> torch.Tensor:
"""Double Q-learning target
@@ -115,7 +118,10 @@ def categorical_q_values(agent: DQN, states: torch.Tensor, actions: torch.Tensor


def categorical_q_target(
- agent: DQN, next_states: np.ndarray, rewards: List[float], dones: List[bool],
+ agent: DQN,
+ next_states: np.ndarray,
+ rewards: List[float],
+ dones: List[bool],
):
"""Projected Distribution of Q-values
@@ -164,10 +170,14 @@ def categorical_q_target(

target_q_values = torch.zeros(next_q_values.size())
target_q_values.view(-1).index_add_(
- 0, (l + offset).view(-1), (next_q_values * (u.float() - bz)).view(-1),
+ 0,
+ (l + offset).view(-1),
+ (next_q_values * (u.float() - bz)).view(-1),
)
target_q_values.view(-1).index_add_(
- 0, (u + offset).view(-1), (next_q_values * (bz - l.float())).view(-1),
+ 0,
+ (u + offset).view(-1),
+ (next_q_values * (bz - l.float())).view(-1),
)
return target_q_values

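For readers unfamiliar with what the two index_add_ calls above compute, here is a self-contained sketch of the categorical (C51) projection they implement; the function name, tensor names, and support bounds are illustrative and not genrl's API:

import torch


def project_categorical(
    next_dist, rewards, dones, gamma=0.99, v_min=-10.0, v_max=10.0, num_atoms=51
):
    """Sketch: project Bellman-updated atoms back onto the fixed support.

    Expected shapes: next_dist (batch, num_atoms) softmax probabilities,
    rewards and dones (batch,).
    """
    batch_size = next_dist.size(0)
    delta_z = (v_max - v_min) / (num_atoms - 1)
    support = torch.linspace(v_min, v_max, num_atoms)

    # Bellman update of each support atom, clamped to the support range
    tz = rewards.float().unsqueeze(1) + gamma * (1.0 - dones.float().unsqueeze(1)) * support
    tz = tz.clamp(v_min, v_max)
    bz = (tz - v_min) / delta_z  # fractional atom index
    l, u = bz.floor().long(), bz.ceil().long()
    # Keep probability mass when tz lands exactly on an atom (l == u)
    l[(u > 0) & (l == u)] -= 1
    u[(l < num_atoms - 1) & (l == u)] += 1

    # Split each atom's probability between its two neighbouring atoms
    offset = (torch.arange(batch_size) * num_atoms).unsqueeze(1)
    target = torch.zeros_like(next_dist)
    target.view(-1).index_add_(
        0, (l + offset).view(-1), (next_dist * (u.float() - bz)).view(-1)
    )
    target.view(-1).index_add_(
        0, (u + offset).view(-1), (next_dist * (bz - l.float())).view(-1)
    )
    return target
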
3 changes: 1 addition & 2 deletions genrl/agents/deep/ppo1/ppo1.py
@@ -231,8 +231,7 @@ def get_logging_params(self) -> Dict[str, Any]:
return logs

def empty_logs(self):
"""Empties logs
"""
"""Empties logs"""
self.logs = {}
self.logs["policy_loss"] = []
self.logs["value_loss"] = []
3 changes: 1 addition & 2 deletions genrl/agents/deep/sac/sac.py
@@ -253,8 +253,7 @@ def get_logging_params(self) -> Dict[str, Any]:
return logs

def empty_logs(self):
"""Empties logs
"""
"""Empties logs"""
self.logs = {}
self.logs["value_loss"] = []
self.logs["policy_loss"] = []
3 changes: 1 addition & 2 deletions genrl/agents/deep/td3/td3.py
@@ -159,8 +159,7 @@ def get_logging_params(self) -> Dict[str, Any]:
return logs

def empty_logs(self):
"""Empties logs
"""
"""Empties logs"""
self.logs = {}
self.logs["policy_loss"] = []
self.logs["value_loss"] = []