diff --git a/reinforcement_learning/reinforce.py b/reinforcement_learning/reinforce.py
index a222ff804c..bb2f057832 100644
--- a/reinforcement_learning/reinforce.py
+++ b/reinforcement_learning/reinforce.py
@@ -11,97 +11,169 @@
 parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
-parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
+
+parser.add_argument('--env', type=str, default='CartPole-v0', metavar='E',
+                    help='environment (default: CartPole-v0)')
+
+parser.add_argument('--discount_factor', type=float, default=0.99, metavar='G',
                     help='discount factor (default: 0.99)')
+
+parser.add_argument('--hidden_size', type=int, default=128, metavar='H',
+                    help='number of hidden units in the policy network (default: 128)')
+
+parser.add_argument('--learning_rate', type=float, default=1e-2, metavar='L',
+                    help='learning rate for the Adam optimizer (default: 1e-2)')
+
+parser.add_argument('--dropout', type=float, default=0.6, metavar='D',
+                    help='dropout probability for the policy network (default: 0.6)')
+
 parser.add_argument('--seed', type=int, default=543, metavar='N',
                     help='random seed (default: 543)')
+
 parser.add_argument('--render', action='store_true',
                     help='render the environment')
+
 parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                     help='interval between training status logs (default: 10)')
+
 args = parser.parse_args()
 
-env = gym.make('CartPole-v1')
-env.seed(args.seed)
-torch.manual_seed(args.seed)
 
+class PolicyNetwork(nn.Module):
+
+    """
+    Implements the policy for the REINFORCE algorithm
+    """
+
+    def __init__(self, num_features, num_actions, hidden_size, learning_rate, dropout_prob):
+        super(PolicyNetwork, self).__init__()
+        self._input_layer = nn.Linear(num_features, hidden_size)
+        self._dropout = nn.Dropout(p=dropout_prob)
+        self._output_layer = nn.Linear(hidden_size, num_actions)
+        self._optimizer = optim.Adam(self.parameters(), lr=learning_rate)
+
+    def forward(self, X):
+        X = self._input_layer(X)
+        X = self._dropout(X)
+        X = F.relu(X)
+        action_logits = self._output_layer(X)
+        policy = F.softmax(action_logits, dim=1)
+        return policy
+
+    def update(self, returns, action_logits):
+        self._optimizer.zero_grad()
+        loss = self.loss_fn(returns, action_logits)
+        loss.backward()
+        self._optimizer.step()
+
+    @staticmethod
+    def loss_fn(returns, action_logits):
+        batch_size = len(returns)
+        policy_losses = [-action_logits[b] * returns[b] for b in range(batch_size)]
+        loss = torch.cat(policy_losses).sum() / batch_size
+        return loss
+
+
+class ReinforceAgent:
+
+    """
+    Implements the REINFORCE agent
+    (action/reinforcement interface plus internal state)
+    """
+
+    def __init__(self, num_features, num_actions, hidden_size, learning_rate, dropout_prob, discount_factor):
+        self._network = PolicyNetwork(num_features, num_actions, hidden_size, learning_rate, dropout_prob)
+        self._gamma = discount_factor
+        self._rewards_buffer = []
+        self._action_logits_buffer = []
 
+    def action(self, state):
-class Policy(nn.Module):
-    def __init__(self):
-        super(Policy, self).__init__()
-        self.affine1 = nn.Linear(4, 128)
-        self.dropout = nn.Dropout(p=0.6)
-        self.affine2 = nn.Linear(128, 2)
+        x = torch.from_numpy(state).float().unsqueeze(0)
+        policy = self._network(x)
 
-        self.saved_log_probs = []
-        self.rewards = []
+        policy = Categorical(policy)
+        action = policy.sample()
 
-    def forward(self, x):
-        x = self.affine1(x)
-        x = self.dropout(x)
-        x = F.relu(x)
-        action_scores = self.affine2(x)
-        return F.softmax(action_scores, dim=1)
+        self._action_logits_buffer.append(policy.log_prob(action))
+        return action.item()
 
-policy = Policy()
-optimizer = optim.Adam(policy.parameters(), lr=1e-2)
-eps = np.finfo(np.float32).eps.item()
+    def reinforce(self, reward, terminal):
+        self._rewards_buffer.append(reward)
+        if terminal:
+            returns = self.compute_returns()
+            self._network.update(returns, self._action_logits_buffer)
+            self._rewards_buffer.clear()
+            self._action_logits_buffer.clear()
 
+    def compute_returns(self, nan_preventing_eps=np.finfo(np.float32).eps.item()):
+        returns = []
+        current_return = 0
+        for reward in reversed(self._rewards_buffer):
+            current_return = reward + self._gamma * current_return
+            returns = [current_return] + returns
+        returns = torch.tensor(returns)
+        returns = (returns - returns.mean()) / (returns.std() + nan_preventing_eps)
+        return returns
 
-def select_action(state):
-    state = torch.from_numpy(state).float().unsqueeze(0)
-    probs = policy(state)
-    m = Categorical(probs)
-    action = m.sample()
-    policy.saved_log_probs.append(m.log_prob(action))
-    return action.item()
 
+def run_episode(agent, env, render):
+    """
+    Runs a full episode on the environment
+    """
 
-def finish_episode():
-    R = 0
-    policy_loss = []
-    returns = []
-    for r in policy.rewards[::-1]:
-        R = r + args.gamma * R
-        returns.insert(0, R)
-    returns = torch.tensor(returns)
-    returns = (returns - returns.mean()) / (returns.std() + eps)
-    for log_prob, R in zip(policy.saved_log_probs, returns):
-        policy_loss.append(-log_prob * R)
-    optimizer.zero_grad()
-    policy_loss = torch.cat(policy_loss).sum()
-    policy_loss.backward()
-    optimizer.step()
-    del policy.rewards[:]
-    del policy.saved_log_probs[:]
+    ep_reward = 0
+    ep_steps = 0
+    state = env.reset()
+    terminal = False
+    while not terminal:
+
+        action = agent.action(state)
+        state, reward, terminal, _ = env.step(action)
+
+        agent.reinforce(reward, terminal)
+
+        ep_reward += reward
+        ep_steps += 1
+
+        if render:
+            env.render()
+
+    return ep_reward, ep_steps
+
+
+if __name__ == '__main__':
+
+    env = gym.make(args.env)
+    env.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    agent = ReinforceAgent(
+        num_features=env.observation_space.shape[0],
+        num_actions=env.action_space.n,
+        hidden_size=args.hidden_size,
+        learning_rate=args.learning_rate,
+        dropout_prob=args.dropout,
+        discount_factor=args.discount_factor
+    )
 
-def main():
     running_reward = 10
-    for i_episode in count(1):
-        state, ep_reward = env.reset(), 0
-        for t in range(1, 10000):  # Don't infinite loop while learning
-            action = select_action(state)
-            state, reward, done, _ = env.step(action)
-            if args.render:
-                env.render()
-            policy.rewards.append(reward)
-            ep_reward += reward
-            if done:
-                break
 
+    # Run infinitely many episodes
+    for episode in count(1):
+
+        ep_reward, ep_steps = run_episode(agent, env, args.render)
+
+        # Update cumulative reward
         running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
-        finish_episode()
-        if i_episode % args.log_interval == 0:
+
+        # Log results
+        if episode % args.log_interval == 0:
             print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
-                  i_episode, ep_reward, running_reward))
+                  episode, ep_reward, running_reward))
+
+        # Check if we have "solved" the problem
         if running_reward > env.spec.reward_threshold:
             print("Solved! Running reward is now {} and "
-                  "the last episode runs to {} time steps!".format(running_reward, t))
+                  "the last episode runs to {} time steps!".format(running_reward, ep_steps))
             break
-
-
-if __name__ == '__main__':
-    main()
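For reference, below is a minimal standalone sketch of the discounted-return calculation that the new compute_returns method performs (G_t = r_t + gamma * G_{t+1}, followed by standardisation). The function name discounted_returns and the example rewards are illustrative only and are not part of the patch.

    # Standalone sketch (not part of the patch) of the return computation
    # used by compute_returns above.
    import numpy as np
    import torch

    def discounted_returns(rewards, gamma, eps=np.finfo(np.float32).eps.item()):
        returns = []
        current_return = 0.0
        # Accumulate discounted returns from the last reward backwards:
        # G_t = r_t + gamma * G_{t+1}
        for reward in reversed(rewards):
            current_return = reward + gamma * current_return
            returns.insert(0, current_return)
        returns = torch.tensor(returns)
        # Standardise to zero mean and unit variance; eps avoids division by zero
        return (returns - returns.mean()) / (returns.std() + eps)

    # Example: three rewards of 1.0 with gamma = 0.99 give raw returns
    # [2.9701, 1.99, 1.0] before standardisation.
    print(discounted_returns([1.0, 1.0, 1.0], gamma=0.99))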