import argparse
import itertools
import os
import random
from datetime import datetime, timedelta

import flappy_bird_gymnasium  # noqa: F401 -- imported for its side effect of registering the FlappyBird envs
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
import yaml
from torch import nn
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

from dqn import DQN
from experience_replay import ReplayMemory

# For printing date and time
DATE_FORMAT = "%m-%d %H:%M:%S"

# Directory for saving run info
RUNS_DIR = "runs"
os.makedirs(RUNS_DIR, exist_ok=True)

# 'Agg': generate plots as images and save them to file instead of rendering to screen.
# Note: this also means the "live" plots in train_a2c() will not open a window.
matplotlib.use('Agg')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu' # force CPU; the GPU is not always faster than the CPU due to the overhead of moving data to the GPU


def train_dqn(hyperparameter_set="flappybird1"):
    print("Training with DQN...")
    # Delegate to the Agent class below, which owns the DQN training loop.
    # (DQN from dqn.py is just the network; it has no run/select_action methods.)
    agent = Agent(hyperparameter_set=hyperparameter_set)
    agent.run(is_training=True)


def test_dqn(hyperparameter_set="flappybird1"):
    print("Testing with DQN...")
    # Agent.run() handles environment creation, model loading, and the action loop.
    agent = Agent(hyperparameter_set=hyperparameter_set)
    agent.run(is_training=False, render=True)


def train_a2c(env_name="FlappyBird-v0", timesteps=10_000_000):
    print("Training with A2C...")

    # Create environment. The Monitor wrapper is needed so that episode
    # statistics appear under the "episode" key in `infos` for the callback below.
    env = DummyVecEnv([lambda: Monitor(gym.make(env_name))])

    # Initialize model
    model = A2C("MlpPolicy", env, verbose=1, learning_rate=1e-3, gamma=0.99)

    # Initialize metrics storage
    episode_rewards = []
    losses = []
    episodes = []

    # Initialize Matplotlib for live plotting.
    # Note: with the 'Agg' backend selected at the top of this file, no window
    # will actually be shown; switch backends to watch training live.
    plt.ion()
    fig, ax = plt.subplots(2, 1, figsize=(10, 10))
    ax[0].set_title("Episode Rewards")
    ax[0].set_xlabel("Episode")
    ax[0].set_ylabel("Reward")
    ax[1].set_title("Policy Loss")
    ax[1].set_xlabel("Episode")
    ax[1].set_ylabel("Loss")

    # Callback function for logging and plotting
    def custom_callback(_locals, _globals):
        if "infos" in _locals:
            infos = _locals["infos"]
            for info in infos:
                if "episode" in info.keys():
                    # Log rewards
                    episode_rewards.append(info["episode"]["r"])
                    episodes.append(len(episodes) + 1)

                    # Update reward plot
                    ax[0].plot(episodes, episode_rewards, color="blue")
                    ax[0].relim()
                    ax[0].autoscale_view()

        # Log loss if present in locals (only available during the phase of
        # learning from which SB3 invokes the callback)
        if "policy_loss" in _locals:
            losses.append(_locals["policy_loss"])

            # Update loss plot
            ax[1].plot(episodes, losses, color="red")
            ax[1].relim()
            ax[1].autoscale_view()

        plt.draw()
        plt.pause(0.01)
        return True

    # Train model with custom callback for logging
    model.learn(total_timesteps=timesteps, callback=custom_callback)

    # Save model
    model.save("runs/a2c_flappy_bird")
    print("A2C model saved to runs/a2c_flappy_bird.zip")

    # Finalize plots
    plt.ioff()
    plt.show()


def test_a2c(env_name="FlappyBird-v0"):
    print("Testing with A2C...")

    # render_mode="human" makes the environment draw to the screen
    env = DummyVecEnv([lambda: gym.make(env_name, render_mode="human")])
    model = A2C.load("runs/a2c_flappy_bird.zip")

    obs = env.reset()
    while True:
        # Predict action
        action, _ = model.predict(obs)

        # Step the environment (a VecEnv step returns obs, rewards, dones, infos)
        obs, _, done, _ = env.step(action)
        if done:
            break


# Deep Q-Learning Agent
class Agent:

    def __init__(self, hyperparameter_set):
        with open('hyperparameters.yml', 'r') as file:
            all_hyperparameter_sets = yaml.safe_load(file)
            hyperparameters = all_hyperparameter_sets[hyperparameter_set]
            # print(hyperparameters)

        self.hyperparameter_set = hyperparameter_set

        # Hyperparameters (adjustable)
        self.env_id             = hyperparameters['env_id']
        self.learning_rate_a    = hyperparameters['learning_rate_a']      # learning rate (alpha)
        self.discount_factor_g  = hyperparameters['discount_factor_g']    # discount factor (gamma)
        self.network_sync_rate  = hyperparameters['network_sync_rate']    # number of steps the agent takes before syncing the policy and target network
        self.replay_memory_size = hyperparameters['replay_memory_size']   # size of replay memory
        self.mini_batch_size    = hyperparameters['mini_batch_size']      # size of the training data set sampled from the replay memory
        self.epsilon_init       = hyperparameters['epsilon_init']         # 1 = 100% random actions
        self.epsilon_decay      = hyperparameters['epsilon_decay']        # epsilon decay rate
        self.epsilon_min        = hyperparameters['epsilon_min']          # minimum epsilon value
        self.stop_on_reward     = hyperparameters['stop_on_reward']       # stop an episode after reaching this reward
        self.fc1_nodes          = hyperparameters['fc1_nodes']
        self.env_make_params    = hyperparameters.get('env_make_params', {})  # optional environment-specific parameters, default to empty dict
        self.enable_double_dqn  = hyperparameters['enable_double_dqn']    # double DQN on/off flag

        # Neural network
        self.loss_fn = nn.MSELoss()  # NN loss function. MSE (Mean Squared Error) can be swapped for something else.
        self.optimizer = None        # NN optimizer. Initialized later.

        # Paths to run info
        self.LOG_FILE   = os.path.join(RUNS_DIR, f'{self.hyperparameter_set}.log')
        self.MODEL_FILE = os.path.join(RUNS_DIR, f'{self.hyperparameter_set}.pt')
        self.GRAPH_FILE = os.path.join(RUNS_DIR, f'{self.hyperparameter_set}.png')
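
    # Example of what an entry in hyperparameters.yml might look like for this
    # class (illustrative values only -- the real file ships with the repo and
    # its keys must match the ones read in __init__ above):
    #
    #   flappybird1:
    #     env_id: FlappyBird-v0
    #     learning_rate_a: 0.0001
    #     discount_factor_g: 0.99
    #     network_sync_rate: 10
    #     replay_memory_size: 100000
    #     mini_batch_size: 32
    #     epsilon_init: 1
    #     epsilon_decay: 0.99995
    #     epsilon_min: 0.05
    #     stop_on_reward: 100000
    #     fc1_nodes: 512
    #     enable_double_dqn: True
    #     env_make_params:
    #       use_lidar: False    # hypothetical env-specific kwarg passed through to gym.make()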

    def run(self, is_training=True, render=False):
        if is_training:
            start_time = datetime.now()
            last_graph_update_time = start_time

            log_message = f"{start_time.strftime(DATE_FORMAT)}: Training starting..."
            print(log_message)
            with open(self.LOG_FILE, 'w') as file:
                file.write(log_message + '\n')

        # Create an instance of the environment.
        # "**self.env_make_params" passes in environment-specific parameters from hyperparameters.yml.
        env = gym.make(self.env_id, render_mode='human' if render else None, **self.env_make_params)

        # Number of possible actions
        num_actions = env.action_space.n

        # Get observation space size
        num_states = env.observation_space.shape[0]  # Expecting type: Box(low, high, (shape0,), float64)

        # List to keep track of rewards collected per episode
        rewards_per_episode = []

        # Create the policy network. The number of nodes in the hidden layer can be adjusted.
        policy_dqn = DQN(num_states, num_actions, self.fc1_nodes, self.enable_double_dqn).to(device)

        if is_training:
            # Initialize epsilon
            epsilon = self.epsilon_init

            # Initialize replay memory
            memory = ReplayMemory(self.replay_memory_size)

            # Create the target network and make it identical to the policy network
            target_dqn = DQN(num_states, num_actions, self.fc1_nodes, self.enable_double_dqn).to(device)
            target_dqn.load_state_dict(policy_dqn.state_dict())

            # Policy network optimizer. "Adam" can be swapped for something else.
            self.optimizer = torch.optim.Adam(policy_dqn.parameters(), lr=self.learning_rate_a)

            # List to keep track of epsilon decay
            epsilon_history = []

            # Track the number of steps taken. Used for syncing policy => target network.
            step_count = 0

            # Track best reward
            best_reward = -9999999
        else:
            # Load learned policy
            policy_dqn.load_state_dict(torch.load(self.MODEL_FILE))

            # Switch model to evaluation mode
            policy_dqn.eval()

        # Train INDEFINITELY; manually stop the run when you are satisfied (or unsatisfied) with the results
        for episode in itertools.count():

            state, _ = env.reset()  # Initialize environment. reset() returns (state, info).
            state = torch.tensor(state, dtype=torch.float, device=device)  # Convert state to a tensor directly on the device

            terminated = False    # True when the agent reaches the goal or fails
            episode_reward = 0.0  # Used to accumulate rewards per episode

            # Perform actions until the episode terminates or reaches the max reward
            # (on some envs, the agent can train to a point where it NEVER terminates, so stop_on_reward is necessary)
            while not terminated and episode_reward < self.stop_on_reward:

                # Select action based on epsilon-greedy
                if is_training and random.random() < epsilon:
                    # Select random action
                    action = env.action_space.sample()
                    action = torch.tensor(action, dtype=torch.int64, device=device)
                else:
                    # Select best action
                    with torch.no_grad():
                        # state.unsqueeze(dim=0): PyTorch expects a batch dimension, so add one,
                        #   i.e. tensor([1, 2, 3]) unsqueezes to tensor([[1, 2, 3]]).
                        # squeeze() then removes the batch dimension from the network output.
                        # argmax finds the index of the largest element, i.e. the greedy action.
                        action = policy_dqn(state.unsqueeze(dim=0)).squeeze().argmax()

                # Execute action. truncated and info are not used.
                new_state, reward, terminated, truncated, info = env.step(action.item())

                # Accumulate rewards
                episode_reward += reward

                # Convert new state and reward to tensors on the device
                new_state = torch.tensor(new_state, dtype=torch.float, device=device)
                reward = torch.tensor(reward, dtype=torch.float, device=device)

                if is_training:
                    # Save experience into memory
                    memory.append((state, action, new_state, reward, terminated))

                    # Increment step counter
                    step_count += 1

                # Move to the next state
                state = new_state

            # Keep track of the rewards collected per episode
            rewards_per_episode.append(episode_reward)

            # Save model when a new best reward is obtained
            if is_training:
                if episode_reward > best_reward:
                    log_message = (f"{datetime.now().strftime(DATE_FORMAT)}: New best reward {episode_reward:0.1f} "
                                   f"({(episode_reward - best_reward) / abs(best_reward) * 100:+.1f}%) at episode {episode}, saving model...")
                    print(log_message)
                    with open(self.LOG_FILE, 'a') as file:
                        file.write(log_message + '\n')

                    torch.save(policy_dqn.state_dict(), self.MODEL_FILE)
                    best_reward = episode_reward

                # Update graph every x seconds
                current_time = datetime.now()
                if current_time - last_graph_update_time > timedelta(seconds=10):
                    self.save_graph(rewards_per_episode, epsilon_history)
                    last_graph_update_time = current_time

                # If enough experience has been collected
                if len(memory) > self.mini_batch_size:
                    mini_batch = memory.sample(self.mini_batch_size)
                    self.optimize(mini_batch, policy_dqn, target_dqn)

                    # Decay epsilon
                    epsilon = max(epsilon * self.epsilon_decay, self.epsilon_min)
                    epsilon_history.append(epsilon)

                    # Copy the policy network to the target network after a certain number of steps
                    if step_count > self.network_sync_rate:
                        target_dqn.load_state_dict(policy_dqn.state_dict())
                        step_count = 0

    def save_graph(self, rewards_per_episode, epsilon_history):
        """
        Saves two separate plots for mean rewards and epsilon decay.

        Args:
            rewards_per_episode (list): List of rewards per episode.
            epsilon_history (list): List of epsilon values during training.
        """
        # Calculate mean rewards over a sliding window of 100 episodes
        mean_rewards = np.zeros(len(rewards_per_episode))
        for x in range(len(mean_rewards)):
            mean_rewards[x] = np.mean(rewards_per_episode[max(0, x - 99):(x + 1)])

        # Plot and save the mean-rewards graph
        plt.figure(figsize=(12, 6))  # Larger figure size for clarity
        plt.plot(mean_rewards, label="Mean Rewards")
        plt.xlabel("Episodes")
        plt.ylabel("Mean Rewards")
        plt.title("Mean Rewards per Episode")
        plt.legend()
        plt.grid(True)
        mean_rewards_path = os.path.join(RUNS_DIR, "mean_rewards.png")
        plt.savefig(mean_rewards_path)
        plt.close()
        print(f"Mean rewards plot saved to {mean_rewards_path}")

        # Plot and save the epsilon-decay graph
        plt.figure(figsize=(12, 6))  # Larger figure size for clarity
        plt.plot(epsilon_history, label="Epsilon Decay", color='orange')
        plt.xlabel("Episodes")
        plt.ylabel("Epsilon")
        plt.title("Epsilon Decay over Training")
        plt.legend()
        plt.grid(True)
        epsilon_decay_path = os.path.join(RUNS_DIR, "epsilon_decay.png")
        plt.savefig(epsilon_decay_path)
        plt.close()
        print(f"Epsilon decay plot saved to {epsilon_decay_path}")

    # Optimize policy network
    def optimize(self, mini_batch, policy_dqn, target_dqn):

        # Transpose the list of experiences and separate each element
        states, actions, new_states, rewards, terminations = zip(*mini_batch)

        # Stack tensors to create batch tensors, e.g. tensor([[1,2,3], [4,5,6], ...])
        states = torch.stack(states)
        actions = torch.stack(actions)
        new_states = torch.stack(new_states)
        rewards = torch.stack(rewards)
        terminations = torch.tensor(terminations).float().to(device)

        with torch.no_grad():
            if self.enable_double_dqn:
                # Double DQN: the policy network selects the best next action,
                # the target network evaluates it.
                best_actions_from_policy = policy_dqn(new_states).argmax(dim=1)

                target_q = rewards + (1 - terminations) * self.discount_factor_g * \
                           target_dqn(new_states).gather(dim=1, index=best_actions_from_policy.unsqueeze(dim=1)).squeeze()
            else:
                # Calculate target Q values (expected returns)
                target_q = rewards + (1 - terminations) * self.discount_factor_g * target_dqn(new_states).max(dim=1)[0]
                '''
                    target_dqn(new_states)  ==> tensor([[1,2,3],[4,5,6]])
                        .max(dim=1)         ==> torch.return_types.max(values=tensor([3,6]), indices=tensor([2,2]))
                            [0]             ==> tensor([3,6])
                '''

        # Calculate Q values from current policy
        current_q = policy_dqn(states).gather(dim=1, index=actions.unsqueeze(dim=1)).squeeze()
        '''
            policy_dqn(states)                ==> tensor([[1,2,3],[4,5,6]])
            actions.unsqueeze(dim=1)          ==> tensor([[1],[0]]) for actions tensor([1,0])
            .gather(dim=1, index=...)         ==> tensor([[2],[4]])  (Q value of the taken action in each row)
            .squeeze()                        ==> tensor([2,4])
        '''

        # Compute loss
        loss = self.loss_fn(current_q, target_q)

        # Optimize the model (backpropagation)
        self.optimizer.zero_grad()  # Clear gradients
        loss.backward()             # Compute gradients
        self.optimizer.step()       # Update network parameters, i.e. weights and biases
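

# ------------------------------------------------------------------
# Interfaces assumed from the local modules (for reference only).
#
# ReplayMemory (experience_replay.py) is used above as append(transition),
# sample(n), and len(). A minimal sketch of such a class, assuming a
# deque-backed buffer (the shipped implementation may differ):
#
#   from collections import deque
#   import random
#
#   class ReplayMemory:
#       def __init__(self, maxlen):
#           self.memory = deque([], maxlen=maxlen)  # old transitions drop off automatically
#
#       def append(self, transition):
#           # transition = (state, action, new_state, reward, terminated)
#           self.memory.append(transition)
#
#       def sample(self, sample_size):
#           return random.sample(self.memory, sample_size)
#
#       def __len__(self):
#           return len(self.memory)
#
# DQN (dqn.py) is constructed above as DQN(state_dim, action_dim, hidden_dim,
# enable_double_dqn), and its forward() must map a batch of states to a batch
# of per-action Q values; its exact architecture lives in dqn.py and is not
# reproduced here.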


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train or test RL models for Flappy Bird.")
    parser.add_argument("--train", help="Training mode", action="store_true")
    parser.add_argument("--algorithm", choices=["dqn", "a2c"], required=True, help="Choose the RL algorithm")
    parser.add_argument("--hyperparameters", help="The hyperparameter set to use (e.g., 'flappybird1')", default="flappybird1")
    args = parser.parse_args()

    if args.algorithm == "dqn":
        dql = Agent(hyperparameter_set=args.hyperparameters)
        if args.train:
            dql.run(is_training=True, render=False)
        else:
            dql.run(is_training=False, render=True)
    elif args.algorithm == "a2c":
        if args.train:
            train_a2c()
        else:
            test_a2c()
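
# Example invocations (assuming a 'flappybird1' entry exists in hyperparameters.yml):
#
#   python agent.py --algorithm dqn --train    # train DQN
#   python agent.py --algorithm dqn            # watch the trained DQN play
#   python agent.py --algorithm a2c --train    # train A2C
#   python agent.py --algorithm a2c            # watch the trained A2C play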