diff --git a/BJRL.py b/BJRL.py
index fe7e167..61841af 100644
--- a/BJRL.py
+++ b/BJRL.py
@@ -1,79 +1,66 @@
 import numpy as np
 import matplotlib.pyplot as plt
-from mpl_toolkits import mplot3d
-import blackjack
+from blackjack import blackjack
+from StatePacket import StatePacket
 
-class BJRL():
+class BJRL:
+    def __init__(self):
-    A = np.empty(2, dtype=int)
-    gamma = 0.5
-    epsilon = 1
-    epsilon_decay = 0.99999
-    epsilon_min = 0.01
-    alpha = 0.01
+        self.gamma = 0.5
+        self.epsilon = 1
+        self.epsilon_decay = 0.99999
+        self.epsilon_min = 0.01
+        self.alpha = 0.01
 
-    def __init__(self):
-        self.A[0], self.A[1] = 0, 1
+        self.state_history = StatePacket(self)
         self.policy = np.full((22, 22, 2), 0.5)
-        #self.N = np.zeros((22, 22, 2))
+        self.A = [0, 1]
         self.V = np.zeros((22, 22))
         self.Q = np.zeros((22, 22, 2))
-        self.na = len(self.A)
+        self.na = len(self.A)
+        self.av_reward = None
         self.average = []
-        self.epsilon_hist = []
-        self.reward_hist = []
-
-    def train(self, num_iters=10000):
-        plt.figure(1)
-        plt.title("State-Value function")
-        self.ax = plt.axes(projection='3d')
-        self.ax.view_init(28, -131)
+    def train(self, num_iters=10000):
         self.plot()
-        plt.figure(2)
-        plt.title("Epsilon vs. Time")
-        plt.plot(0, self.epsilon)
-        input()
-
-        b = blackjack.blackjack()
+        b = blackjack()
         t = 0
-
-        for it in range(num_iters+1):
+
+        for it in range(num_iters + 1):
             history = self.run_episode(b)
             self.evaluate(history)
-            self.epsilon = max(self.epsilon*self.epsilon_decay, self.epsilon_min)
+            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
 
             if t == 0:
-                t=10000
+                t = 10000
                 self.plot()
-                av_reward = np.sum(self.average)/t
+                self.av_reward = np.sum(self.average) / t
                 print("[*] Iteration:", it)
-                print("[*] Average reward:", av_reward)
-
-                self.average=[]
-                self.epsilon_hist.append(self.epsilon)
-                self.reward_hist.append(av_reward)
-
-            t-=1
-
+                print("[*] Average reward:", self.av_reward)
+
+                self.average = []
+
+                self.state_history.update()
+
+            t -= 1
+
         self.epsilon = 0
         for i in range(len(self.Q)):
            for i2 in range(len(self.Q[0])):
                 self.improve([i, i2])
 
-
     def run_episode(self, b):
         history = []
-
         b.deal()
+        over = False
 
-        while True:
+        while not over:
             agent_sum, dealer_sum = b.get_state()
             action = np.random.choice(self.A, p=self.policy[agent_sum, dealer_sum])
@@ -82,48 +69,50 @@ def run_episode(self, b):
             history.append([reward, agent_sum, dealer_sum, action])
             self.average.append(reward)
-            #self.N[agent_sum, dealer_sum, action] += 1
-
-            if over:
-                break
 
         return history
 
-
     def evaluate(self, history):
-
         for _ in range(len(history)):
             total, index = self.disc_value(history)
-            self.Q[index[0], index[1], index[2]] = self.Q[index[0], index[1], index[2]] + self.alpha * (total - self.Q[index[0], index[1], index[2]])
+            self.Q[index[0], index[1], index[2]] = self.Q[index[0], index[1], index[2]] + self.alpha * \
+                                                   (total - self.Q[index[0], index[1], index[2]])
             self.V[index[0], index[1]] = self.V[index[0], index[1]] + self.alpha * (total - self.V[index[0], index[1]])
             self.improve(index)
 
-
     def improve(self, index):
         best_action = np.argmax(self.Q[index[0], index[1]])
-        self.policy[index[0], index[1]] = np.full(self.na, (self.epsilon/self.na))
-        self.policy[index[0], index[1], best_action] = (self.epsilon/self.na) + 1 - self.epsilon
-
-    def disc_value(self, history):
+        self.policy[index[0], index[1]] = np.full(self.na, (self.epsilon / self.na))
+        self.policy[index[0], index[1], best_action] = (self.epsilon / self.na) + 1 - self.epsilon
+    def disc_value(self, history):
         v, index = np.split(history.pop(0), [1])
         v = float(v)
         for ind, val in enumerate(history):
-            v+=(self.gamma ** ind) * val[0]
-
+            v += (self.gamma ** ind) * val[0]
         return v, index
 
     def plot(self):
+
+        plt.figure(1)
+        plt.title("State-Value function")
+        ax = plt.axes(projection='3d')
+        ax.view_init(28, -131)
+
+        plt.figure(2)
+        plt.title("Epsilon vs. Time")
+        plt.plot(0, self.epsilon)
+
         plt.figure(1)
         plt.cla()
-        V = self.V[2:,2:]
+        v = self.V[2:, 2:]
 
-        (x, y) = np.meshgrid(np.arange(V.shape[0]), np.arange(V.shape[1]))
-        self.ax.plot_wireframe(x, y, V, cmap='binary')
+        (x, y) = np.meshgrid(np.arange(v.shape[0]), np.arange(v.shape[1]))
+        ax.plot_wireframe(x, y, v, cmap='binary')
 
         plt.draw()
         plt.xlabel("Dealer_sum")
@@ -133,11 +122,11 @@ def plot(self):
         plt.figure(2)
         plt.cla()
         plt.subplot(2, 1, 1)
-        plt.plot(np.arange(len(self.epsilon_hist)), self.epsilon_hist)
+        plt.plot(np.arange(len(self.state_history.epsilon)), self.state_history.epsilon)
         plt.ylabel("Epsilon")
 
-        plt.subplot(2, 1 , 2)
-        plt.plot(np.arange(len(self.reward_hist)), self.reward_hist)
+        plt.subplot(2, 1, 2)
+        plt.plot(np.arange(len(self.state_history.reward)), self.state_history.reward)
         plt.xlabel("Iterations")
         plt.ylabel("Average Reward")
         plt.pause(0.000001)
@@ -145,4 +134,3 @@ def plot(self):
     def save(self, filename="default"):
         np.save(filename, self.policy)
         print("\n[*] Model save successfully\n")
-
diff --git a/StatePacket.py b/StatePacket.py
new file mode 100644
index 0000000..7a58b5c
--- /dev/null
+++ b/StatePacket.py
@@ -0,0 +1,17 @@
+class StatePacket:
+    def __init__(self, trainer):
+        self.trainer = trainer
+        self.epsilon = []
+        self.reward = []
+        self.policy = []
+        self.Q = []
+        self.V = []
+        self.A = []
+
+    def update(self):
+        self.reward.append(self.trainer.av_reward)
+        self.epsilon.append(self.trainer.epsilon)
+        self.Q.append(self.trainer.Q)
+        self.V.append(self.trainer.V)
+        self.A.append(self.trainer.A)
+        self.policy.append(self.trainer.policy)
diff --git a/blackjack.py b/blackjack.py
index fdc0284..b942242 100644
--- a/blackjack.py
+++ b/blackjack.py
@@ -1,51 +1,51 @@
 import numpy as np
 import random
 
+
 def build_deck():
-    v=0
-    deck = np.empty(52, dtype=int)
+    v = 0
+    deck = np.empty(52, dtype=int)
 
-    for i in range(9):
-        v+=1
-        deck[i*4:i*4+4]= v
+    for i in range(9):
+        v += 1
+        deck[i*4:i*4+4] = v
 
-    deck[36:52] = 10
+    deck[36:52] = 10
 
-    return deck
+    return deck
 
-class blackjack():
+class blackjack():
     def __init__(self, verbose=False):
         self.deck = np.empty(52, dtype=int)
         self.verbose = verbose
-
-
-    def deal(self):
-        self.deck = build_deck()
         self.end = 51
         self.dealer = 0
         self.agent = 0
 
+    def deal(self):
+        self.deck = build_deck()
+
         self.dealer, self.deck = self.draw_card(self.deck, self.dealer)
         self.dealer, self.deck = self.draw_card(self.deck, self.dealer)
         self.agent, self.deck = self.draw_card(self.deck, self.agent)
 
-        if (self.verbose==True):
+        if self.verbose:
             print("---------Deal----------")
             self.print_state()
 
-    #0: hit, 1: hold
+    # 0: hit, 1: hold
     def choose_action(self, action):
         deck = self.deck
         dealer = self.dealer
         agent = self.agent
         over = False
 
-        if (action==0):
+        if action == 0:
             agent, deck = self.draw_card(deck, agent)
 
-            if(agent > 21):
+            if agent > 21:
                 over = True
                 reward = -1
             else:
@@ -53,12 +53,12 @@ def choose_action(self, action):
         else:
             over = True
 
-            while (dealer <= 17):
+            while dealer <= 17:
                 dealer, deck = self.draw_card(deck, dealer)
 
-            if (dealer > 21 or agent > dealer):
+            if dealer > 21 or agent > dealer:
                 reward = 1
-            elif (agent == dealer):
+            elif agent == dealer:
                 reward = 0
             else:
                 reward = -1
@@ -67,17 +67,15 @@ def choose_action(self, action):
         self.dealer = dealer
         self.agent = agent
 
-        if (self.verbose==True):
+        if self.verbose:
             print("--------Action---------")
             self.print_state()
-
-
-        return reward, over
+        return reward, over
 
     def draw_card(self, deck, hand):
         choice = random.randint(1, self.end)
-        self.end-=1
+        self.end -= 1
 
         hand += deck[choice]
         deck = np.delete(deck, choice)
@@ -94,5 +92,3 @@ def print_state(self, show_deck=False):
 
         if show_deck:
             print("Deck:", self.deck)
-
-
diff --git a/main.py b/main.py
index e825e5d..ff73592 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,8 @@
 import numpy as np
 import copy
+from blackjack import blackjack
+from BJRL import BJRL
 
-import blackjack
-import BJRL
 
 def main():
     print("\nWelcome to my Blackjack RL bot\n")
@@ -10,38 +10,37 @@ def main():
     while True:
         user = input("1. Game demo\n2. Train\n3. Run agent\n4. Evaluate agent\n5. Quit\n")
 
-        if (user == "1"):
+        if user == "1":
             game_demo()
-        elif (user == "2"):
+        elif user == "2":
             train_agent()
-        elif (user == "3"):
+        elif user == "3":
             run_agent()
-        elif (user == "4"):
+        elif user == "4":
             evaluate_agent()
-        elif (user == "5"):
+        elif user == "5":
             break
         else:
             print("\n[-] Error: please enter a valid option\n")
 
 
 def game_demo():
-    b = blackjack.blackjack(verbose=True)
+    b = blackjack(verbose=True)
 
-    while True:
+    while True:
         b.deal()
+        over = False
 
-        while True:
+        while not over:
             reward, over = b.choose_action(int(input("choose action: ")))
+            print("[*] Reward:", reward)
 
-            if over:
-                print("[*] Reward:", reward)
-                break
-
-        if input("\nPress enter to continue\n")!="":
+        if not input("\nPress enter to continue\n") == "":
             break
 
+
 def train_agent():
-    agent = BJRL.BJRL()
+    agent = BJRL()
     agent.train(700000)
 
     user = input("\nWould you like to save? y/n\n")
@@ -49,14 +48,15 @@ def train_agent():
 
     if user == "y":
         agent.save()
 
+
 def run_agent():
-    b = blackjack.blackjack()
+    b = blackjack()
 
     print("[*] Loading agent")
     agent = Agent()
     agent.load_agent()
 
-    while True:
+    while True:
         b.deal()
 
         b2 = copy.deepcopy(b)
@@ -66,12 +66,14 @@ def run_agent():
 
         over, over2 = False, False
 
-        while True:
+        while True:
             state1 = b.get_state()
             state2 = b2.get_state()
 
-            print_states(state1, state2)
+            reward = None
+            reward2 = None
+
             if not over:
                 reward, over = b.choose_action(int(input("choose action: ")))
@@ -82,43 +84,45 @@ def run_agent():
 
         print_states(state1, state2)
 
-        if (reward > reward2):
+        if reward > reward2:
             print("You win!")
-        elif (reward < reward2):
+        elif reward < reward2:
             print("AI wins!")
         else:
             print("Tie")
             break
-
-        if input("\nPress enter to continue\n")!="":
+
+        if input("\nPress enter to continue\n") != "":
             break
 
+
 def print_states(state1, state2):
     print("\n --------Action---------")
     print("       P1   |   AI")
 
-    if len(str(state1[1]))==1:
-        print("Dealer:", state1[1] , "   |  ", state2[1])
+    if len(str(state1[1])) == 1:
+        print("Dealer:", state1[1], "   |  ", state2[1])
     else:
-        print("Dealer:", state1[1] , "  |  ", state2[1])
+        print("Dealer:", state1[1], "  |  ", state2[1])
 
-    if len(str(state1[0]))==1:
+    if len(str(state1[0])) == 1:
         print("Player:", state1[0], "   |  ", state2[0])
     else:
         print("Player:", state1[0], "  |  ", state2[0])
 
+
 def evaluate_agent(num_iters=50000):
-    b = blackjack.blackjack()
+    b = blackjack()
 
-    agent = Agent()
+    agent = Agent()
     agent.load_agent(input("\nEnter filename: ") + ".npy")
 
-    record = np.empty(num_iters+1)
+    record = np.empty(num_iters + 1)
 
     print("[*] Running samples")
 
-    for i in range(num_iters+1):
+    for i in range(num_iters + 1):
         record[i] = agent.run_episode(b)
 
     wins = np.sum(record == 1)
@@ -130,10 +134,10 @@ def evaluate_agent(num_iters=50000):
     print("Wins: {}".format(wins))
     print("Ties: {}".format(ties))
     print("Losses: {}".format(losses))
-    print("Win probability: {}% | Average player: 44-48%\n".format(round((wins/num_iters) * 100, 2)))
-
+    print("Win probability: {}% | Average player: 44-48%\n".format(round((wins / num_iters) * 100, 2)))
 
-class Agent():
+
+class Agent:
     def __init__(self):
         self.policy = np.empty((22, 22, 2))
 
@@ -144,21 +148,21 @@ def load_agent(self, filename="default.npy"):
 
     def get_action(self, state):
         return np.argmax(self.policy[state[0], state[1]])
-
+
     def run_episode(self, b):
         b.deal()
+        over = False
+        reward = None
 
-        while True:
+        while not over:
             agent_sum, dealer_sum = b.get_state()
 
-            #actions array substituted
+            # actions array substituted
             action = np.random.choice([0, 1], p=self.policy[agent_sum, dealer_sum])
 
             reward, over = b.choose_action(action)
 
-            if over:
-                break
-
         return reward
 
-main()
\ No newline at end of file
+
+main()