116 changes: 52 additions & 64 deletions BJRL.py
@@ -1,79 +1,66 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from mpl_toolkits import mplot3d
 
-import blackjack
+from blackjack import blackjack
+from StatePacket import StatePacket
 
 
-class BJRL():
+class BJRL:
+    def __init__(self):
 
-    A = np.empty(2, dtype=int)
-    gamma = 0.5
-    epsilon = 1
-    epsilon_decay = 0.99999
-    epsilon_min = 0.01
-    alpha = 0.01
+        self.gamma = 0.5
+        self.epsilon = 1
+        self.epsilon_decay = 0.99999
+        self.epsilon_min = 0.01
+        self.alpha = 0.01
 
-    def __init__(self):
-        self.A[0], self.A[1] = 0, 1
+        self.state_history = StatePacket(self)
         self.policy = np.full((22, 22, 2), 0.5)
-        #self.N = np.zeros((22, 22, 2))
+        self.A = [0, 1]
         self.V = np.zeros((22, 22))
         self.Q = np.zeros((22, 22, 2))
+        self.na = len(self.A)
 
-        self.na = len(self.A)
+        self.av_reward = None
         self.average = []
-        self.epsilon_hist = []
-        self.reward_hist = []
 
-
-    def train(self, num_iters=10000):
-        plt.figure(1)
-        plt.title("State-Value function")
-        self.ax = plt.axes(projection='3d')
-        self.ax.view_init(28, -131)
+    def train(self, num_iters=10000):
+        self.plot()
 
-        plt.figure(2)
-        plt.title("Epsilon vs. Time")
-        plt.plot(0, self.epsilon)
-        input()
 
-        b = blackjack.blackjack()
+        b = blackjack()
         t = 0
-        for it in range(num_iters+1):
-
+        for it in range(num_iters + 1):
             history = self.run_episode(b)
             self.evaluate(history)
-            self.epsilon = max(self.epsilon*self.epsilon_decay, self.epsilon_min)
+            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
 
             if t == 0:
-                t=10000
+                t = 10000
 
                 self.plot()
-                av_reward = np.sum(self.average)/t
+                self.av_reward = np.sum(self.average) / t
                 print("[*] Iteration:", it)
-                print("[*] Average reward:", av_reward)
-
-                self.average=[]
-                self.epsilon_hist.append(self.epsilon)
-                self.reward_hist.append(av_reward)
-            t-=1
+                print("[*] Average reward:", self.av_reward)
+
+                self.average = []
+
+                self.state_history.update()
+
+            t -= 1
 
         self.epsilon = 0
 
         for i in range(len(self.Q)):
             for i2 in range(len(self.Q[0])):
                 self.improve([i, i2])
 
 
     def run_episode(self, b):
         history = []
 
         b.deal()
+        over = False
 
-        while True:
+        while not over:
             agent_sum, dealer_sum = b.get_state()
 
             action = np.random.choice(self.A, p=self.policy[agent_sum, dealer_sum])
@@ -82,48 +69,50 @@ def run_episode(self, b):
 
             history.append([reward, agent_sum, dealer_sum, action])
             self.average.append(reward)
-            #self.N[agent_sum, dealer_sum, action] += 1
 
-            if over:
-                break
-
         return history
 
 
     def evaluate(self, history):
-
         for _ in range(len(history)):
            total, index = self.disc_value(history)
 
-            self.Q[index[0], index[1], index[2]] = self.Q[index[0], index[1], index[2]] + self.alpha * (total - self.Q[index[0], index[1], index[2]])
+            self.Q[index[0], index[1], index[2]] = self.Q[index[0], index[1], index[2]] + self.alpha * \
+                (total - self.Q[index[0], index[1], index[2]])
             self.V[index[0], index[1]] = self.V[index[0], index[1]] + self.alpha * (total - self.V[index[0], index[1]])
 
             self.improve(index)
 
-
     def improve(self, index):
         best_action = np.argmax(self.Q[index[0], index[1]])
-        self.policy[index[0], index[1]] = np.full(self.na, (self.epsilon/self.na))
-        self.policy[index[0], index[1], best_action] = (self.epsilon/self.na) + 1 - self.epsilon
-
-    def disc_value(self, history):
+        self.policy[index[0], index[1]] = np.full(self.na, (self.epsilon / self.na))
+        self.policy[index[0], index[1], best_action] = (self.epsilon / self.na) + 1 - self.epsilon
+
+    def disc_value(self, history):
         v, index = np.split(history.pop(0), [1])
         v = float(v)
 
         for ind, val in enumerate(history):
-            v+=(self.gamma ** ind) * val[0]
-
+            v += (self.gamma ** ind) * val[0]
 
         return v, index
 
     def plot(self):
 
+        plt.figure(1)
+        plt.title("State-Value function")
+        ax = plt.axes(projection='3d')
+        ax.view_init(28, -131)
+
+        plt.figure(2)
+        plt.title("Epsilon vs. Time")
+        plt.plot(0, self.epsilon)
+
         plt.figure(1)
         plt.cla()
-        V = self.V[2:,2:]
+        v = self.V[2:, 2:]
 
-        (x, y) = np.meshgrid(np.arange(V.shape[0]), np.arange(V.shape[1]))
-        self.ax.plot_wireframe(x, y, V, cmap='binary')
+        (x, y) = np.meshgrid(np.arange(v.shape[0]), np.arange(v.shape[1]))
+        ax.plot_wireframe(x, y, v, cmap='binary')
 
         plt.draw()
         plt.xlabel("Dealer_sum")
@@ -133,16 +122,15 @@ def plot(self):
         plt.figure(2)
         plt.cla()
         plt.subplot(2, 1, 1)
-        plt.plot(np.arange(len(self.epsilon_hist)), self.epsilon_hist)
+        plt.plot(np.arange(len(self.state_history.epsilon)), self.state_history.epsilon)
         plt.ylabel("Epsilon")
 
-        plt.subplot(2, 1 , 2)
-        plt.plot(np.arange(len(self.reward_hist)), self.reward_hist)
+        plt.subplot(2, 1, 2)
+        plt.plot(np.arange(len(self.state_history.reward)), self.state_history.reward)
         plt.xlabel("Iterations")
         plt.ylabel("Average Reward")
         plt.pause(0.000001)
 
     def save(self, filename="default"):
         np.save(filename, self.policy)
         print("\n[*] Model save successfully\n")
-
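For reference, the two update rules this file implements can be exercised in isolation. The following is a minimal standalone sketch, not part of the PR; the toy Q-table, the state (0, 1), and the two-step history are invented for illustration:

import numpy as np

# Epsilon-greedy improvement, as in BJRL.improve(): every action keeps
# probability epsilon/na, and the greedy action receives the remaining
# 1 - epsilon, so each policy row still sums to 1.
epsilon, gamma, na = 0.1, 0.5, 2
q = np.zeros((2, 2, 2))
q[0, 1, 1] = 0.7                          # pretend "hold" looks best in state (0, 1)
policy = np.full((2, 2, 2), 0.5)
best_action = np.argmax(q[0, 1])
policy[0, 1] = np.full(na, epsilon / na)
policy[0, 1, best_action] = epsilon / na + 1 - epsilon
print(policy[0, 1])                       # [0.05 0.95]

# Discounted return, as in BJRL.disc_value(): pop the oldest step, then add
# the discounted rewards of the steps that follow it.
history = [np.array([0, 12, 6, 0]),       # [reward, agent_sum, dealer_sum, action]
           np.array([1, 18, 6, 1])]
v, index = np.split(history.pop(0), [1])
v = float(v[0])
for ind, step in enumerate(history):
    v += (gamma ** ind) * step[0]
print(v, index)                           # 1.0 [12  6  0]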
17 changes: 17 additions & 0 deletions StatePacket.py
@@ -0,0 +1,17 @@
+class StatePacket:
+    def __init__(self, trainer):
+        self.trainer = trainer
+        self.epsilon = []
+        self.reward = []
+        self.policy = []
+        self.Q = []
+        self.V = []
+        self.A = []
+
+    def update(self):
+        self.reward.append(self.trainer.av_reward)
+        self.epsilon.append(self.trainer.epsilon)
+        self.Q.append(self.trainer.Q)
+        self.V.append(self.trainer.V)
+        self.A.append(self.trainer.A)
+        self.policy.append(self.trainer.policy)
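The new StatePacket replaces the epsilon_hist and reward_hist lists that BJRL previously kept itself. A rough usage sketch, with a hypothetical stand-in trainer carrying only the attributes update() reads:

import numpy as np
from StatePacket import StatePacket

class FakeTrainer:                        # hypothetical; any object with these
    def __init__(self):                   # attributes satisfies update()
        self.av_reward = 0.0
        self.epsilon = 1.0
        self.A = [0, 1]
        self.Q = np.zeros((22, 22, 2))
        self.V = np.zeros((22, 22))
        self.policy = np.full((22, 22, 2), 0.5)

trainer = FakeTrainer()
packet = StatePacket(trainer)
for _ in range(3):
    trainer.epsilon *= 0.9
    packet.update()
print(packet.epsilon)                     # approximately [0.9, 0.81, 0.729]

Note that update() appends the trainer's arrays by reference, so every entry of packet.Q aliases the same object; appending trainer.Q.copy() would be needed if distinct per-epoch snapshots are intended.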
48 changes: 22 additions & 26 deletions blackjack.py
@@ -1,64 +1,64 @@
 import numpy as np
 import random
 
 
 def build_deck():
-    v=0
-    deck = np.empty(52, dtype=int)
+    v = 0
+    deck = np.empty(52, dtype=int)
 
-    for i in range(9):
-        v+=1
-        deck[i*4:i*4+4]= v
+    for i in range(9):
+        v += 1
+        deck[i*4:i*4+4] = v
 
-    deck[36:52] = 10
+    deck[36:52] = 10
 
-    return deck
+    return deck
 
 
-class blackjack():
+class blackjack():
 
     def __init__(self, verbose=False):
         self.deck = np.empty(52, dtype=int)
         self.verbose = verbose
 
-
-    def deal(self):
-        self.deck = build_deck()
         self.end = 51
         self.dealer = 0
         self.agent = 0
 
+    def deal(self):
+        self.deck = build_deck()
 
-        self.dealer, self.deck = self.draw_card(self.deck, self.dealer)
+        self.dealer, self.deck = self.draw_card(self.deck, self.dealer)
         self.agent, self.deck = self.draw_card(self.deck, self.agent)
 
-        if (self.verbose==True):
+        if self.verbose:
             print("---------Deal----------")
             self.print_state()
 
-    #0: hit, 1: hold
+    # 0: hit, 1: hold
     def choose_action(self, action):
         deck = self.deck
         dealer = self.dealer
         agent = self.agent
         over = False
 
-        if (action==0):
+        if action == 0:
            agent, deck = self.draw_card(deck, agent)
 
-            if(agent > 21):
+            if agent > 21:
                over = True
                reward = -1
            else:
                reward = 0
        else:
            over = True
 
-            while (dealer <= 17):
+            while dealer <= 17:
                dealer, deck = self.draw_card(deck, dealer)
 
-            if (dealer > 21 or agent > dealer):
+            if dealer > 21 or agent > dealer:
                reward = 1
-            elif (agent == dealer):
+            elif agent == dealer:
                reward = 0
            else:
                reward = -1
@@ -67,17 +67,15 @@ def choose_action(self, action):
         self.dealer = dealer
         self.agent = agent
 
-        if (self.verbose==True):
+        if self.verbose:
             print("--------Action---------")
             self.print_state()
 
-
-        return reward, over
-
+        return reward, over
 
     def draw_card(self, deck, hand):
         choice = random.randint(1, self.end)
-        self.end-=1
+        self.end -= 1
 
         hand += deck[choice]
         deck = np.delete(deck, choice)
@@ -94,5 +92,3 @@ def print_state(self, show_deck=False):
 
         if show_deck:
             print("Deck:", self.deck)
-
-
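As a quick smoke test of the refactored environment, one hand can be played end to end with random actions. This sketch is not part of the PR and assumes the module is importable as blackjack:

import random
from blackjack import blackjack

b = blackjack(verbose=True)
b.deal()

over = False
reward = 0
while not over:
    action = random.choice([0, 1])        # 0: hit, 1: hold
    reward, over = b.choose_action(action)

print("Final reward:", reward)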