116 changes: 52 additions & 64 deletions BJRL.py
@@ -1,79 +1,66 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from mpl_toolkits import mplot3d
 
-import blackjack
+from blackjack import blackjack
+from StatePacket import StatePacket
 
 
-class BJRL():
+class BJRL:
+    def __init__(self):
 
-    A = np.empty(2, dtype=int)
-    gamma = 0.5
-    epsilon = 1
-    epsilon_decay = 0.99999
-    epsilon_min = 0.01
-    alpha = 0.01
+        self.gamma = 0.5
+        self.epsilon = 1
+        self.epsilon_decay = 0.99999
+        self.epsilon_min = 0.01
+        self.alpha = 0.01
 
-    def __init__(self):
-        self.A[0], self.A[1] = 0, 1
+        self.state_history = StatePacket(self)
         self.policy = np.full((22, 22, 2), 0.5)
-        #self.N = np.zeros((22, 22, 2))
+        self.A = [0, 1]
         self.V = np.zeros((22, 22))
         self.Q = np.zeros((22, 22, 2))
+        self.na = len(self.A)
 
-        self.na = len(self.A)
+        self.av_reward = None
         self.average = []
-        self.epsilon_hist = []
-        self.reward_hist = []
 
-
-    def train(self, num_iters=10000):
-        plt.figure(1)
-        plt.title("State-Value function")
-        self.ax = plt.axes(projection='3d')
-        self.ax.view_init(28, -131)
+    def train(self, num_iters=10000):
+        self.plot()
 
-        plt.figure(2)
-        plt.title("Epsilon vs. Time")
-        plt.plot(0, self.epsilon)
-        input()
 
-        b = blackjack.blackjack()
+        b = blackjack()
         t = 0
-        for it in range(num_iters+1):
-
+        for it in range(num_iters + 1):
             history = self.run_episode(b)
             self.evaluate(history)
-            self.epsilon = max(self.epsilon*self.epsilon_decay, self.epsilon_min)
+            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
 
             if t == 0:
-                t=10000
+                t = 10000
 
                 self.plot()
-                av_reward = np.sum(self.average)/t
+                self.av_reward = np.sum(self.average) / t
                 print("[*] Iteration:", it)
-                print("[*] Average reward:", av_reward)
-
-                self.average=[]
-                self.epsilon_hist.append(self.epsilon)
-                self.reward_hist.append(av_reward)
-            t-=1
+                print("[*] Average reward:", self.av_reward)
+
+                self.average = []
+
+                self.state_history.update()
+
+            t -= 1
 
         self.epsilon = 0
 
         for i in range(len(self.Q)):
             for i2 in range(len(self.Q[0])):
                 self.improve([i, i2])
 
 
     def run_episode(self, b):
         history = []
 
         b.deal()
+        over = False
 
-        while True:
+        while not over:
             agent_sum, dealer_sum = b.get_state()
 
             action = np.random.choice(self.A, p=self.policy[agent_sum, dealer_sum])
@@ -82,48 +69,50 @@ def run_episode(self, b):
 
             history.append([reward, agent_sum, dealer_sum, action])
             self.average.append(reward)
-            #self.N[agent_sum, dealer_sum, action] += 1
 
-            if over:
-                break
-
         return history
 
 
     def evaluate(self, history):
-
         for _ in range(len(history)):
            total, index = self.disc_value(history)
 
-            self.Q[index[0], index[1], index[2]] = self.Q[index[0], index[1], index[2]] + self.alpha * (total - self.Q[index[0], index[1], index[2]])
+            self.Q[index[0], index[1], index[2]] = self.Q[index[0], index[1], index[2]] + self.alpha * \
+                (total - self.Q[index[0], index[1], index[2]])
             self.V[index[0], index[1]] = self.V[index[0], index[1]] + self.alpha * (total - self.V[index[0], index[1]])
 
             self.improve(index)
 
-
     def improve(self, index):
         best_action = np.argmax(self.Q[index[0], index[1]])
-        self.policy[index[0], index[1]] = np.full(self.na, (self.epsilon/self.na))
-        self.policy[index[0], index[1], best_action] = (self.epsilon/self.na) + 1 - self.epsilon
-
-    def disc_value(self, history):
+        self.policy[index[0], index[1]] = np.full(self.na, (self.epsilon / self.na))
+        self.policy[index[0], index[1], best_action] = (self.epsilon / self.na) + 1 - self.epsilon
+
+    def disc_value(self, history):
         v, index = np.split(history.pop(0), [1])
         v = float(v)
 
         for ind, val in enumerate(history):
-            v+=(self.gamma ** ind) * val[0]
-
+            v += (self.gamma ** ind) * val[0]
 
         return v, index
 
     def plot(self):
 
+        plt.figure(1)
+        plt.title("State-Value function")
+        ax = plt.axes(projection='3d')
+        ax.view_init(28, -131)
+
+        plt.figure(2)
+        plt.title("Epsilon vs. Time")
+        plt.plot(0, self.epsilon)
+
         plt.figure(1)
         plt.cla()
-        V = self.V[2:,2:]
+        v = self.V[2:, 2:]
 
-        (x, y) = np.meshgrid(np.arange(V.shape[0]), np.arange(V.shape[1]))
-        self.ax.plot_wireframe(x, y, V, cmap='binary')
+        (x, y) = np.meshgrid(np.arange(v.shape[0]), np.arange(v.shape[1]))
+        ax.plot_wireframe(x, y, v, cmap='binary')
 
         plt.draw()
         plt.xlabel("Dealer_sum")
@@ -133,16 +122,15 @@ def plot(self):
         plt.figure(2)
         plt.cla()
         plt.subplot(2, 1, 1)
-        plt.plot(np.arange(len(self.epsilon_hist)), self.epsilon_hist)
+        plt.plot(np.arange(len(self.state_history.epsilon)), self.state_history.epsilon)
         plt.ylabel("Epsilon")
 
-        plt.subplot(2, 1 , 2)
-        plt.plot(np.arange(len(self.reward_hist)), self.reward_hist)
+        plt.subplot(2, 1, 2)
+        plt.plot(np.arange(len(self.state_history.reward)), self.state_history.reward)
         plt.xlabel("Iterations")
         plt.ylabel("Average Reward")
         plt.pause(0.000001)
 
     def save(self, filename="default"):
         np.save(filename, self.policy)
         print("\n[*] Model save successfully\n")
-
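For reference, the two update rules this file implements can be exercised in isolation. The following is a minimal standalone sketch, not part of the PR; the toy Q-table, the state (0, 1), and the two-step history are invented for illustration:

import numpy as np

# Epsilon-greedy improvement, as in BJRL.improve(): every action keeps
# probability epsilon/na, and the greedy action receives the remaining
# 1 - epsilon, so each policy row still sums to 1.
epsilon, gamma, na = 0.1, 0.5, 2
q = np.zeros((2, 2, 2))
q[0, 1, 1] = 0.7                          # pretend "hold" looks best in state (0, 1)
policy = np.full((2, 2, 2), 0.5)
best_action = np.argmax(q[0, 1])
policy[0, 1] = np.full(na, epsilon / na)
policy[0, 1, best_action] = epsilon / na + 1 - epsilon
print(policy[0, 1])                       # [0.05 0.95]

# Discounted return, as in BJRL.disc_value(): pop the oldest step, then add
# the discounted rewards of the steps that follow it.
history = [np.array([0, 12, 6, 0]),       # [reward, agent_sum, dealer_sum, action]
           np.array([1, 18, 6, 1])]
v, index = np.split(history.pop(0), [1])
v = float(v[0])
for ind, step in enumerate(history):
    v += (gamma ** ind) * step[0]
print(v, index)                           # 1.0 [12  6  0]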
17 changes: 17 additions & 0 deletions StatePacket.py
@@ -0,0 +1,17 @@
+class StatePacket:
+    def __init__(self, trainer):
+        self.trainer = trainer
+        self.epsilon = []
+        self.reward = []
+        self.policy = []
+        self.Q = []
+        self.V = []
+        self.A = []
+
+    def update(self):
+        self.reward.append(self.trainer.av_reward)
+        self.epsilon.append(self.trainer.epsilon)
+        self.Q.append(self.trainer.Q)
+        self.V.append(self.trainer.V)
+        self.A.append(self.trainer.A)
+        self.policy.append(self.trainer.policy)
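The new StatePacket replaces the epsilon_hist and reward_hist lists that BJRL previously kept itself. A rough usage sketch, with a hypothetical stand-in trainer carrying only the attributes update() reads:

import numpy as np
from StatePacket import StatePacket

class FakeTrainer:                        # hypothetical; any object with these
    def __init__(self):                   # attributes satisfies update()
        self.av_reward = 0.0
        self.epsilon = 1.0
        self.A = [0, 1]
        self.Q = np.zeros((22, 22, 2))
        self.V = np.zeros((22, 22))
        self.policy = np.full((22, 22, 2), 0.5)

trainer = FakeTrainer()
packet = StatePacket(trainer)
for _ in range(3):
    trainer.epsilon *= 0.9
    packet.update()
print(packet.epsilon)                     # approximately [0.9, 0.81, 0.729]

Note that update() appends the trainer's arrays by reference, so every entry of packet.Q aliases the same object; appending trainer.Q.copy() would be needed if distinct per-epoch snapshots are intended.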
48 changes: 22 additions & 26 deletions blackjack.py
@@ -1,64 +1,64 @@
 import numpy as np
 import random
 
 
 def build_deck():
-    v=0
-    deck = np.empty(52, dtype=int)
+    v = 0
+    deck = np.empty(52, dtype=int)
 
-    for i in range(9):
-        v+=1
-        deck[i*4:i*4+4]= v
+    for i in range(9):
+        v += 1
+        deck[i*4:i*4+4] = v
 
-    deck[36:52] = 10
+    deck[36:52] = 10
 
-    return deck
+    return deck
 
 
-class blackjack():
+class blackjack():
 
     def __init__(self, verbose=False):
         self.deck = np.empty(52, dtype=int)
         self.verbose = verbose
 
-
-    def deal(self):
-        self.deck = build_deck()
         self.end = 51
         self.dealer = 0
         self.agent = 0
 
+    def deal(self):
+        self.deck = build_deck()
 
-        self.dealer, self.deck = self.draw_card(self.deck, self.dealer)
+        self.dealer, self.deck = self.draw_card(self.deck, self.dealer)
         self.agent, self.deck = self.draw_card(self.deck, self.agent)
 
-        if (self.verbose==True):
+        if self.verbose:
             print("---------Deal----------")
             self.print_state()
 
-    #0: hit, 1: hold
+    # 0: hit, 1: hold
     def choose_action(self, action):
         deck = self.deck
         dealer = self.dealer
         agent = self.agent
         over = False
 
-        if (action==0):
+        if action == 0:
            agent, deck = self.draw_card(deck, agent)
 
-            if(agent > 21):
+            if agent > 21:
                over = True
                reward = -1
            else:
                reward = 0
        else:
            over = True
 
-            while (dealer <= 17):
+            while dealer <= 17:
                dealer, deck = self.draw_card(deck, dealer)
 
-            if (dealer > 21 or agent > dealer):
+            if dealer > 21 or agent > dealer:
                reward = 1
-            elif (agent == dealer):
+            elif agent == dealer:
                reward = 0
            else:
                reward = -1
@@ -67,17 +67,15 @@ def choose_action(self, action):
         self.dealer = dealer
         self.agent = agent
 
-        if (self.verbose==True):
+        if self.verbose:
             print("--------Action---------")
             self.print_state()
 
-
-        return reward, over
-
+        return reward, over
 
     def draw_card(self, deck, hand):
         choice = random.randint(1, self.end)
-        self.end-=1
+        self.end -= 1
 
         hand += deck[choice]
         deck = np.delete(deck, choice)
@@ -94,5 +92,3 @@ def print_state(self, show_deck=False):
 
         if show_deck:
             print("Deck:", self.deck)
-
-
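As a quick smoke test of the refactored environment, one hand can be played end to end with random actions. This sketch is not part of the PR and assumes the module is importable as blackjack:

import random
from blackjack import blackjack

b = blackjack(verbose=True)
b.deal()

over = False
reward = 0
while not over:
    action = random.choice([0, 1])        # 0: hit, 1: hold
    reward, over = b.choose_action(action)

print("Final reward:", reward)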