# Maze Runner using Reinforcement Learning
#
# Generates a random maze and then uses a uniform random walk policy and
# the Iterative Policy Evaluation algorithm to solve the maze.
# The initial maze and the solution steps are stored as PNG images.
#
# inspired by Lecture 36 of Udemy/Lazy Programmer Inc/Reinforcement Learning
#
# done by sy2002 on 19th of May 2018
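#
# The evaluation loop below implements the Bellman expectation update for the
# fixed uniform random policy pi(a|s) = 1 / |A(s)|:
#
#     V(s) <- sum_a pi(a|s) * (r(s,a) + gamma * V(s'))
#
# where s' is the state reached by taking action a in state s; sweeps over all
# states are repeated until the largest change of V drops below SMALL_ENOUGH.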
import grid_world
import maze_generator
import numpy as np
# maze size
mx = 40
my = 40
# image size
img_dx = 500
img_dy = 500
WIN_REWARD = 1e12 # needs to be large due to float accuracy
STEP_PUNISH = -0.1 # cost of "walking"
SMALL_ENOUGH = 1e-3 # threshold for convergence
gamma = 0.999 # discount factor
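# (roughly speaking, the goal's value reaches a state discounted by at least
# gamma^distance and diluted by the random walk, so WIN_REWARD has to be many
# orders of magnitude larger than the step costs to remain visible everywhere)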
# index 0 = wall, index 1 = path, index 2 = start/end, index 3 = shortest path from start to end
rgb_color_map = [(0, 0, 0), (255, 255, 255), (255, 0, 0), (0, 0, 255)]
# generate maze and save the "unsolved" maze
maze = maze_generator.make_maze(mx, my)
maze_generator.add_start_and_end(maze, rgb_color_map)
maze_generator.save_maze(maze, "mrunner_1.png", img_dx, img_dy, rgb_color_map)
# move the maze to "Grid World" by doing the following:
# 1. Make sure that each step "costs something", so set everything to a reward of STEP_PUNISH
# 2. Reward the end point with WIN_REWARD
# 3. For each state aka position: define the possible movements (based on the maze layout)
# 4. Make the end point a terminal state
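# note: find_start_end() appears to return (x, y) tuples, while the grid and
# the rewards/actions dictionaries work with (row, column) = (y, x) - hence
# the swapped indices such as (start[1], start[0]) in the code below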
start, end = maze_generator.find_start_end(maze)
grid = grid_world.Grid(mx, my, (start[1], start[0]))
rewards = {}
for y in range(my):
    for x in range(mx):
        if maze[y][x] == 1:  # 1 means "path", 0 means "wall"
            rewards[(y, x)] = STEP_PUNISH  # make sure each step "costs something"
rewards[(end[1], end[0])] = WIN_REWARD  # the goal offers the large WIN_REWARD
# create possible actions for each position
actions = {}
for y in range(my):
    for x in range(mx):
        possible_actions = []
        if y > 0 and maze[y - 1][x] > 0: possible_actions.append("U")
        if y < my - 1 and maze[y + 1][x] > 0: possible_actions.append("D")
        if x > 0 and maze[y][x - 1] > 0: possible_actions.append("L")
        if x < mx - 1 and maze[y][x + 1] > 0: possible_actions.append("R")
        # only relevant if we are standing on start/end or on a path, i.e. not
        # "inside" a wall, and if there is at least one possible action
        if maze[y][x] > 0 and len(possible_actions):
            actions[(y, x)] = set(possible_actions)
actions.pop((end[1], end[0]))  # make the end point a terminal state
grid.set(rewards, actions)
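# for reference, the Grid API from grid_world as used in this script:
#   all_states()                  - every reachable state
#   set_state(s)                  - place the agent at state s
#   move(a)                       - perform action a and return its reward
#   undo_move(a)                  - revert action a
#   current_state() / get_state() - the agent's current position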
# =================================================================
# Execute the iterative policy evaluation
# =================================================================
states = grid.all_states()
V = {} # initialize V(s) = 0
for s in states:
    V[s] = 0
# repeat until convergence; note that V is updated in place, so within one
# sweep newly computed values are used right away
verbose_counter = 0
while True:
    biggest_change = 0
    for s in states:
        old_v = V[s]
        # V(s) only has a value if it's not a terminal state
        if s in grid.actions:
            new_v = 0  # we will accumulate the answer here
            p_a = 1.0 / len(grid.actions[s])  # uniform random policy: each action has equal probability
            for a in grid.actions[s]:
                grid.set_state(s)
                r = grid.move(a)
                new_v += p_a * (r + gamma * V[grid.current_state()])
            V[s] = new_v
        biggest_change = max(biggest_change, np.abs(old_v - V[s]))
    # print the current maximum change every 100 sweeps as a progress indicator
    verbose_counter += 1
    if verbose_counter % 100 == 0:
        print(biggest_change)
    if biggest_change < SMALL_ENOUGH:
        break
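# the end point is a terminal state (it was removed from the actions), so the
# loop above never assigned it a value; set it explicitly to its reward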
V[(end[1], end[0])] = WIN_REWARD
# =================================================================
# Paint shortest path by following the value function
# =================================================================
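# walk greedily with respect to V: from every position take the action whose
# successor state has the highest value until the end point is reached,
# marking each visited cell and saving one animation frame per step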
grid.set_state((start[1], start[0]))
n = 0
while True:
    best_action = None
    best_action_value = float("-inf")
    # one-step lookahead: try each possible action, remember the one that leads
    # to the successor state with the highest value, then undo the move
    for a in grid.actions[grid.get_state()]:
        grid.move(a)
        if V[grid.get_state()] > best_action_value:
            best_action_value = V[grid.get_state()]
            best_action = a
        grid.undo_move(a)
    if best_action is not None:
        grid.move(best_action)
    gs = grid.get_state()
    if (gs[1], gs[0]) == (end[1], end[0]):
        break
    maze[gs[0]][gs[1]] = 3  # color map index 3 = shortest path
    print(gs, V[gs])
    n = n + 1
    maze_generator.save_maze(maze, "mrunner_2_{0:0=3d}.png".format(n), img_dx, img_dy, rgb_color_map)
# save the last frame another 25 times, so that the final frame stands still
# for a while when the PNG sequence is turned into an animated GIF
for z in range(25):
    maze_generator.save_maze(maze, "mrunner_2_{0:0=3d}.png".format(n + z), img_dx, img_dy, rgb_color_map)
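# the PNG sequence can be assembled into an animated GIF with an external
# tool, for example ImageMagick:
#     convert -delay 5 -loop 0 mrunner_2_*.png mrunner.gif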