  dictionary of {state:number} pairs. We then define the value_iteration
  and policy_iteration algorithms."""

- # (Written for the second edition of AIMA; expect some discrepanciecs
- # from the third edition until this gets reviewed.)
-
  from utils import *

  class MDP:
      """A Markov Decision Process, defined by an initial state, transition model,
      and reward function. We also keep track of a gamma value, for use by
      algorithms. The transition model is represented somewhat differently from
-     the text. Instead of T(s, a, s') being a probability number for each
-     state/action/state triplet, we instead have T(s, a) return a list of (p, s')
+     the text. Instead of P(s' | s, a) being a probability number for each
+     state/state/action triplet, we instead have T(s, a) return a list of (p, s')
      pairs. We also keep track of the possible states, terminal states, and
-     actions for each state. [page 615]"""
+     actions for each state. [page 646]"""

      def __init__(self, init, actlist, terminals, gamma=.9):
          update(self, init=init, actlist=actlist, terminals=terminals,
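The transition-model convention described in the updated docstring, where T(s, a) returns a list of (p, s') pairs rather than a single probability per (s, a, s') triple, is easy to illustrate with a toy example. The sketch below is not code from mdp.py; the two-state MDP, the utility table U, and the expected_utility helper are made-up assumptions used only to show how the (p, s') pairs are consumed.

# Illustrative sketch only; not part of mdp.py. Shows the convention where the
# transition model T(s, a) returns a list of (probability, next_state) pairs.

def T(s, a):
    """Transition model for a made-up two-state MDP with states 'A' and 'B'."""
    if a == 'stay':
        return [(1.0, s)]                    # staying always succeeds
    other = 'B' if s == 'A' else 'A'
    return [(0.8, other), (0.2, s)]          # 'move' succeeds with probability 0.8

U = {'A': 0.0, 'B': 1.0}                     # some current utility estimates

def expected_utility(a, s):
    """Expected utility of doing a in s, summing over the (p, s') pairs."""
    return sum(p * U[s1] for (p, s1) in T(s, a))

print(expected_utility('move', 'A'))         # 0.8 * 1.0 + 0.2 * 0.0 = 0.8

Value iteration and policy iteration perform their utility backups with exactly this kind of sum over the (p, s') pairs returned by T(s, a).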