VI.py
import numpy as np


def value(policy, n_states, transition_probabilities, reward, discount,
          threshold=1e-2):
    """
    Find the value function associated with a policy.

    policy: List of action ints for each state.
    n_states: Number of states. int.
    transition_probabilities: NumPy array mapping (state, action, next_state)
        to transition probabilities.
    reward: Vector of rewards for each state.
    discount: MDP discount factor. float.
    threshold: Convergence threshold, default 1e-2. float.
    -> Array of values for each state
    """
    v = np.zeros(n_states)

    # Iterative policy evaluation: sweep the states until the largest update
    # falls below the convergence threshold.
    diff = float("inf")
    while diff > threshold:
        diff = 0
        for s in range(n_states):
            vs = v[s]
            a = policy[s]
            # Bellman expectation backup for the action chosen by the policy.
            v[s] = sum(transition_probabilities[s, a, k] *
                       (reward[k] + discount * v[k])
                       for k in range(n_states))
            diff = max(diff, abs(vs - v[s]))

    return v
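
# Illustrative usage sketch (an addition; not part of the original module):
# evaluating a fixed policy on a hypothetical two-state MDP. P is indexed as
# (state, action, next_state), matching the convention used in this file.
#
#     >>> P = np.zeros((2, 2, 2))
#     >>> P[0, 0] = [0.9, 0.1]     # state 0, action 0: mostly stay in state 0
#     >>> P[0, 1] = [0.1, 0.9]     # state 0, action 1: mostly move to state 1
#     >>> P[1, :] = [0.0, 1.0]     # state 1 is absorbing under both actions
#     >>> r = np.array([0.0, 1.0])
#     >>> value([1, 0], 2, P, r, 0.9)   # action 1 in state 0, action 0 in state 1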


def optimal_value(n_states, n_actions, transition_probabilities, reward,
                  discount, threshold=1e-2):
    """
    Find the optimal value function.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probabilities: NumPy array mapping (state, action, next_state)
        to transition probabilities.
    reward: Vector of rewards for each state.
    discount: MDP discount factor. float.
    threshold: Convergence threshold, default 1e-2. float.
    -> Array of values for each state
    """
    v = np.zeros(n_states)

    diff = float("inf")
    while diff > threshold:
        diff = 0
        for s in range(n_states):
            max_v = float("-inf")
            for a in range(n_actions):
                # Bellman optimality backup: keep the best action value.
                tp = transition_probabilities[s, a, :]
                max_v = max(max_v, np.dot(tp, reward + discount*v))

            new_diff = abs(v[s] - max_v)
            if new_diff > diff:
                diff = new_diff
            v[s] = max_v

    return v
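
# Illustrative usage sketch (an addition): for the hypothetical MDP sketched
# above, the optimal state values follow directly, and can be cross-checked
# against value() applied to the greedy policy returned by
# find_policy(..., stochastic=False).
#
#     >>> v_star = optimal_value(2, 2, P, r, 0.9)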


def find_policy(n_states, n_actions, transition_probabilities, reward, discount,
                threshold=1e-2, v=None, stochastic=True):
    """
    Find the optimal policy.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probabilities: NumPy array mapping (state, action, next_state)
        to transition probabilities.
    reward: Vector of rewards for each state.
    discount: MDP discount factor. float.
    threshold: Convergence threshold, default 1e-2. float.
    v: Value function (if known). Default None.
    stochastic: Whether the policy should be stochastic. Default True.
    -> Action probabilities for each state or action int for each state
       (depending on stochasticity).
    """
    if v is None:
        v = optimal_value(n_states, n_actions, transition_probabilities, reward,
                          discount, threshold)

    if stochastic:
        # Get Q using equation 9.2 from Ziebart's thesis.
        Q = np.zeros((n_states, n_actions))
        for i in range(n_states):
            for j in range(n_actions):
                p = transition_probabilities[i, j, :]
                Q[i, j] = p.dot(reward + discount*v)
        Q -= Q.max(axis=1).reshape((n_states, 1))  # For numerical stability.
        # Softmax over actions gives a Boltzmann (stochastic) policy.
        Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1))
        return Q

    def _policy(s):
        # Greedy (deterministic) policy: pick the action with the highest
        # expected return under the value function v.
        return max(range(n_actions),
                   key=lambda a: sum(transition_probabilities[s, a, k] *
                                     (reward[k] + discount * v[k])
                                     for k in range(n_states)))
    policy = np.array([_policy(s) for s in range(n_states)])
    return policy
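

if __name__ == "__main__":
    # Illustrative demo (an addition, not part of the original module): a
    # hypothetical three-state chain MDP in which action 0 stays put and
    # action 1 moves one state to the right; only the last state is rewarded.
    n_states, n_actions = 3, 2
    P = np.zeros((n_states, n_actions, n_states))  # (state, action, next_state)
    for s in range(n_states):
        P[s, 0, s] = 1.0                            # action 0: stay
        P[s, 1, min(s + 1, n_states - 1)] = 1.0     # action 1: move right
    r = np.array([0.0, 0.0, 1.0])
    discount = 0.9

    v_star = optimal_value(n_states, n_actions, P, r, discount)
    greedy = find_policy(n_states, n_actions, P, r, discount,
                         v=v_star, stochastic=False)
    soft = find_policy(n_states, n_actions, P, r, discount,
                       v=v_star, stochastic=True)

    print("optimal values:", v_star)
    print("greedy policy:", greedy)
    print("stochastic policy:\n", soft)
    print("value of the greedy policy:", value(greedy, n_states, P, r, discount))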