
Commit 4f1eb25

ad71 authored and norvig committed
Added POMDP-value-iteration (#929)
* Added POMDP value iteration
* Added plot_pomdp_utility function
* Added tests for pomdp-value-iteration
* Updated README.md
* Fixed notebook import
* Changed colors
* Added notebook sections for POMDP and pomdp_value_iteration
* Fixed notebook parsing error
* Replace pomdp.ipynb
* Updated README.md
* Fixed line endings
* Fixed line endings
* Fixed line endings
* Fixed line endings
* Removed numpy dependency
* Added docstrings
* Fix tests
* Added a test for pomdp_value_iteration
* Remove numpy dependencies from mdp.ipynb
* Added POMDP to mdp_apps.ipynb
1 parent 68327a8 commit 4f1eb25

File tree

7 files changed: +1418 −254 lines changed


README.md

+1 −1
@@ -131,7 +131,7 @@ Here is a table of algorithms, the figure, name of the algorithm in the book and
 | 16.9 | Information-Gathering-Agent | | | | |
 | 17.4 | Value-Iteration | `value_iteration` | [`mdp.py`][mdp] | Done | Included |
 | 17.7 | Policy-Iteration | `policy_iteration` | [`mdp.py`][mdp] | Done | Included |
-| 17.9 | POMDP-Value-Iteration | | | | |
+| 17.9 | POMDP-Value-Iteration | `pomdp_value_iteration` | [`mdp.py`][mdp] | Done | Included |
 | 18.5 | Decision-Tree-Learning | `DecisionTreeLearner` | [`learning.py`][learning] | Done | Included |
 | 18.8 | Cross-Validation | `cross_validation` | [`learning.py`][learning] | | |
 | 18.11 | Decision-List-Learning | `DecisionListLearner` | [`learning.py`][learning]\* | | |

mdp.ipynb

+772 −6
Large diffs are not rendered by default.
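
The mdp.ipynb diff is collapsed above, so as quick orientation: the new code implements POMDP value iteration over conditional plans. Its backup step, restated here from AIMA Section 17.4 in the book's notation (α_p is a plan's value vector, a its first action, e a percept; these symbols are not identifiers from the diff), is:

```latex
% Value of executing conditional plan p (first action a, subplan p.e for percept e)
% when starting in physical state s:
\alpha_p(s) \;=\; R(s) \;+\; \gamma \Bigl( \sum_{s'} P(s' \mid s, a) \sum_{e} P(e \mid s')\, \alpha_{p.e}(s') \Bigr)
```

In the mdp.py changes below, `Matrix.matmul` and `Matrix.multiply` evaluate this backup for each action over every pair of current value vectors (one candidate sub-plan per percept), and `remove_dominated_plans` / `remove_dominated_plans_fast` discard plans whose vectors never reach the upper surface of the utility over belief space.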

mdp.py

+208 −1
@@ -9,6 +9,8 @@
 from utils import argmax, vector_add, orientations, turn_right, turn_left

 import random
+import numpy as np
+from collections import defaultdict


 class MDP:
@@ -51,11 +53,13 @@ def __init__(self, init, actlist, terminals, transitions=None, reward=None, stat

     def R(self, state):
         """Return a numeric reward for this state."""
+
         return self.reward[state]

     def T(self, state, action):
         """Transition model. From a state and an action, return a list
         of (probability, result-state) pairs."""
+
         if not self.transitions:
             raise ValueError("Transition model is missing")
         else:
@@ -65,6 +69,7 @@ def actions(self, state):
         """Return a list of actions that can be performed in this state. By default, a
         fixed list of actions, except for terminal states. Override this
         method if you need to specialize by state."""
+
         if state in self.terminals:
             return [None]
         else:
@@ -106,7 +111,10 @@ def check_consistency(self):

 class MDP2(MDP):

-    """Inherits from MDP. Handles terminal states, and transitions to and from terminal states better."""
+    """
+    Inherits from MDP. Handles terminal states, and transitions to and from terminal states better.
+    """
+
     def __init__(self, init, actlist, terminals, transitions, reward=None, gamma=0.9):
         MDP.__init__(self, init, actlist, terminals, transitions, reward, gamma=gamma)

@@ -160,11 +168,13 @@ def T(self, state, action):

     def go(self, state, direction):
         """Return the state that results from going in this direction."""
+
         state1 = vector_add(state, direction)
         return state1 if state1 in self.states else state

     def to_grid(self, mapping):
         """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid."""
+
         return list(reversed([[mapping.get((x, y), None)
                                for x in range(self.cols)]
                               for y in range(self.rows)]))
@@ -190,6 +200,7 @@ def to_arrows(self, policy):

 def value_iteration(mdp, epsilon=0.001):
     """Solving an MDP by value iteration. [Figure 17.4]"""
+
     U1 = {s: 0 for s in mdp.states}
     R, T, gamma = mdp.R, mdp.T, mdp.gamma
     while True:
@@ -206,6 +217,7 @@ def value_iteration(mdp, epsilon=0.001):
 def best_policy(mdp, U):
     """Given an MDP and a utility function U, determine the best policy,
     as a mapping from state to action. (Equation 17.4)"""
+
     pi = {}
     for s in mdp.states:
         pi[s] = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
@@ -214,13 +226,15 @@ def best_policy(mdp, U):

 def expected_utility(a, s, U, mdp):
     """The expected utility of doing a in state s, according to the MDP and U."""
+
     return sum(p*U[s1] for (p, s1) in mdp.T(s, a))

 # ______________________________________________________________________________


 def policy_iteration(mdp):
     """Solve an MDP by policy iteration [Figure 17.7]"""
+
     U = {s: 0 for s in mdp.states}
     pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
     while True:
@@ -238,13 +252,206 @@ def policy_iteration(mdp):
 def policy_evaluation(pi, U, mdp, k=20):
     """Return an updated utility mapping U from each state in the MDP to its
     utility, using an approximation (modified policy iteration)."""
+
     R, T, gamma = mdp.R, mdp.T, mdp.gamma
     for i in range(k):
         for s in mdp.states:
             U[s] = R(s) + gamma*sum(p*U[s1] for (p, s1) in T(s, pi[s]))
     return U


+class POMDP(MDP):
+
+    """A Partially Observable Markov Decision Process, defined by
+    a transition model P(s'|s,a), actions A(s), a reward function R(s),
+    and a sensor model P(e|s). We also keep track of a gamma value,
+    for use by algorithms. The transition and the sensor models
+    are defined as matrices. We also keep track of the possible states
+    and actions for each state. [page 659]."""
+
+    def __init__(self, actions, transitions=None, evidences=None, rewards=None, states=None, gamma=0.95):
+        """Initialize variables of the pomdp"""
+
+        if not (0 < gamma <= 1):
+            raise ValueError('A POMDP must have 0 < gamma <= 1')
+
+        self.states = states
+        self.actions = actions
+
+        # transition model cannot be undefined
+        self.t_prob = transitions or {}
+        if not self.t_prob:
+            print('Warning: Transition model is undefined')
+
+        # sensor model cannot be undefined
+        self.e_prob = evidences or {}
+        if not self.e_prob:
+            print('Warning: Sensor model is undefined')
+
+        self.gamma = gamma
+        self.rewards = rewards
+
+    def remove_dominated_plans(self, input_values):
+        """
+        Remove dominated plans.
+        This method finds all the lines contributing to the
+        upper surface and removes those which don't.
+        """
+
+        values = [val for action in input_values for val in input_values[action]]
+        values.sort(key=lambda x: x[0], reverse=True)
+
+        best = [values[0]]
+        y1_max = max(val[1] for val in values)
+        tgt = values[0]
+        prev_b = 0
+        prev_ix = 0
+        while tgt[1] != y1_max:
+            min_b = 1
+            min_ix = 0
+            for i in range(prev_ix + 1, len(values)):
+                if values[i][0] - tgt[0] + tgt[1] - values[i][1] != 0:
+                    trans_b = (values[i][0] - tgt[0]) / (values[i][0] - tgt[0] + tgt[1] - values[i][1])
+                    if 0 <= trans_b <= 1 and trans_b > prev_b and trans_b < min_b:
+                        min_b = trans_b
+                        min_ix = i
+            prev_b = min_b
+            prev_ix = min_ix
+            tgt = values[min_ix]
+            best.append(tgt)
+
+        return self.generate_mapping(best, input_values)
+
+    def remove_dominated_plans_fast(self, input_values):
+        """
+        Remove dominated plans using approximations.
+        Resamples the upper boundary at intervals of 100 and
+        finds the maximum values at these points.
+        """
+
+        values = [val for action in input_values for val in input_values[action]]
+        values.sort(key=lambda x: x[0], reverse=True)
+
+        best = []
+        sr = 100
+        for i in range(sr + 1):
+            x = i / float(sr)
+            maximum = (values[0][1] - values[0][0]) * x + values[0][0]
+            tgt = values[0]
+            for value in values:
+                val = (value[1] - value[0]) * x + value[0]
+                if val > maximum:
+                    maximum = val
+                    tgt = value
+
+            if all(any(tgt != v) for v in best):
+                best.append(np.array(tgt))
+
+        return self.generate_mapping(best, input_values)
+
+    def generate_mapping(self, best, input_values):
+        """Generate mappings after removing dominated plans"""
+
+        mapping = defaultdict(list)
+        for value in best:
+            for action in input_values:
+                if any(all(value == v) for v in input_values[action]):
+                    mapping[action].append(value)
+
+        return mapping
+
+    def max_difference(self, U1, U2):
+        """Find maximum difference between two utility mappings"""
+
+        for k, v in U1.items():
+            sum1 = 0
+            for element in U1[k]:
+                sum1 += sum(element)
+            sum2 = 0
+            for element in U2[k]:
+                sum2 += sum(element)
+        return abs(sum1 - sum2)
+
+
+class Matrix:
+    """Matrix operations class"""
+
+    @staticmethod
+    def add(A, B):
+        """Add two matrices A and B"""
+
+        res = []
+        for i in range(len(A)):
+            row = []
+            for j in range(len(A[0])):
+                row.append(A[i][j] + B[i][j])
+            res.append(row)
+        return res
+
+    @staticmethod
+    def scalar_multiply(a, B):
+        """Multiply scalar a to matrix B"""
+
+        for i in range(len(B)):
+            for j in range(len(B[0])):
+                B[i][j] = a * B[i][j]
+        return B
+
+    @staticmethod
+    def multiply(A, B):
+        """Multiply two matrices A and B element-wise"""
+
+        matrix = []
+        for i in range(len(B)):
+            row = []
+            for j in range(len(B[0])):
+                row.append(B[i][j] * A[j][i])
+            matrix.append(row)
+
+        return matrix
+
+    @staticmethod
+    def matmul(A, B):
+        """Inner-product of two matrices"""
+
+        return [[sum(ele_a*ele_b for ele_a, ele_b in zip(row_a, col_b)) for col_b in list(zip(*B))] for row_a in A]
+
+    @staticmethod
+    def transpose(A):
+        """Transpose a matrix"""
+
+        return [list(i) for i in zip(*A)]
+
+
+def pomdp_value_iteration(pomdp, epsilon=0.1):
+    """Solving a POMDP by value iteration."""
+
+    U = {'':[[0]* len(pomdp.states)]}
+    count = 0
+    while True:
+        count += 1
+        prev_U = U
+        values = [val for action in U for val in U[action]]
+        value_matxs = []
+        for i in values:
+            for j in values:
+                value_matxs.append([i, j])
+
+        U1 = defaultdict(list)
+        for action in pomdp.actions:
+            for u in value_matxs:
+                u1 = Matrix.matmul(Matrix.matmul(pomdp.t_prob[int(action)], Matrix.multiply(pomdp.e_prob[int(action)], Matrix.transpose(u))), [[1], [1]])
+                u1 = Matrix.add(Matrix.scalar_multiply(pomdp.gamma, Matrix.transpose(u1)), [pomdp.rewards[int(action)]])
+                U1[action].append(u1[0])
+
+        U = pomdp.remove_dominated_plans_fast(U1)
+        # replace with U = pomdp.remove_dominated_plans(U1) for accurate calculations
+
+        if count > 10:
+            if pomdp.max_difference(U, prev_U) < epsilon * (1 - pomdp.gamma) / pomdp.gamma:
+                return U
+
+
 __doc__ += """
 >>> pi = best_policy(sequential_decision_environment, value_iteration(sequential_decision_environment, .01))

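For anyone who wants to exercise the new API, here is a minimal, hypothetical usage sketch. The constructor keywords and the digit-string action labels follow the diff above (`pomdp_value_iteration` indexes `t_prob`, `e_prob`, and `rewards` with `int(action)`); the two-state transition, sensor, and reward numbers, and the assumed matrix layouts noted in the comments, are invented for illustration and are not taken from this commit or its tests.

```python
from mdp import POMDP, pomdp_value_iteration

# Hypothetical two-state, two-action, two-percept POMDP (illustrative numbers only).
# Assumed layout t_prob[a][s][s']: transition model P(s' | s, a) for each action a.
t_prob = [[[0.65, 0.35], [0.65, 0.35]],
          [[0.85, 0.15], [0.85, 0.15]]]
# Assumed layout e_prob[a][s][e]: sensor model P(e | s) used after taking action a.
e_prob = [[[0.5, 0.5], [0.5, 0.5]],
          [[0.8, 0.2], [0.3, 0.7]]]
# Assumed layout rewards[a][s]: reward received in state s under action a.
rewards = [[5, -10], [-20, 10]]

# Actions are digit strings because pomdp_value_iteration calls int(action)
# to index the transition, sensor, and reward models.
pomdp = POMDP(actions=('0', '1'), transitions=t_prob, evidences=e_prob,
              rewards=rewards, states=('0', '1'), gamma=0.95)

# Returns a mapping from action to its undominated plan-value vectors.
utility = pomdp_value_iteration(pomdp, epsilon=3)
for action, vectors in utility.items():
    print(action, len(vectors), 'undominated value vector(s)')
```

Swapping `remove_dominated_plans_fast` for `remove_dominated_plans`, as the in-code comment suggests, trades runtime for exact pruning.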