from utils import argmax, vector_add, orientations, turn_right, turn_left

import random
+import numpy as np
+from collections import defaultdict


class MDP:
@@ -51,11 +53,13 @@ def __init__(self, init, actlist, terminals, transitions=None, reward=None, stat

    def R(self, state):
        """Return a numeric reward for this state."""
+
        return self.reward[state]

    def T(self, state, action):
        """Transition model. From a state and an action, return a list
        of (probability, result-state) pairs."""
+
        if not self.transitions:
            raise ValueError("Transition model is missing")
        else:
@@ -65,6 +69,7 @@ def actions(self, state):
        """Return a list of actions that can be performed in this state. By default, a
        fixed list of actions, except for terminal states. Override this
        method if you need to specialize by state."""
+
        if state in self.terminals:
            return [None]
        else:
@@ -106,7 +111,10 @@ def check_consistency(self):

class MDP2(MDP):

-    """Inherits from MDP. Handles terminal states, and transitions to and from terminal states better."""
+    """
+    Inherits from MDP. Handles terminal states, and transitions to and from terminal states better.
+    """
+
    def __init__(self, init, actlist, terminals, transitions, reward=None, gamma=0.9):
        MDP.__init__(self, init, actlist, terminals, transitions, reward, gamma=gamma)

@@ -160,11 +168,13 @@ def T(self, state, action):

    def go(self, state, direction):
        """Return the state that results from going in this direction."""
+
        state1 = vector_add(state, direction)
        return state1 if state1 in self.states else state

    def to_grid(self, mapping):
        """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid."""
+
        return list(reversed([[mapping.get((x, y), None)
                               for x in range(self.cols)]
                              for y in range(self.rows)]))
@@ -190,6 +200,7 @@ def to_arrows(self, policy):

def value_iteration(mdp, epsilon=0.001):
    """Solving an MDP by value iteration. [Figure 17.4]"""
+
    U1 = {s: 0 for s in mdp.states}
    R, T, gamma = mdp.R, mdp.T, mdp.gamma
    while True:
@@ -206,6 +217,7 @@ def value_iteration(mdp, epsilon=0.001):
def best_policy(mdp, U):
    """Given an MDP and a utility function U, determine the best policy,
    as a mapping from state to action. (Equation 17.4)"""
+
    pi = {}
    for s in mdp.states:
        pi[s] = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
@@ -214,13 +226,15 @@ def best_policy(mdp, U):

def expected_utility(a, s, U, mdp):
    """The expected utility of doing a in state s, according to the MDP and U."""
+
    return sum(p * U[s1] for (p, s1) in mdp.T(s, a))


# ______________________________________________________________________________


def policy_iteration(mdp):
    """Solve an MDP by policy iteration [Figure 17.7]"""
+
    U = {s: 0 for s in mdp.states}
    pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
    while True:
@@ -238,13 +252,206 @@ def policy_iteration(mdp):
def policy_evaluation(pi, U, mdp, k=20):
    """Return an updated utility mapping U from each state in the MDP to its
    utility, using an approximation (modified policy iteration)."""
+
    R, T, gamma = mdp.R, mdp.T, mdp.gamma
    for i in range(k):
        for s in mdp.states:
            U[s] = R(s) + gamma * sum(p * U[s1] for (p, s1) in T(s, pi[s]))
    return U


+class POMDP(MDP):
+
+    """A Partially Observable Markov Decision Process, defined by
+    a transition model P(s'|s,a), actions A(s), a reward function R(s),
+    and a sensor model P(e|s). We also keep track of a gamma value,
+    for use by algorithms. The transition and the sensor models
+    are defined as matrices. We also keep track of the possible states
+    and actions for each state. [page 659]."""
+
+    def __init__(self, actions, transitions=None, evidences=None, rewards=None, states=None, gamma=0.95):
+        """Initialize variables of the POMDP."""
+
+        if not (0 < gamma <= 1):
+            raise ValueError('A POMDP must have 0 < gamma <= 1')
+
+        self.states = states
+        self.actions = actions
+
+        # the transition model should be provided; warn if it is missing
+        self.t_prob = transitions or {}
+        if not self.t_prob:
+            print('Warning: Transition model is undefined')
+
+        # the sensor model should be provided; warn if it is missing
+        self.e_prob = evidences or {}
+        if not self.e_prob:
+            print('Warning: Sensor model is undefined')
+
+        self.gamma = gamma
+        self.rewards = rewards
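+
+        # As used by pomdp_value_iteration below, t_prob and e_prob are
+        # indexed by int(action), each entry being a matrix over the states,
+        # and rewards[int(action)] is a per-state reward vector.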
+
+    def remove_dominated_plans(self, input_values):
+        """
+        Remove dominated plans.
+        This method finds all the lines contributing to the
+        upper surface and removes those which don't.
+        """
+
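+        # Each value vector [v0, v1] defines a line over the belief b in [0, 1]
+        # with utility (1 - b) * v0 + b * v1; the sweep below walks the upper
+        # envelope from b = 0 (the line with the largest v0) towards b = 1,
+        # keeping only the plans that are maximal on some belief interval.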
+        values = [val for action in input_values for val in input_values[action]]
+        values.sort(key=lambda x: x[0], reverse=True)
+
+        best = [values[0]]
+        y1_max = max(val[1] for val in values)
+        tgt = values[0]
+        prev_b = 0
+        prev_ix = 0
+        while tgt[1] != y1_max:
+            min_b = 1
+            min_ix = 0
+            for i in range(prev_ix + 1, len(values)):
+                if values[i][0] - tgt[0] + tgt[1] - values[i][1] != 0:
+                    trans_b = (values[i][0] - tgt[0]) / (values[i][0] - tgt[0] + tgt[1] - values[i][1])
+                    if 0 <= trans_b <= 1 and prev_b < trans_b < min_b:
+                        min_b = trans_b
+                        min_ix = i
+            prev_b = min_b
+            prev_ix = min_ix
+            tgt = values[min_ix]
+            best.append(tgt)
+
+        return self.generate_mapping(best, input_values)
+
+    def remove_dominated_plans_fast(self, input_values):
+        """
+        Remove dominated plans using an approximation.
+        Samples the upper boundary at sr + 1 evenly spaced belief points
+        and keeps the plans that are maximal at those points.
+        """
+
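+        # Approximation note: only the sampled beliefs are checked, so a plan
+        # that dominates on a very narrow belief interval may be missed;
+        # remove_dominated_plans above is exact but slower.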
+        values = [val for action in input_values for val in input_values[action]]
+        values.sort(key=lambda x: x[0], reverse=True)
+
+        best = []
+        sr = 100
+        for i in range(sr + 1):
+            x = i / float(sr)
+            maximum = (values[0][1] - values[0][0]) * x + values[0][0]
+            tgt = values[0]
+            for value in values:
+                val = (value[1] - value[0]) * x + value[0]
+                if val > maximum:
+                    maximum = val
+                    tgt = value
+
+            if all(any(tgt != v) for v in best):
+                best.append(np.array(tgt))
+
+        return self.generate_mapping(best, input_values)
+
+    def generate_mapping(self, best, input_values):
+        """Generate mappings after removing dominated plans"""
+
+        mapping = defaultdict(list)
+        for value in best:
+            for action in input_values:
+                if any(all(value == v) for v in input_values[action]):
+                    mapping[action].append(value)
+
+        return mapping
+
+    def max_difference(self, U1, U2):
+        """Find the maximum difference between two utility mappings."""
+
+        max_diff = 0
+        for k in U1:
+            sum1 = 0
+            for element in U1[k]:
+                sum1 += sum(element)
+            sum2 = 0
+            for element in U2.get(k, []):
+                sum2 += sum(element)
+            max_diff = max(max_diff, abs(sum1 - sum2))
+        return max_diff
+
+
+class Matrix:
+    """Matrix operations class"""
+
+    @staticmethod
+    def add(A, B):
+        """Add two matrices A and B"""
+
+        res = []
+        for i in range(len(A)):
+            row = []
+            for j in range(len(A[0])):
+                row.append(A[i][j] + B[i][j])
+            res.append(row)
+        return res
+
+    @staticmethod
+    def scalar_multiply(a, B):
+        """Multiply matrix B by scalar a (in place)"""
+
+        for i in range(len(B)):
+            for j in range(len(B[0])):
+                B[i][j] = a * B[i][j]
+        return B
+
+    @staticmethod
+    def multiply(A, B):
+        """Element-wise product of B with the transpose of A (B[i][j] * A[j][i])"""
+
+        matrix = []
+        for i in range(len(B)):
+            row = []
+            for j in range(len(B[0])):
+                row.append(B[i][j] * A[j][i])
+            matrix.append(row)
+
+        return matrix
+
+    @staticmethod
+    def matmul(A, B):
+        """Matrix product of A and B"""
+
+        return [[sum(ele_a * ele_b for ele_a, ele_b in zip(row_a, col_b)) for col_b in list(zip(*B))] for row_a in A]
+
+    @staticmethod
+    def transpose(A):
+        """Transpose a matrix"""
+
+        return [list(i) for i in zip(*A)]
+
+
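+# A few quick checks of the helpers above (illustrative):
+#     Matrix.matmul([[1, 2]], [[3], [4]])   -> [[11]]
+#     Matrix.transpose([[1, 2], [3, 4]])    -> [[1, 3], [2, 4]]
+#     Matrix.scalar_multiply(2, [[1, 2]])   -> [[2, 4]]  (modifies B in place)
+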
+def pomdp_value_iteration(pomdp, epsilon=0.1):
+    """Solving a POMDP by value iteration."""
+
+    U = {'': [[0] * len(pomdp.states)]}
+    count = 0
+    while True:
+        count += 1
+        prev_U = U
+        values = [val for action in U for val in U[action]]
+        value_matxs = []
+        for i in values:
+            for j in values:
+                value_matxs.append([i, j])
+
+        U1 = defaultdict(list)
+        for action in pomdp.actions:
+            for u in value_matxs:
+                u1 = Matrix.matmul(Matrix.matmul(pomdp.t_prob[int(action)], Matrix.multiply(pomdp.e_prob[int(action)], Matrix.transpose(u))), [[1], [1]])
+                u1 = Matrix.add(Matrix.scalar_multiply(pomdp.gamma, Matrix.transpose(u1)), [pomdp.rewards[int(action)]])
+                U1[action].append(u1[0])
+
+        U = pomdp.remove_dominated_plans_fast(U1)
+        # replace with U = pomdp.remove_dominated_plans(U1) for accurate calculations
+
+        if count > 10:
+            if pomdp.max_difference(U, prev_U) < epsilon * (1 - pomdp.gamma) / pomdp.gamma:
+                return U
+
+
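+# Hypothetical usage sketch (the numbers below are illustrative only; note that
+# the update above hard-codes [[1], [1]], i.e. it assumes a two-state POMDP):
+#
+#     t_prob = [[[0.7, 0.3], [0.4, 0.6]], [[0.5, 0.5], [0.5, 0.5]]]
+#     e_prob = [[[0.8, 0.2], [0.3, 0.7]], [[0.8, 0.2], [0.3, 0.7]]]
+#     rewards = [[0.0, 1.0], [1.0, 0.0]]
+#     pomdp = POMDP(actions={'0', '1'}, transitions=t_prob, evidences=e_prob,
+#                   rewards=rewards, states=('s0', 's1'), gamma=0.95)
+#     utility = pomdp_value_iteration(pomdp, epsilon=0.1)
+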
__doc__ += """
>>> pi = best_policy(sequential_decision_environment, value_iteration(sequential_decision_environment, .01))