"""
File to complete. Contains the agents
"""
import numpy as np
import math
class Agent(object):
"""Agent base class. DO NOT MODIFY THIS CLASS
"""
def __init__(self, mdp):
# Init the random policy
super(Agent, self).__init__()
self.policy = np.zeros((4, mdp.size[0], mdp.size[1])) + 0.25
self.mdp = mdp
self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
self.discount = 0.9
self.V = np.zeros(mdp.size)
# For others: Q values
self.Q = np.zeros((4, mdp.size[0], mdp.size[1]))
def update(self):
# DO NOT MODIFY
raise NotImplementedError
def action(self):
self.last_position = self.mdp.position
return self.actions[np.random.choice(range(len(self.actions)),
p=self.policy[:, self.last_position[0],
self.last_position[1]])]


class VEvalTemporalDifferencing(Agent):
    def __init__(self, mdp):
        super(VEvalTemporalDifferencing, self).__init__(mdp)
        self.learning_rate = 0.1

    def update(self):
        # If the last episode has finished, the reward is the one from that episode
        reward = self.mdp.get_last_reward()
        # TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
        self.V[self.last_position] += self.learning_rate * \
            (reward + self.discount * self.V[self.mdp.position]
             - self.V[self.last_position])

    def action(self):
        # YOU CAN MODIFY
        return super(VEvalTemporalDifferencing, self).action()


class VEvalMonteCarlo(Agent):
    def __init__(self, mdp):
        super(VEvalMonteCarlo, self).__init__(mdp)

    def update(self):
        # TO IMPLEMENT
        raise NotImplementedError

    def action(self):
        # YOU CAN MODIFY
        return super(VEvalMonteCarlo, self).action()
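

# A minimal sketch (not part of the assignment template) of how a Monte Carlo
# state-value update could look. It assumes the MDP exposes an `episode_over`
# flag marking the end of an episode; that attribute is hypothetical and is not
# defined anywhere in this file.
class _VEvalMonteCarloSketch(VEvalMonteCarlo):
    def __init__(self, mdp):
        super(_VEvalMonteCarloSketch, self).__init__(mdp)
        self.episode = []                       # (state, reward) pairs of the running episode
        self.visit_count = np.zeros(mdp.size)   # how many returns each state has averaged

    def update(self):
        reward = self.mdp.get_last_reward()
        self.episode.append((self.last_position, reward))
        if getattr(self.mdp, 'episode_over', False):   # hypothetical flag
            returns = 0.0
            # Walk the episode backwards, accumulating the discounted return,
            # and keep a running average of the returns observed per state.
            for state, r in reversed(self.episode):
                returns = r + self.discount * returns
                self.visit_count[state] += 1
                self.V[state] += (returns - self.V[state]) / self.visit_count[state]
            self.episode = []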


class ValueIteration(Agent):
    def __init__(self, mdp):
        super(ValueIteration, self).__init__(mdp)

    def update(self):
        # TO IMPLEMENT
        reward = self.mdp.get_last_reward()
        # Placeholder value left in the template; replace it with the Bellman backup
        self.V[self.last_position] = reward + self.discount * 99999999
        raise NotImplementedError

    def action(self):
        # YOU CAN MODIFY
        return super(ValueIteration, self).action()
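

# A minimal sketch of one synchronous value-iteration sweep, for illustration only.
# It assumes a known, deterministic model through the hypothetical helpers
# `self.mdp.next_state(state, action)` and `self.mdp.reward(state)`, which are NOT
# part of the MDP interface used above; it just spells out the Bellman optimality
# backup V(s) = max_a [r(s) + gamma * V(s')].
class _ValueIterationSketch(ValueIteration):
    def update(self):
        new_V = np.zeros(self.mdp.size)
        for i in range(self.mdp.size[0]):
            for j in range(self.mdp.size[1]):
                # Evaluate every action and keep the best backed-up value
                backups = []
                for a in self.actions:
                    next_state = self.mdp.next_state((i, j), a)        # hypothetical
                    backups.append(self.mdp.reward((i, j)) +           # hypothetical
                                   self.discount * self.V[next_state])
                new_V[i, j] = max(backups)
        self.V = new_V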


class PolicyIteration(Agent):
    def __init__(self, mdp):
        super(PolicyIteration, self).__init__(mdp)

    def update(self):
        # TO IMPLEMENT
        raise NotImplementedError

    def action(self):
        # YOU CAN MODIFY
        return super(PolicyIteration, self).action()
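

# A minimal sketch of policy iteration under the same hypothetical deterministic
# model helpers as the value-iteration sketch (`next_state`, `reward`). It only
# illustrates the evaluate-then-improve loop, not the expected assignment solution.
class _PolicyIterationSketch(PolicyIteration):
    def update(self):
        # Policy evaluation: a few sweeps of the Bellman expectation backup
        for _ in range(10):
            for i in range(self.mdp.size[0]):
                for j in range(self.mdp.size[1]):
                    value = 0.0
                    for a_idx, a in enumerate(self.actions):
                        next_state = self.mdp.next_state((i, j), a)    # hypothetical
                        value += self.policy[a_idx, i, j] * (
                            self.mdp.reward((i, j)) +                  # hypothetical
                            self.discount * self.V[next_state])
                    self.V[i, j] = value
        # Policy improvement: make the policy greedy with respect to V
        for i in range(self.mdp.size[0]):
            for j in range(self.mdp.size[1]):
                backups = [self.mdp.reward((i, j)) +
                           self.discount * self.V[self.mdp.next_state((i, j), a)]
                           for a in self.actions]
                best = int(np.argmax(backups))
                self.policy[:, i, j] = 0.0
                self.policy[best, i, j] = 1.0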


class QLearning(Agent):
    def __init__(self, mdp):
        super(QLearning, self).__init__(mdp)

    def update(self):
        # TO IMPLEMENT
        raise NotImplementedError

    def action(self):
        # YOU CAN MODIFY
        return super(QLearning, self).action()
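

# A minimal sketch of tabular Q-learning built only on the attributes defined in
# Agent. The learning rate, the epsilon, and the bookkeeping of the last chosen
# action are my additions and are not prescribed by the template.
class _QLearningSketch(QLearning):
    def __init__(self, mdp):
        super(_QLearningSketch, self).__init__(mdp)
        self.learning_rate = 0.1
        self.epsilon = 0.1
        self.last_action = 0

    def update(self):
        reward = self.mdp.get_last_reward()
        s, s_next, a = self.last_position, self.mdp.position, self.last_action
        # Off-policy TD target: bootstrap on the greedy value of the next state
        target = reward + self.discount * np.max(self.Q[:, s_next[0], s_next[1]])
        self.Q[a, s[0], s[1]] += self.learning_rate * (target - self.Q[a, s[0], s[1]])

    def action(self):
        self.last_position = self.mdp.position
        # Epsilon-greedy selection over the learned Q values
        if np.random.rand() < self.epsilon:
            self.last_action = int(np.random.choice(len(self.actions)))
        else:
            self.last_action = int(np.argmax(
                self.Q[:, self.last_position[0], self.last_position[1]]))
        return self.actions[self.last_action]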


class SARSA(Agent):
    def __init__(self, mdp):
        super(SARSA, self).__init__(mdp)

    def update(self):
        # TO IMPLEMENT
        raise NotImplementedError

    def action(self):
        # YOU CAN MODIFY
        return super(SARSA, self).action()
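

# A minimal sketch of SARSA in the same spirit as the Q-learning sketch above; the
# learning rate, epsilon, and action bookkeeping are again my additions. The update
# is on-policy: it bootstraps on the action that will actually be taken next.
class _SARSASketch(SARSA):
    def __init__(self, mdp):
        super(_SARSASketch, self).__init__(mdp)
        self.learning_rate = 0.1
        self.epsilon = 0.1
        self.last_action = 0
        self.next_action = 0

    def _choose(self, position):
        # Epsilon-greedy selection from the current Q values
        if np.random.rand() < self.epsilon:
            return int(np.random.choice(len(self.actions)))
        return int(np.argmax(self.Q[:, position[0], position[1]]))

    def update(self):
        reward = self.mdp.get_last_reward()
        s, s_next = self.last_position, self.mdp.position
        # Pick the next action now so the target uses the action the policy will take
        self.next_action = self._choose(s_next)
        target = reward + self.discount * self.Q[self.next_action, s_next[0], s_next[1]]
        self.Q[self.last_action, s[0], s[1]] += self.learning_rate * \
            (target - self.Q[self.last_action, s[0], s[1]])

    def action(self):
        self.last_position = self.mdp.position
        self.last_action = self.next_action
        return self.actions[self.last_action]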