base_agent.py
import numpy as np
import numpy.random as npr
import neuronav.utils as utils


class BaseAgent:
    """
    Parent class for Agents which concrete implementations inherit from.
    """

    def __init__(
        self,
        state_size: int,
        action_size: int,
        lr: float = 1e-1,
        gamma: float = 0.99,
        poltype: str = "softmax",
        beta: float = 1e4,
        epsilon: float = 1e-1,
    ):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        self.beta = beta
        self.gamma = gamma
        self.poltype = poltype
        self.num_updates = 0
        self.epsilon = epsilon

    def base_sample_action(self, policy_logits):
        # Sample an action from the policy implied by the logits:
        # temperature-scaled softmax (beta) or epsilon-greedy.
        if self.poltype == "softmax":
            action = npr.choice(
                self.action_size, p=utils.softmax(self.beta * policy_logits)
            )
        else:
            if npr.rand() < self.epsilon:
                action = npr.choice(self.action_size)
            else:
                action = np.argmax(policy_logits)
        return action

    def update(self, current_exp):
        # Track the number of updates and delegate to the subclass-specific _update.
        self.num_updates += 1
        error = self._update(current_exp)
        return error

    def base_get_policy(self, policy_logits):
        # Return the full action distribution over all states.
        if self.poltype == "softmax":
            policy = utils.softmax(self.beta * policy_logits, axis=0)
        else:
            mask = policy_logits == policy_logits.max(0)
            greedy = mask / mask.sum(0)
            policy = (1 - self.epsilon) * greedy + (
                1 / self.action_size
            ) * self.epsilon * np.ones((self.action_size, self.state_size))
        return policy

    def discount(self, rewards, gamma):
        # Convert a reward sequence into discounted returns (in place).
        for i in range(len(rewards) - 2, -1, -1):
            rewards[i] += gamma * rewards[i + 1]
        return rewards

    def _update(self, current_exp):
        # To be overridden by concrete agents; should return the update error.
        return None

    def reset(self):
        # To be overridden by concrete agents to clear learned state.
        return None
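
For illustration only, below is a minimal sketch of a concrete agent built on top of BaseAgent. The TabularQAgent name, the assumed import path, and the (state, action, next_state, reward, done) layout of current_exp are assumptions made for this example, not part of this file.

# Hypothetical example (not part of base_agent.py): a minimal tabular
# Q-learning agent subclassing BaseAgent.
import numpy as np
from neuronav.agents.base_agent import BaseAgent  # assumed module path


class TabularQAgent(BaseAgent):
    def __init__(self, state_size, action_size, **kwargs):
        super().__init__(state_size, action_size, **kwargs)
        # Q-table with one row per action and one column per state,
        # matching the (action_size, state_size) layout used by base_get_policy.
        self.Q = np.zeros((action_size, state_size))

    def sample_action(self, state):
        # Reuse the shared sampling helper on this state's action values.
        return self.base_sample_action(self.Q[:, state])

    def _update(self, current_exp):
        # Assumed experience layout: (state, action, next_state, reward, done).
        state, action, next_state, reward, done = current_exp
        target = reward + (0.0 if done else self.gamma * self.Q[:, next_state].max())
        error = target - self.Q[action, state]
        self.Q[action, state] += self.lr * error
        return error

    def reset(self):
        self.Q[:] = 0.0

Because update and the action-sampling helpers live in the parent class, a subclass like this only has to supply its value table, the _update rule, and a reset.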