# losses.py
"""
All the components of the A2C (advantage actor-critic) RL loss.
"""
import tensorflow as tf


def negative_policy_entropy_loss(pols, mask=None, epsilon=1e-8):
    """
    Negative policy entropy, summed over time and averaged over the batch.
    Minimising this term maximises the policy's entropy (encourages exploration).
    Args:
        - pols: action probabilities, (T, B, pol_dim).
        - mask: (T, B, 1) or None.
        - epsilon: small constant for numerical stability of the log.
    Returns:
        - loss: scalar
    """
    if mask is None:
        mask = tf.ones([pols.shape[0], pols.shape[1], 1])
    mask = tf.stop_gradient(mask)
    loss_per_tstep_batch = tf.reduce_sum(mask*pols*tf.math.log(pols+epsilon), axis=2)  # (T, B)
    loss_per_batch = tf.reduce_sum(loss_per_tstep_batch, axis=0)  # (B,)
    return tf.reduce_mean(loss_per_batch)  # scalar


def value_loss(v_s, q_sa, mask=None):
    """
    0.5 * squared error between V(s) and the (stop-gradient) target Q(s,a),
    summed over time and averaged over the batch.
    Args:
        - v_s: V(s) as output by the agent. (T, B, 1).
        - q_sa: Q(s,a) = r(s,a) + V(s'). (T, B, 1).
        - mask: (T, B, 1) or None.
    Returns:
        - loss: scalar
    """
    if mask is None:
        mask = tf.ones([v_s.shape[0], q_sa.shape[1], 1])
    q_sa = tf.stop_gradient(q_sa)
    mask = tf.stop_gradient(mask)
    value_loss_per_batch = tf.reduce_sum(mask*tf.square(v_s - q_sa), axis=0)  # (B, 1)
    return 0.5 * tf.reduce_mean(value_loss_per_batch)  # scalar
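

# Illustrative helper (an assumption, not part of the original module): one way
# to build the `q_sa` target that value_loss expects from per-step rewards and
# bootstrapped next-state values. The discount factor `gamma` is assumed here;
# the docstring above writes Q(s,a) = r(s,a) + V(s') without one.
def one_step_q_target(rewards, next_values, gamma=0.99):
    """
    Args:
        - rewards: r(s,a). (T, B, 1).
        - next_values: V(s') as output by the agent. (T, B, 1).
        - gamma: assumed discount factor.
    Returns:
        - q_sa: (T, B, 1) target for value_loss.
    """
    return rewards + gamma * tf.stop_gradient(next_values)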


def advantage_policy_gradient_loss(pols, actions, advantages, mask=None, epsilon=1e-8):
    """
    Policy-gradient (actor) loss: -log pi(a|s) weighted by the (stop-gradient)
    advantages, summed over time and averaged over the batch.
    Args:
        - pols: action probabilities, (T, B, pol_dim).
        - actions: actions taken, typically one-hot, (T, B, pol_dim).
        - advantages: (T, B, 1).
        - mask: (T, B, 1) or None.
        - epsilon: small constant for numerical stability of the log.
    Returns:
        - loss: scalar
    """
    if mask is None:
        mask = tf.ones([pols.shape[0], pols.shape[1], 1])
    actions = tf.stop_gradient(actions)
    advantages = tf.stop_gradient(advantages)
    mask = tf.stop_gradient(mask)
    actions_logps = tf.reduce_sum(actions*tf.math.log(pols+epsilon), axis=-1, keepdims=True)  # (T, B, 1)
    actor_loss_per_batch = tf.reduce_sum(mask*advantages*-actions_logps, axis=0)  # (B, 1)
    return tf.reduce_mean(actor_loss_per_batch)  # scalar
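

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): combining the
# three components into a single A2C objective on dummy data. The discount
# factor `gamma` and the loss weights `value_coef` / `entropy_coef` are
# assumed values for demonstration, not taken from the original code.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    T, B, pol_dim = 5, 4, 3                # time steps, batch size, number of actions
    gamma = 0.99                           # assumed discount factor
    value_coef, entropy_coef = 0.5, 0.01   # assumed loss weights

    # Dummy rollout tensors with the shapes the losses above expect.
    logits = tf.random.normal([T, B, pol_dim])
    pols = tf.nn.softmax(logits, axis=-1)                       # (T, B, pol_dim)
    actions = tf.one_hot(
        tf.random.uniform([T, B], maxval=pol_dim, dtype=tf.int32), pol_dim
    )                                                           # (T, B, pol_dim)
    rewards = tf.random.normal([T, B, 1])                       # (T, B, 1)
    v_s = tf.random.normal([T, B, 1])                           # (T, B, 1), V(s)
    v_next = tf.random.normal([T, B, 1])                        # (T, B, 1), V(s')

    # Bootstrapped one-step target and advantage (discounting is an assumption).
    q_sa = rewards + gamma * v_next
    advantages = q_sa - v_s

    total_loss = (
        advantage_policy_gradient_loss(pols, actions, advantages)
        + value_coef * value_loss(v_s, q_sa)
        + entropy_coef * negative_policy_entropy_loss(pols)
    )
    print("total A2C loss:", float(total_loss))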