diff --git a/contents/10_A3C/A3C_RNN.py b/contents/10_A3C/A3C_RNN.py
index fbb72ed..9a42100 100644
--- a/contents/10_A3C/A3C_RNN.py
+++ b/contents/10_A3C/A3C_RNN.py
@@ -68,7 +68,7 @@ def __init__(self, scope, globalAC=None):
                 with tf.name_scope('a_loss'):
                     log_prob = normal_dist.log_prob(self.a_his)
                     exp_v = log_prob * td
-                    entropy = tf.stop_gradient(normal_dist.entropy())  # encourage exploration
+                    entropy = normal_dist.entropy()  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
                     self.a_loss = tf.reduce_mean(-self.exp_v)
 
diff --git a/contents/10_A3C/A3C_continuous_action.py b/contents/10_A3C/A3C_continuous_action.py
index 60043d7..61b1b9f 100644
--- a/contents/10_A3C/A3C_continuous_action.py
+++ b/contents/10_A3C/A3C_continuous_action.py
@@ -68,7 +68,7 @@ def __init__(self, scope, globalAC=None):
                 with tf.name_scope('a_loss'):
                     log_prob = normal_dist.log_prob(self.a_his)
                     exp_v = log_prob * td
-                    entropy = tf.stop_gradient(normal_dist.entropy())  # encourage exploration
+                    entropy = normal_dist.entropy()  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
                     self.a_loss = tf.reduce_mean(-self.exp_v)
 
diff --git a/contents/10_A3C/A3C_discrete_action.py b/contents/10_A3C/A3C_discrete_action.py
index 2ba7bce..489657f 100644
--- a/contents/10_A3C/A3C_discrete_action.py
+++ b/contents/10_A3C/A3C_discrete_action.py
@@ -61,8 +61,8 @@ def __init__(self, scope, globalAC=None):
                 with tf.name_scope('a_loss'):
                     log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
                     exp_v = log_prob * td
-                    entropy = tf.stop_gradient(-tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
-                                               axis=1, keep_dims=True))  # encourage exploration
+                    entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
+                                             axis=1, keep_dims=True)  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
                     self.a_loss = tf.reduce_mean(-self.exp_v)
 
diff --git a/contents/12_Proximal_Policy_Optimization/simply_PPO.py b/contents/12_Proximal_Policy_Optimization/simply_PPO.py
index 25464e9..bcb03e1 100644
--- a/contents/12_Proximal_Policy_Optimization/simply_PPO.py
+++ b/contents/12_Proximal_Policy_Optimization/simply_PPO.py
@@ -64,7 +64,7 @@ def __init__(self):
                 surr = ratio * self.tfadv
             if METHOD['name'] == 'kl_pen':
                 self.tflam = tf.placeholder(tf.float32, None, 'lambda')
-                kl = tf.stop_gradient(tf.distributions.kl_divergence(oldpi, pi))
+                kl = tf.distributions.kl_divergence(oldpi, pi)
                 self.kl_mean = tf.reduce_mean(kl)
                 self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
             else:   # clipping method, find this is better
diff --git a/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py
index 0de0ac7..c52c6d1 100644
--- a/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py
+++ b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py
@@ -65,7 +65,7 @@ def __init__(self, sess, n_features, action_bound, lr=0.0001):
             log_prob = self.normal_dist.log_prob(self.a)  # loss without advantage
             self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
             # Add cross entropy cost to encourage exploration
-            self.exp_v += tf.stop_gradient(0.1*self.normal_dist.entropy())
+            self.exp_v += 0.01*self.normal_dist.entropy()
 
         with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)    # min(v) = max(-v)
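
Note on the change: with tf.stop_gradient removed, the entropy bonus contributes gradients, so maximising ENTROPY_BETA * entropy actively resists the policy's sigma collapsing instead of merely shifting the loss by a constant. The snippet below is a minimal, standalone sketch, not part of this patch: TensorFlow 1.x is assumed, and mu, sigma, a_his and td are illustrative placeholders rather than the repo's exact graph. It only shows that the entropy term now produces a non-zero gradient on sigma.

# Minimal sketch (illustrative, not the repo's graph); assumes TensorFlow 1.x.
import tensorflow as tf

ENTROPY_BETA = 0.01

a_his = tf.placeholder(tf.float32, [None, 1], 'a_his')    # actions taken
td = tf.placeholder(tf.float32, [None, 1], 'td_error')    # advantage estimate
mu = tf.Variable([[0.0]], name='mu')                       # policy mean
sigma = tf.Variable([[1.0]], name='sigma')                 # policy std

normal_dist = tf.distributions.Normal(mu, sigma + 1e-4)
log_prob = normal_dist.log_prob(a_his)
exp_v = log_prob * td

# Without tf.stop_gradient, d(entropy)/d(sigma) = 1/sigma flows into the loss,
# so minimising a_loss also pushes sigma away from zero (more exploration).
entropy = normal_dist.entropy()
a_loss = tf.reduce_mean(-(ENTROPY_BETA * entropy + exp_v))

grad_mu, grad_sigma = tf.gradients(a_loss, [mu, sigma])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    g_sigma = sess.run(grad_sigma, {a_his: [[0.5]], td: [[1.0]]})
    print('d(a_loss)/d(sigma):', g_sigma)   # non-zero entropy pressure on sigma

The same reasoning applies to the KL-penalty branch in simply_PPO.py: the penalty term can only pull pi back towards oldpi if its gradient is allowed to flow into the actor loss.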