From e6e59bc160a150cd11233c5a035125486965d710 Mon Sep 17 00:00:00 2001
From: Morvan Zhou
Date: Thu, 18 Jan 2018 22:07:30 +1100
Subject: [PATCH] add stop gradient

---
 contents/10_A3C/A3C_RNN.py               | 2 +-
 contents/10_A3C/A3C_continuous_action.py | 6 +++---
 contents/10_A3C/A3C_discrete_action.py   | 2 +-
 contents/10_A3C/A3C_distributed_tf.py    | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/contents/10_A3C/A3C_RNN.py b/contents/10_A3C/A3C_RNN.py
index 9a42100..cdac4c8 100644
--- a/contents/10_A3C/A3C_RNN.py
+++ b/contents/10_A3C/A3C_RNN.py
@@ -67,7 +67,7 @@ def __init__(self, scope, globalAC=None):
 
                 with tf.name_scope('a_loss'):
                     log_prob = normal_dist.log_prob(self.a_his)
-                    exp_v = log_prob * td
+                    exp_v = log_prob * tf.stop_gradient(td)
                     entropy = normal_dist.entropy()  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
                     self.a_loss = tf.reduce_mean(-self.exp_v)
diff --git a/contents/10_A3C/A3C_continuous_action.py b/contents/10_A3C/A3C_continuous_action.py
index 61b1b9f..e2cd3d4 100644
--- a/contents/10_A3C/A3C_continuous_action.py
+++ b/contents/10_A3C/A3C_continuous_action.py
@@ -67,7 +67,7 @@ def __init__(self, scope, globalAC=None):
 
                 with tf.name_scope('a_loss'):
                     log_prob = normal_dist.log_prob(self.a_his)
-                    exp_v = log_prob * td
+                    exp_v = log_prob * tf.stop_gradient(td)
                     entropy = normal_dist.entropy()  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
                     self.a_loss = tf.reduce_mean(-self.exp_v)
@@ -124,8 +124,8 @@ def work(self):
             s = self.env.reset()
             ep_r = 0
             for ep_t in range(MAX_EP_STEP):
-                if self.name == 'W_0':
-                    self.env.render()
+                # if self.name == 'W_0':
+                #     self.env.render()
                 a = self.AC.choose_action(s)
                 s_, r, done, info = self.env.step(a)
                 done = True if ep_t == MAX_EP_STEP - 1 else False
diff --git a/contents/10_A3C/A3C_discrete_action.py b/contents/10_A3C/A3C_discrete_action.py
index e3209e9..8335e62 100644
--- a/contents/10_A3C/A3C_discrete_action.py
+++ b/contents/10_A3C/A3C_discrete_action.py
@@ -60,7 +60,7 @@ def __init__(self, scope, globalAC=None):
 
                 with tf.name_scope('a_loss'):
                     log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
-                    exp_v = log_prob * td
+                    exp_v = log_prob * tf.stop_gradient(td)
                     entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
                                              axis=1, keep_dims=True)  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
diff --git a/contents/10_A3C/A3C_distributed_tf.py b/contents/10_A3C/A3C_distributed_tf.py
index 20fd531..cde5169 100644
--- a/contents/10_A3C/A3C_distributed_tf.py
+++ b/contents/10_A3C/A3C_distributed_tf.py
@@ -49,7 +49,7 @@ def __init__(self, scope, opt_a=None, opt_c=None, global_net=None):
                 log_prob = tf.reduce_sum(
                     tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
-                exp_v = log_prob * td
+                exp_v = log_prob * tf.stop_gradient(td)
                 entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
                                          axis=1, keep_dims=True)  # encourage exploration
                 self.exp_v = ENTROPY_BETA * entropy + exp_v
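
For reference, a minimal standalone sketch of what wrapping td in tf.stop_gradient changes (assuming TensorFlow 1.x, which these scripts target; the placeholder shapes, layer sizes, and variable scope names below are illustrative only, not taken from the repo). With the raw td, the actor loss also produces gradients with respect to the critic's value network (because td = v_target - v); with tf.stop_gradient(td), those gradients are cut off and the critic is driven only by its own c_loss.

# Sketch only: TensorFlow 1.x assumed; shapes, sizes, and scope names are illustrative.
import tensorflow as tf

s = tf.placeholder(tf.float32, [None, 3], 'S')               # state input
v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')  # bootstrapped return

with tf.variable_scope('critic'):
    v = tf.layers.dense(s, 1)                                 # state value V(s)
td = tf.subtract(v_target, v, name='TD_error')

with tf.variable_scope('actor'):
    a_prob = tf.layers.dense(s, 2, tf.nn.softmax)             # toy policy head
log_prob = tf.log(a_prob[:, :1] + 1e-5)

a_loss_plain = tf.reduce_mean(-log_prob * td)                    # before this patch
a_loss_stop = tf.reduce_mean(-log_prob * tf.stop_gradient(td))   # after this patch

c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'critic')
# Without stop_gradient the actor loss has gradients w.r.t. the critic's weights
# (through v inside td); with stop_gradient those gradients are None.
print(tf.gradients(a_loss_plain, c_params))   # [<tf.Tensor ...>, <tf.Tensor ...>]
print(tf.gradients(a_loss_stop, c_params))    # [None, None]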