From e6e59bc160a150cd11233c5a035125486965d710 Mon Sep 17 00:00:00 2001
From: Morvan Zhou
Date: Thu, 18 Jan 2018 22:07:30 +1100
Subject: [PATCH] add stop gradient

---
 contents/10_A3C/A3C_RNN.py               | 2 +-
 contents/10_A3C/A3C_continuous_action.py | 6 +++---
 contents/10_A3C/A3C_discrete_action.py   | 2 +-
 contents/10_A3C/A3C_distributed_tf.py    | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/contents/10_A3C/A3C_RNN.py b/contents/10_A3C/A3C_RNN.py
index 9a42100..cdac4c8 100644
--- a/contents/10_A3C/A3C_RNN.py
+++ b/contents/10_A3C/A3C_RNN.py
@@ -67,7 +67,7 @@ def __init__(self, scope, globalAC=None):
 
                 with tf.name_scope('a_loss'):
                     log_prob = normal_dist.log_prob(self.a_his)
-                    exp_v = log_prob * td
+                    exp_v = log_prob * tf.stop_gradient(td)
                     entropy = normal_dist.entropy()  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
                     self.a_loss = tf.reduce_mean(-self.exp_v)
diff --git a/contents/10_A3C/A3C_continuous_action.py b/contents/10_A3C/A3C_continuous_action.py
index 61b1b9f..e2cd3d4 100644
--- a/contents/10_A3C/A3C_continuous_action.py
+++ b/contents/10_A3C/A3C_continuous_action.py
@@ -67,7 +67,7 @@ def __init__(self, scope, globalAC=None):
 
                 with tf.name_scope('a_loss'):
                     log_prob = normal_dist.log_prob(self.a_his)
-                    exp_v = log_prob * td
+                    exp_v = log_prob * tf.stop_gradient(td)
                     entropy = normal_dist.entropy()  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
                     self.a_loss = tf.reduce_mean(-self.exp_v)
@@ -124,8 +124,8 @@ def work(self):
             s = self.env.reset()
             ep_r = 0
             for ep_t in range(MAX_EP_STEP):
-                if self.name == 'W_0':
-                    self.env.render()
+                # if self.name == 'W_0':
+                #     self.env.render()
                 a = self.AC.choose_action(s)
                 s_, r, done, info = self.env.step(a)
                 done = True if ep_t == MAX_EP_STEP - 1 else False
diff --git a/contents/10_A3C/A3C_discrete_action.py b/contents/10_A3C/A3C_discrete_action.py
index e3209e9..8335e62 100644
--- a/contents/10_A3C/A3C_discrete_action.py
+++ b/contents/10_A3C/A3C_discrete_action.py
@@ -60,7 +60,7 @@ def __init__(self, scope, globalAC=None):
 
                 with tf.name_scope('a_loss'):
                     log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
-                    exp_v = log_prob * td
+                    exp_v = log_prob * tf.stop_gradient(td)
                     entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
                                              axis=1, keep_dims=True)  # encourage exploration
                     self.exp_v = ENTROPY_BETA * entropy + exp_v
diff --git a/contents/10_A3C/A3C_distributed_tf.py b/contents/10_A3C/A3C_distributed_tf.py
index 20fd531..cde5169 100644
--- a/contents/10_A3C/A3C_distributed_tf.py
+++ b/contents/10_A3C/A3C_distributed_tf.py
@@ -49,7 +49,7 @@ def __init__(self, scope, opt_a=None, opt_c=None, global_net=None):
                 log_prob = tf.reduce_sum(
                     tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
-                exp_v = log_prob * td
+                exp_v = log_prob * tf.stop_gradient(td)
                 entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
                                          axis=1, keep_dims=True)  # encourage exploration
                 self.exp_v = ENTROPY_BETA * entropy + exp_v
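
For reference, a minimal standalone sketch of what wrapping td in tf.stop_gradient changes (assuming TensorFlow 1.x, which these scripts target; the placeholder shapes, layer sizes, and variable scope names below are illustrative only, not taken from the repo). With the raw td, the actor loss also produces gradients with respect to the critic's value network (because td = v_target - v); with tf.stop_gradient(td), those gradients are cut off and the critic is driven only by its own c_loss.

# Sketch only: TensorFlow 1.x assumed; shapes, sizes, and scope names are illustrative.
import tensorflow as tf

s = tf.placeholder(tf.float32, [None, 3], 'S')               # state input
v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')  # bootstrapped return

with tf.variable_scope('critic'):
    v = tf.layers.dense(s, 1)                                 # state value V(s)
td = tf.subtract(v_target, v, name='TD_error')

with tf.variable_scope('actor'):
    a_prob = tf.layers.dense(s, 2, tf.nn.softmax)             # toy policy head
log_prob = tf.log(a_prob[:, :1] + 1e-5)

a_loss_plain = tf.reduce_mean(-log_prob * td)                    # before this patch
a_loss_stop = tf.reduce_mean(-log_prob * tf.stop_gradient(td))   # after this patch

c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'critic')
# Without stop_gradient the actor loss has gradients w.r.t. the critic's weights
# (through v inside td); with stop_gradient those gradients are None.
print(tf.gradients(a_loss_plain, c_params))   # [<tf.Tensor ...>, <tf.Tensor ...>]
print(tf.gradients(a_loss_stop, c_params))    # [None, None]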