From 30a82ec89aa6fc540474978622e11d00ba00fd4d Mon Sep 17 00:00:00 2001
From: morvanzhou
Date: Fri, 15 Jun 2018 12:57:08 +0800
Subject: [PATCH] add plt to show result

---
 experiments/Solve_BipedalWalker/A3C.py     |  8 ++++++--
 experiments/Solve_BipedalWalker/A3C_rnn.py | 19 ++++++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/experiments/Solve_BipedalWalker/A3C.py b/experiments/Solve_BipedalWalker/A3C.py
index 5cecde5..560aa95 100644
--- a/experiments/Solve_BipedalWalker/A3C.py
+++ b/experiments/Solve_BipedalWalker/A3C.py
@@ -28,7 +28,7 @@
 UPDATE_GLOBAL_ITER = 10
 GAMMA = 0.99
 ENTROPY_BETA = 0.005
-LR_A = 0.00002    # learning rate for actor
+LR_A = 0.00005    # learning rate for actor
 LR_C = 0.0001     # learning rate for critic
 GLOBAL_RUNNING_R = []
 GLOBAL_EP = 0
@@ -205,5 +205,9 @@ def work(self):
         t.start()
         worker_threads.append(t)
     COORD.join(worker_threads)
-
+    import matplotlib.pyplot as plt
+    plt.plot(GLOBAL_RUNNING_R)
+    plt.xlabel('episode')
+    plt.ylabel('global running reward')
+    plt.show()
diff --git a/experiments/Solve_BipedalWalker/A3C_rnn.py b/experiments/Solve_BipedalWalker/A3C_rnn.py
index 1028414..6768b2d 100644
--- a/experiments/Solve_BipedalWalker/A3C_rnn.py
+++ b/experiments/Solve_BipedalWalker/A3C_rnn.py
@@ -26,9 +26,9 @@
 MAX_GLOBAL_EP = 8000
 GLOBAL_NET_SCOPE = 'Global_Net'
 UPDATE_GLOBAL_ITER = 10
-GAMMA = 0.99
-ENTROPY_BETA = 0.005
-LR_A = 0.00001    # learning rate for actor
+GAMMA = 0.9
+ENTROPY_BETA = 0.001
+LR_A = 0.00002    # learning rate for actor
 LR_C = 0.0001     # learning rate for critic
 GLOBAL_RUNNING_R = []
 GLOBAL_EP = 0
@@ -95,9 +95,9 @@ def __init__(self, scope, globalAC=None):
             self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
 
     def _build_net(self):
-        w_init = tf.random_normal_initializer(0., .01)
+        w_init = tf.random_normal_initializer(0., .1)
         with tf.variable_scope('critic'):   # only critic controls the rnn update
-            cell_size = 128
+            cell_size = 126
             s = tf.expand_dims(self.s, axis=1, name='timely_input')  # [time_step, feature] => [time_step, batch, feature]
             rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
@@ -105,12 +105,12 @@ def _build_net(self):
             outputs, self.final_state = tf.nn.dynamic_rnn(
                 cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
             cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs')  # joined state representation
-            l_c = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+            l_c = tf.layers.dense(cell_out, 512, tf.nn.relu6, kernel_initializer=w_init, name='lc')
             v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v')  # state value
 
         with tf.variable_scope('actor'):  # state representation is based on critic
             cell_out = tf.stop_gradient(cell_out, name='c_cell_out')    # from what critic think it is
-            l_a = tf.layers.dense(cell_out, 400, tf.nn.relu6, kernel_initializer=w_init, name='la')
+            l_a = tf.layers.dense(cell_out, 512, tf.nn.relu6, kernel_initializer=w_init, name='la')
             mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
             sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')  # restrict variance
         return mu, sigma, v
@@ -233,3 +233,8 @@ def work(self):
         t.start()
         worker_threads.append(t)
     COORD.join(worker_threads)
+    import matplotlib.pyplot as plt
+    plt.plot(GLOBAL_RUNNING_R)
+    plt.xlabel('episode')
+    plt.ylabel('global running reward')
+    plt.show()
\ No newline at end of file
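
Note on the appended plotting block: it only produces a curve if GLOBAL_RUNNING_R has been filled by the worker threads during training. A minimal, self-contained sketch of the same pattern is shown below; the smoothing rule and the randomly generated episode returns are illustrative assumptions, and only the four plotting calls mirror what this patch adds.

    # Sketch of the running-reward bookkeeping and the plot appended by this patch.
    # The random "episode returns" and the 0.95/0.05 smoothing are assumptions for
    # illustration, not values taken from the training scripts.
    import matplotlib.pyplot as plt
    import numpy as np

    GLOBAL_RUNNING_R = []   # in the real scripts, appended to by the worker threads

    for ep_r in np.random.uniform(-100., 300., size=500):   # stand-in episode returns
        if len(GLOBAL_RUNNING_R) == 0:
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            # exponential moving average of the episode reward
            GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)

    plt.plot(GLOBAL_RUNNING_R)
    plt.xlabel('episode')
    plt.ylabel('global running reward')
    plt.show()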