From 30a82ec89aa6fc540474978622e11d00ba00fd4d Mon Sep 17 00:00:00 2001
From: morvanzhou
Date: Fri, 15 Jun 2018 12:57:08 +0800
Subject: [PATCH] add plt to show result

---
 experiments/Solve_BipedalWalker/A3C.py     |  8 ++++++--
 experiments/Solve_BipedalWalker/A3C_rnn.py | 19 ++++++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/experiments/Solve_BipedalWalker/A3C.py b/experiments/Solve_BipedalWalker/A3C.py
index 5cecde5..560aa95 100644
--- a/experiments/Solve_BipedalWalker/A3C.py
+++ b/experiments/Solve_BipedalWalker/A3C.py
@@ -28,7 +28,7 @@
 UPDATE_GLOBAL_ITER = 10
 GAMMA = 0.99
 ENTROPY_BETA = 0.005
-LR_A = 0.00002    # learning rate for actor
+LR_A = 0.00005    # learning rate for actor
 LR_C = 0.0001     # learning rate for critic
 GLOBAL_RUNNING_R = []
 GLOBAL_EP = 0
@@ -205,5 +205,9 @@ def work(self):
         t.start()
         worker_threads.append(t)
     COORD.join(worker_threads)
-
+    import matplotlib.pyplot as plt
+    plt.plot(GLOBAL_RUNNING_R)
+    plt.xlabel('episode')
+    plt.ylabel('global running reward')
+    plt.show()
diff --git a/experiments/Solve_BipedalWalker/A3C_rnn.py b/experiments/Solve_BipedalWalker/A3C_rnn.py
index 1028414..6768b2d 100644
--- a/experiments/Solve_BipedalWalker/A3C_rnn.py
+++ b/experiments/Solve_BipedalWalker/A3C_rnn.py
@@ -26,9 +26,9 @@
 MAX_GLOBAL_EP = 8000
 GLOBAL_NET_SCOPE = 'Global_Net'
 UPDATE_GLOBAL_ITER = 10
-GAMMA = 0.99
-ENTROPY_BETA = 0.005
-LR_A = 0.00001    # learning rate for actor
+GAMMA = 0.9
+ENTROPY_BETA = 0.001
+LR_A = 0.00002    # learning rate for actor
 LR_C = 0.0001     # learning rate for critic
 GLOBAL_RUNNING_R = []
 GLOBAL_EP = 0
@@ -95,9 +95,9 @@ def __init__(self, scope, globalAC=None):
             self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
 
     def _build_net(self):
-        w_init = tf.random_normal_initializer(0., .01)
+        w_init = tf.random_normal_initializer(0., .1)
         with tf.variable_scope('critic'):   # only critic controls the rnn update
-            cell_size = 128
+            cell_size = 126
             s = tf.expand_dims(self.s, axis=1, name='timely_input')  # [time_step, feature] => [time_step, batch, feature]
             rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
@@ -105,12 +105,12 @@ def _build_net(self):
             outputs, self.final_state = tf.nn.dynamic_rnn(
                 cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
             cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs')  # joined state representation
-            l_c = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+            l_c = tf.layers.dense(cell_out, 512, tf.nn.relu6, kernel_initializer=w_init, name='lc')
             v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v')  # state value
 
         with tf.variable_scope('actor'):  # state representation is based on critic
             cell_out = tf.stop_gradient(cell_out, name='c_cell_out')    # from what critic think it is
-            l_a = tf.layers.dense(cell_out, 400, tf.nn.relu6, kernel_initializer=w_init, name='la')
+            l_a = tf.layers.dense(cell_out, 512, tf.nn.relu6, kernel_initializer=w_init, name='la')
             mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
             sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')  # restrict variance
         return mu, sigma, v
@@ -233,3 +233,8 @@ def work(self):
         t.start()
         worker_threads.append(t)
     COORD.join(worker_threads)
+    import matplotlib.pyplot as plt
+    plt.plot(GLOBAL_RUNNING_R)
+    plt.xlabel('episode')
+    plt.ylabel('global running reward')
+    plt.show()
\ No newline at end of file
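
Note on the appended plotting block: it only produces a curve if GLOBAL_RUNNING_R has been filled by the worker threads during training. A minimal, self-contained sketch of the same pattern is shown below; the smoothing rule and the randomly generated episode returns are illustrative assumptions, and only the four plotting calls mirror what this patch adds.

    # Sketch of the running-reward bookkeeping and the plot appended by this patch.
    # The random "episode returns" and the 0.95/0.05 smoothing are assumptions for
    # illustration, not values taken from the training scripts.
    import matplotlib.pyplot as plt
    import numpy as np

    GLOBAL_RUNNING_R = []   # in the real scripts, appended to by the worker threads

    for ep_r in np.random.uniform(-100., 300., size=500):   # stand-in episode returns
        if len(GLOBAL_RUNNING_R) == 0:
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            # exponential moving average of the episode reward
            GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)

    plt.plot(GLOBAL_RUNNING_R)
    plt.xlabel('episode')
    plt.ylabel('global running reward')
    plt.show()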