diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5f17bfe
--- /dev/null
+++ b/README.md
@@ -0,0 +1,50 @@
+
+
+
+
+
+
+---
+
+
+
+# Reinforcement Learning Methods and Tutorials
+
+These reinforcement learning tutorials cover everything from basic RL algorithms to advanced algorithms developed in recent years.
+
+**For Chinese speakers, visit [莫烦 Python](https://morvanzhou.github.io/tutorials/) or my [Youtube channel](https://www.youtube.com/channel/UCdyjiB5H8Pu7aDTNVXTTpcg) for more.**
+
+**By popular request, English versions of these tutorials are available in this playlist:** ([https://www.youtube.com/playlist?list=PLXO45tsB95cIplu-fLMpUEEZTwrDNh6Ba](https://www.youtube.com/playlist?list=PLXO45tsB95cIplu-fLMpUEEZTwrDNh6Ba))
+
+
+* [Simple entry example](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/1_command_line_reinforcement_learning)
+* Tabular Methods
+ * [Q-learning](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/2_Q_Learning_maze)
+ * [Sarsa](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/3_Sarsa_maze)
+ * [Sarsa(lambda)](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/4_Sarsa_lambda_maze)
+* Function Approximation (DQN)
+ * [Deep Q Network](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5_Deep_Q_Network)
+* [Using OpenAI Gym](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/6_OpenAI_gym)
+* DQN-based methods
+ * [Double DQN](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.1_Double_DQN)
+  * [DQN with Prioritized Experience Replay](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.2_Prioritized_Replay_DQN)
+ * [Dueling DQN](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.3_Dueling_DQN)
+* [Policy Gradients](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/7_Policy_gradient_softmax)
+* [Actor Critic](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/8_Actor_Critic_Advantage)
+ * [Deep Deterministic Policy Gradient](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/9_Deep_Deterministic_Policy_Gradient_DDPG)
+ * [A3C](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/10_A3C)
+* Model-based RL (WIP)
+ * [Dyna-Q](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/11_Dyna_Q)
+
+
+# Donation
+
+*If these tutorials help you, please consider donating to support me in making better tutorials. Any contribution is greatly appreciated!*
+
+
+
+ 
+
diff --git a/RL_cover.jpg b/RL_cover.jpg
new file mode 100644
index 0000000..8a47adc
Binary files /dev/null and b/RL_cover.jpg differ
diff --git a/contents/10_A3C/A3C_RNN.py b/contents/10_A3C/A3C_RNN.py
new file mode 100644
index 0000000..82ea6bb
--- /dev/null
+++ b/contents/10_A3C/A3C_RNN.py
@@ -0,0 +1,230 @@
+"""
+Asynchronous Advantage Actor Critic (A3C) + RNN with continuous action space, Reinforcement Learning.
+
+The Pendulum example.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import multiprocessing
+import threading
+import tensorflow as tf
+import numpy as np
+import gym
+import os
+import shutil
+import matplotlib.pyplot as plt
+
+GAME = 'Pendulum-v0'
+OUTPUT_GRAPH = True
+LOG_DIR = './log'
+N_WORKERS = multiprocessing.cpu_count()
+MAX_EP_STEP = 400
+MAX_GLOBAL_EP = 800
+GLOBAL_NET_SCOPE = 'Global_Net'
+UPDATE_GLOBAL_ITER = 5
+GAMMA = 0.9
+ENTROPY_BETA = 0.01
+LR_A = 0.0001 # learning rate for actor
+LR_C = 0.001 # learning rate for critic
+GLOBAL_RUNNING_R = []
+GLOBAL_EP = 0
+
+env = gym.make(GAME)
+
+N_S = env.observation_space.shape[0]
+N_A = env.action_space.shape[0]
+A_BOUND = [env.action_space.low, env.action_space.high]
+
+
+class ACNet(object):
+ def __init__(self, scope, globalAC=None):
+
+ if scope == GLOBAL_NET_SCOPE: # get global network
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self._build_net()
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ else: # local net, calculate losses
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
+ self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
+
+ mu, sigma, self.v = self._build_net()
+
+ td = tf.subtract(self.v_target, self.v, name='TD_error')
+ with tf.name_scope('c_loss'):
+ self.c_loss = tf.reduce_mean(tf.square(td))
+
+ with tf.name_scope('wrap_a_out'):
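+                    # scale the tanh mean into the action bound and add a small constant so sigma stays strictly positive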
+ mu, sigma = mu * A_BOUND[1], sigma + 1e-4
+
+ normal_dist = tf.contrib.distributions.Normal(mu, sigma)
+
+ with tf.name_scope('a_loss'):
+ log_prob = normal_dist.log_prob(self.a_his)
+ exp_v = log_prob * td
+ entropy = normal_dist.entropy() # encourage exploration
+ self.exp_v = ENTROPY_BETA * entropy + exp_v
+ self.a_loss = tf.reduce_mean(-self.exp_v)
+
+ with tf.name_scope('choose_a'): # use local params to choose action
+ self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1])
+ with tf.name_scope('local_grad'):
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ self.a_grads = tf.gradients(self.a_loss, self.a_params)
+ self.c_grads = tf.gradients(self.c_loss, self.c_params)
+
+ with tf.name_scope('sync'):
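+                # 'pull' copies the latest global parameters into this local net; 'push' applies this worker's gradients to the global net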
+ with tf.name_scope('pull'):
+ self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
+ self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
+ with tf.name_scope('push'):
+ self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
+ self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
+
+ def _build_net(self):
+ w_init = tf.random_normal_initializer(0., .1)
+ with tf.variable_scope('critic'): # only critic controls the rnn update
+ cell_size = 32
+ s = tf.expand_dims(self.s, axis=1,
+ name='timely_input') # [time_step, feature] => [time_step, batch, feature]
+ rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
+ self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32)
+ outputs, self.final_state = tf.nn.dynamic_rnn(
+ cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
+ cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation
+ l_c = tf.layers.dense(cell_out, 50, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+ v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
+
+ with tf.variable_scope('actor'): # state representation is based on critic
+            cell_out = tf.stop_gradient(cell_out, name='c_cell_out')    # reuse the critic's state representation; stop_gradient keeps actor gradients out of the shared RNN
+ l_a = tf.layers.dense(cell_out, 80, tf.nn.relu6, kernel_initializer=w_init, name='la')
+ mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
+ sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
+ return mu, sigma, v
+
+ def update_global(self, feed_dict): # run by a local
+        SESS.run([self.update_a_op, self.update_c_op], feed_dict)  # local grads are applied to the global net
+
+ def pull_global(self): # run by a local
+ SESS.run([self.pull_a_params_op, self.pull_c_params_op])
+
+ def choose_action(self, s, cell_state): # run by a local
+ s = s[np.newaxis, :]
+ a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state})
+ return a[0], cell_state
+
+
+class Worker(object):
+ def __init__(self, name, globalAC):
+ self.env = gym.make(GAME).unwrapped
+ self.name = name
+ self.AC = ACNet(name, globalAC)
+
+ def work(self):
+ global GLOBAL_RUNNING_R, GLOBAL_EP
+ total_step = 1
+ buffer_s, buffer_a, buffer_r = [], [], []
+ while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
+ s = self.env.reset()
+ ep_r = 0
+ rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning
+ keep_state = rnn_state.copy() # keep rnn state for updating global net
+ for ep_t in range(MAX_EP_STEP):
+ if self.name == 'W_0':
+ self.env.render()
+
+ a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state
+ s_, r, done, info = self.env.step(a)
+ done = True if ep_t == MAX_EP_STEP - 1 else False
+ r /= 10 # normalize reward
+
+ ep_r += r
+ buffer_s.append(s)
+ buffer_a.append(a)
+ buffer_r.append(r)
+
+ if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
+ if done:
+ v_s_ = 0 # terminal
+ else:
+ v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0, 0]
+ buffer_v_target = []
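+                    # iterate the rewards backwards to build bootstrapped discounted targets: v_target = r + GAMMA * v_target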
+ for r in buffer_r[::-1]: # reverse buffer r
+ v_s_ = r + GAMMA * v_s_
+ buffer_v_target.append(v_s_)
+ buffer_v_target.reverse()
+
+ buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
+
+ feed_dict = {
+ self.AC.s: buffer_s,
+ self.AC.a_his: buffer_a,
+ self.AC.v_target: buffer_v_target,
+ self.AC.init_state: keep_state,
+ }
+
+ self.AC.update_global(feed_dict)
+ buffer_s, buffer_a, buffer_r = [], [], []
+ self.AC.pull_global()
+ keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_
+
+ s = s_
+ rnn_state = rnn_state_ # renew rnn state
+ total_step += 1
+
+ if done:
+ if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
+ GLOBAL_RUNNING_R.append(ep_r)
+ else:
+ GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
+ print(
+ self.name,
+ "Ep:", GLOBAL_EP,
+ "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
+ )
+ GLOBAL_EP += 1
+ break
+
+if __name__ == "__main__":
+ SESS = tf.Session()
+
+ with tf.device("/cpu:0"):
+ OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
+ OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
+ GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
+ workers = []
+ # Create worker
+ for i in range(N_WORKERS):
+ i_name = 'W_%i' % i # worker name
+ workers.append(Worker(i_name, GLOBAL_AC))
+
+ COORD = tf.train.Coordinator()
+ SESS.run(tf.global_variables_initializer())
+
+ if OUTPUT_GRAPH:
+ if os.path.exists(LOG_DIR):
+ shutil.rmtree(LOG_DIR)
+ tf.summary.FileWriter(LOG_DIR, SESS.graph)
+
+ worker_threads = []
+ for worker in workers:
+ job = lambda: worker.work()
+ t = threading.Thread(target=job)
+ t.start()
+ worker_threads.append(t)
+ COORD.join(worker_threads)
+
+ plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
+ plt.xlabel('step')
+ plt.ylabel('Total moving reward')
+ plt.show()
+
diff --git a/contents/10_A3C/A3C_continuous_action.py b/contents/10_A3C/A3C_continuous_action.py
new file mode 100644
index 0000000..4cd534a
--- /dev/null
+++ b/contents/10_A3C/A3C_continuous_action.py
@@ -0,0 +1,210 @@
+"""
+Asynchronous Advantage Actor Critic (A3C) with continuous action space, Reinforcement Learning.
+
+The Pendulum example.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import multiprocessing
+import threading
+import tensorflow as tf
+import numpy as np
+import gym
+import os
+import shutil
+import matplotlib.pyplot as plt
+
+GAME = 'Pendulum-v0'
+OUTPUT_GRAPH = True
+LOG_DIR = './log'
+N_WORKERS = multiprocessing.cpu_count()
+MAX_EP_STEP = 400
+MAX_GLOBAL_EP = 800
+GLOBAL_NET_SCOPE = 'Global_Net'
+UPDATE_GLOBAL_ITER = 5
+GAMMA = 0.9
+ENTROPY_BETA = 0.01
+LR_A = 0.0001 # learning rate for actor
+LR_C = 0.001 # learning rate for critic
+GLOBAL_RUNNING_R = []
+GLOBAL_EP = 0
+
+env = gym.make(GAME)
+
+N_S = env.observation_space.shape[0]
+N_A = env.action_space.shape[0]
+A_BOUND = [env.action_space.low, env.action_space.high]
+
+
+class ACNet(object):
+ def __init__(self, scope, globalAC=None):
+
+ if scope == GLOBAL_NET_SCOPE: # get global network
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self._build_net()
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ else: # local net, calculate losses
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
+ self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
+
+ mu, sigma, self.v = self._build_net()
+
+ td = tf.subtract(self.v_target, self.v, name='TD_error')
+ with tf.name_scope('c_loss'):
+ self.c_loss = tf.reduce_mean(tf.square(td))
+
+ with tf.name_scope('wrap_a_out'):
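+                    # scale the tanh mean into the action bound and add a small constant so sigma stays strictly positive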
+ mu, sigma = mu * A_BOUND[1], sigma + 1e-4
+
+ normal_dist = tf.contrib.distributions.Normal(mu, sigma)
+
+ with tf.name_scope('a_loss'):
+ log_prob = normal_dist.log_prob(self.a_his)
+ exp_v = log_prob * td
+ entropy = normal_dist.entropy() # encourage exploration
+ self.exp_v = ENTROPY_BETA * entropy + exp_v
+ self.a_loss = tf.reduce_mean(-self.exp_v)
+
+ with tf.name_scope('choose_a'): # use local params to choose action
+ self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1])
+ with tf.name_scope('local_grad'):
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ self.a_grads = tf.gradients(self.a_loss, self.a_params)
+ self.c_grads = tf.gradients(self.c_loss, self.c_params)
+
+ with tf.name_scope('sync'):
+ with tf.name_scope('pull'):
+ self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
+ self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
+ with tf.name_scope('push'):
+ self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
+ self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
+
+    def _build_net(self):
+ w_init = tf.random_normal_initializer(0., .1)
+ with tf.variable_scope('actor'):
+ l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
+ mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
+ sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
+ with tf.variable_scope('critic'):
+ l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+ v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
+ return mu, sigma, v
+
+ def update_global(self, feed_dict): # run by a local
+        SESS.run([self.update_a_op, self.update_c_op], feed_dict)  # local grads are applied to the global net
+
+ def pull_global(self): # run by a local
+ SESS.run([self.pull_a_params_op, self.pull_c_params_op])
+
+ def choose_action(self, s): # run by a local
+ s = s[np.newaxis, :]
+ return SESS.run(self.A, {self.s: s})[0]
+
+
+class Worker(object):
+ def __init__(self, name, globalAC):
+ self.env = gym.make(GAME).unwrapped
+ self.name = name
+ self.AC = ACNet(name, globalAC)
+
+ def work(self):
+ global GLOBAL_RUNNING_R, GLOBAL_EP
+ total_step = 1
+ buffer_s, buffer_a, buffer_r = [], [], []
+ while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
+ s = self.env.reset()
+ ep_r = 0
+ for ep_t in range(MAX_EP_STEP):
+ if self.name == 'W_0':
+ self.env.render()
+ a = self.AC.choose_action(s)
+ s_, r, done, info = self.env.step(a)
+ done = True if ep_t == MAX_EP_STEP - 1 else False
+ r /= 10 # normalize reward
+
+ ep_r += r
+ buffer_s.append(s)
+ buffer_a.append(a)
+ buffer_r.append(r)
+
+ if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
+ if done:
+ v_s_ = 0 # terminal
+ else:
+ v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
+ buffer_v_target = []
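+                    # iterate the rewards backwards to build bootstrapped discounted targets: v_target = r + GAMMA * v_target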
+ for r in buffer_r[::-1]: # reverse buffer r
+ v_s_ = r + GAMMA * v_s_
+ buffer_v_target.append(v_s_)
+ buffer_v_target.reverse()
+
+ buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
+ feed_dict = {
+ self.AC.s: buffer_s,
+ self.AC.a_his: buffer_a,
+ self.AC.v_target: buffer_v_target,
+ }
+ self.AC.update_global(feed_dict)
+ buffer_s, buffer_a, buffer_r = [], [], []
+ self.AC.pull_global()
+
+ s = s_
+ total_step += 1
+ if done:
+ if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
+ GLOBAL_RUNNING_R.append(ep_r)
+ else:
+ GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
+ print(
+ self.name,
+ "Ep:", GLOBAL_EP,
+ "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
+ )
+ GLOBAL_EP += 1
+ break
+
+if __name__ == "__main__":
+ SESS = tf.Session()
+
+ with tf.device("/cpu:0"):
+ OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
+ OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
+ GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
+ workers = []
+ # Create worker
+ for i in range(N_WORKERS):
+ i_name = 'W_%i' % i # worker name
+ workers.append(Worker(i_name, GLOBAL_AC))
+
+ COORD = tf.train.Coordinator()
+ SESS.run(tf.global_variables_initializer())
+
+ if OUTPUT_GRAPH:
+ if os.path.exists(LOG_DIR):
+ shutil.rmtree(LOG_DIR)
+ tf.summary.FileWriter(LOG_DIR, SESS.graph)
+
+ worker_threads = []
+ for worker in workers:
+ job = lambda: worker.work()
+ t = threading.Thread(target=job)
+ t.start()
+ worker_threads.append(t)
+ COORD.join(worker_threads)
+
+ plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
+ plt.xlabel('step')
+ plt.ylabel('Total moving reward')
+ plt.show()
+
diff --git a/contents/10_A3C/A3C_discrete_action.py b/contents/10_A3C/A3C_discrete_action.py
new file mode 100644
index 0000000..f17352a
--- /dev/null
+++ b/contents/10_A3C/A3C_discrete_action.py
@@ -0,0 +1,201 @@
+"""
+Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning.
+
+The Cartpole example.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import multiprocessing
+import threading
+import tensorflow as tf
+import numpy as np
+import gym
+import os
+import shutil
+import matplotlib.pyplot as plt
+
+
+GAME = 'CartPole-v0'
+OUTPUT_GRAPH = True
+LOG_DIR = './log'
+N_WORKERS = multiprocessing.cpu_count()
+MAX_GLOBAL_EP = 1000
+GLOBAL_NET_SCOPE = 'Global_Net'
+UPDATE_GLOBAL_ITER = 20
+GAMMA = 0.9
+ENTROPY_BETA = 0.001
+LR_A = 0.001 # learning rate for actor
+LR_C = 0.001 # learning rate for critic
+GLOBAL_RUNNING_R = []
+GLOBAL_EP = 0
+
+env = gym.make(GAME)
+
+N_S = env.observation_space.shape[0]
+N_A = env.action_space.n
+
+
+class ACNet(object):
+ def __init__(self, scope, globalAC=None):
+
+ if scope == GLOBAL_NET_SCOPE: # get global network
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self._build_net()
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ else: # local net, calculate losses
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
+ self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
+
+ self.a_prob, self.v = self._build_net()
+
+ td = tf.subtract(self.v_target, self.v, name='TD_error')
+ with tf.name_scope('c_loss'):
+ self.c_loss = tf.reduce_mean(tf.square(td))
+
+ with tf.name_scope('a_loss'):
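+                    # log-probability of the taken action (via the one-hot mask), weighted by the TD error, plus an entropy bonus for exploration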
+ log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
+ exp_v = log_prob * td
+ entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob), axis=1, keep_dims=True) # encourage exploration
+ self.exp_v = ENTROPY_BETA * entropy + exp_v
+ self.a_loss = tf.reduce_mean(-self.exp_v)
+
+ with tf.name_scope('local_grad'):
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ self.a_grads = tf.gradients(self.a_loss, self.a_params)
+ self.c_grads = tf.gradients(self.c_loss, self.c_params)
+
+ with tf.name_scope('sync'):
+ with tf.name_scope('pull'):
+ self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
+ self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
+ with tf.name_scope('push'):
+ self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
+ self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
+
+ def _build_net(self):
+ w_init = tf.random_normal_initializer(0., .1)
+ with tf.variable_scope('actor'):
+ l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
+ a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap')
+ with tf.variable_scope('critic'):
+ l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+ v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
+ return a_prob, v
+
+ def update_global(self, feed_dict): # run by a local
+        SESS.run([self.update_a_op, self.update_c_op], feed_dict)  # local grads are applied to the global net
+
+ def pull_global(self): # run by a local
+ SESS.run([self.pull_a_params_op, self.pull_c_params_op])
+
+ def choose_action(self, s): # run by a local
+ prob_weights = SESS.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]})
+ action = np.random.choice(range(prob_weights.shape[1]),
+                                  p=prob_weights.ravel())  # sample an action according to the action probabilities
+ return action
+
+
+class Worker(object):
+ def __init__(self, name, globalAC):
+ self.env = gym.make(GAME).unwrapped
+ self.name = name
+ self.AC = ACNet(name, globalAC)
+
+ def work(self):
+ global GLOBAL_RUNNING_R, GLOBAL_EP
+ total_step = 1
+ buffer_s, buffer_a, buffer_r = [], [], []
+ while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
+ s = self.env.reset()
+ ep_r = 0
+ while True:
+ if self.name == 'W_0':
+ self.env.render()
+ a = self.AC.choose_action(s)
+ s_, r, done, info = self.env.step(a)
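+                # overwrite the reward with a penalty when the episode ends (the pole falls over or the cart leaves the track)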
+ if done: r = -5
+ ep_r += r
+ buffer_s.append(s)
+ buffer_a.append(a)
+ buffer_r.append(r)
+
+ if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
+ if done:
+ v_s_ = 0 # terminal
+ else:
+ v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
+ buffer_v_target = []
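+                    # iterate the rewards backwards to build bootstrapped discounted targets: v_target = r + GAMMA * v_target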
+ for r in buffer_r[::-1]: # reverse buffer r
+ v_s_ = r + GAMMA * v_s_
+ buffer_v_target.append(v_s_)
+ buffer_v_target.reverse()
+
+ buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
+ feed_dict = {
+ self.AC.s: buffer_s,
+ self.AC.a_his: buffer_a,
+ self.AC.v_target: buffer_v_target,
+ }
+ self.AC.update_global(feed_dict)
+
+ buffer_s, buffer_a, buffer_r = [], [], []
+ self.AC.pull_global()
+
+ s = s_
+ total_step += 1
+ if done:
+ if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
+ GLOBAL_RUNNING_R.append(ep_r)
+ else:
+ GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
+ print(
+ self.name,
+ "Ep:", GLOBAL_EP,
+ "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
+ )
+ GLOBAL_EP += 1
+ break
+
+if __name__ == "__main__":
+ SESS = tf.Session()
+
+ with tf.device("/cpu:0"):
+ OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
+ OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
+ GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
+ workers = []
+ # Create worker
+ for i in range(N_WORKERS):
+ i_name = 'W_%i' % i # worker name
+ workers.append(Worker(i_name, GLOBAL_AC))
+
+ COORD = tf.train.Coordinator()
+ SESS.run(tf.global_variables_initializer())
+
+ if OUTPUT_GRAPH:
+ if os.path.exists(LOG_DIR):
+ shutil.rmtree(LOG_DIR)
+ tf.summary.FileWriter(LOG_DIR, SESS.graph)
+
+ worker_threads = []
+ for worker in workers:
+ job = lambda: worker.work()
+ t = threading.Thread(target=job)
+ t.start()
+ worker_threads.append(t)
+ COORD.join(worker_threads)
+
+ plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
+ plt.xlabel('step')
+ plt.ylabel('Total moving reward')
+ plt.show()
diff --git a/contents/11_Dyna_Q/RL_brain.py b/contents/11_Dyna_Q/RL_brain.py
new file mode 100644
index 0000000..f4be936
--- /dev/null
+++ b/contents/11_Dyna_Q/RL_brain.py
@@ -0,0 +1,79 @@
+"""
+This part of code is the Dyna-Q learning brain, which is a brain of the agent.
+All decisions and learning processes are made in here.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+import numpy as np
+import pandas as pd
+
+
+class QLearningTable:
+ def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
+ self.actions = actions # a list
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon = e_greedy
+ self.q_table = pd.DataFrame(columns=self.actions)
+
+ def choose_action(self, observation):
+ self.check_state_exist(observation)
+ # action selection
+ if np.random.uniform() < self.epsilon:
+ # choose best action
+ state_action = self.q_table.ix[observation, :]
+            state_action = state_action.reindex(np.random.permutation(state_action.index))     # shuffle to break ties between actions with the same value
+ action = state_action.argmax()
+ else:
+ # choose random action
+ action = np.random.choice(self.actions)
+ return action
+
+ def learn(self, s, a, r, s_):
+ self.check_state_exist(s_)
+ q_predict = self.q_table.ix[s, a]
+ if s_ != 'terminal':
+ q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal
+ else:
+ q_target = r # next state is terminal
+ self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update
+
+ def check_state_exist(self, state):
+ if state not in self.q_table.index:
+ # append new state to q table
+ self.q_table = self.q_table.append(
+ pd.Series(
+ [0]*len(self.actions),
+ index=self.q_table.columns,
+ name=state,
+ )
+ )
+
+
+class EnvModel:
+    """Similar to the memory buffer in DQN, past experiences can be stored here.
+    Alternatively, the model could be trained to predict the next state and reward signal accurately."""
+ def __init__(self, actions):
+        # in the simplest case, the model is just a memory that stores all past transition information
+ self.actions = actions
+ self.database = pd.DataFrame(columns=actions, dtype=np.object)
+
+ def store_transition(self, s, a, r, s_):
+ if s not in self.database.index:
+ self.database = self.database.append(
+ pd.Series(
+ [None] * len(self.actions),
+ index=self.database.columns,
+ name=s,
+ ))
+ self.database.set_value(s, a, (r, s_))
+
+ def sample_s_a(self):
+ s = np.random.choice(self.database.index)
+ a = np.random.choice(self.database.ix[s].dropna().index) # filter out the None value
+ return s, a
+
+ def get_r_s_(self, s, a):
+ r, s_ = self.database.ix[s, a]
+ return r, s_
diff --git a/contents/11_Dyna_Q/maze_env.py b/contents/11_Dyna_Q/maze_env.py
new file mode 100644
index 0000000..5ec5370
--- /dev/null
+++ b/contents/11_Dyna_Q/maze_env.py
@@ -0,0 +1,129 @@
+"""
+Reinforcement learning maze example.
+
+Red rectangle: explorer.
+Black rectangles: hells [reward = -1].
+Yellow circle: paradise [reward = +1].
+All other states: ground [reward = 0].
+
+This script is the environment part of this example. The RL is in RL_brain.py.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+
+import numpy as np
+np.random.seed(1)
+import tkinter as tk
+import time
+
+
+UNIT = 40 # pixels
+MAZE_H = 4 # grid height
+MAZE_W = 4 # grid width
+
+
+class Maze(tk.Tk, object):
+ def __init__(self):
+ super(Maze, self).__init__()
+ self.action_space = ['u', 'd', 'l', 'r']
+ self.n_actions = len(self.action_space)
+ self.title('maze')
+ self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
+ self._build_maze()
+
+ def _build_maze(self):
+ self.canvas = tk.Canvas(self, bg='white',
+ height=MAZE_H * UNIT,
+ width=MAZE_W * UNIT)
+
+ # create grids
+ for c in range(0, MAZE_W * UNIT, UNIT):
+ x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
+ self.canvas.create_line(x0, y0, x1, y1)
+ for r in range(0, MAZE_H * UNIT, UNIT):
+ x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r
+ self.canvas.create_line(x0, y0, x1, y1)
+
+ # create origin
+ origin = np.array([20, 20])
+
+ # hell
+ hell1_center = origin + np.array([UNIT * 2, UNIT])
+ self.hell1 = self.canvas.create_rectangle(
+ hell1_center[0] - 15, hell1_center[1] - 15,
+ hell1_center[0] + 15, hell1_center[1] + 15,
+ fill='black')
+ # hell
+ hell2_center = origin + np.array([UNIT, UNIT * 2])
+ self.hell2 = self.canvas.create_rectangle(
+ hell2_center[0] - 15, hell2_center[1] - 15,
+ hell2_center[0] + 15, hell2_center[1] + 15,
+ fill='black')
+
+ # create oval
+ oval_center = origin + UNIT * 2
+ self.oval = self.canvas.create_oval(
+ oval_center[0] - 15, oval_center[1] - 15,
+ oval_center[0] + 15, oval_center[1] + 15,
+ fill='yellow')
+
+ # create red rect
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+
+ # pack all
+ self.canvas.pack()
+
+ def reset(self):
+ self.update()
+ time.sleep(0.5)
+ self.canvas.delete(self.rect)
+ origin = np.array([20, 20])
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+ # return observation
+ return self.canvas.coords(self.rect)
+
+ def step(self, action):
+ s = self.canvas.coords(self.rect)
+ base_action = np.array([0, 0])
+ if action == 0: # up
+ if s[1] > UNIT:
+ base_action[1] -= UNIT
+ elif action == 1: # down
+ if s[1] < (MAZE_H - 1) * UNIT:
+ base_action[1] += UNIT
+ elif action == 2: # right
+ if s[0] < (MAZE_W - 1) * UNIT:
+ base_action[0] += UNIT
+ elif action == 3: # left
+ if s[0] > UNIT:
+ base_action[0] -= UNIT
+
+ self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
+
+ s_ = self.canvas.coords(self.rect) # next state
+
+ # reward function
+ if s_ == self.canvas.coords(self.oval):
+ reward = 1
+ done = True
+ elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
+ reward = -1
+ done = True
+ else:
+ reward = 0
+ done = False
+
+ return s_, reward, done
+
+ def render(self):
+ # time.sleep(0.1)
+ self.update()
+
+
diff --git a/contents/11_Dyna_Q/run_this.py b/contents/11_Dyna_Q/run_this.py
new file mode 100644
index 0000000..d784bc4
--- /dev/null
+++ b/contents/11_Dyna_Q/run_this.py
@@ -0,0 +1,51 @@
+"""
+Simplest model-based RL, Dyna-Q.
+
+Red rectangle: explorer.
+Black rectangles: hells [reward = -1].
+Yellow circle: paradise [reward = +1].
+All other states: ground [reward = 0].
+
+This script is the main part which controls the update method of this example.
+The RL is in RL_brain.py.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+from maze_env import Maze
+from RL_brain import QLearningTable, EnvModel
+
+
+def update():
+ for episode in range(40):
+ s = env.reset()
+ while True:
+ env.render()
+ a = RL.choose_action(str(s))
+ s_, r, done = env.step(a)
+ RL.learn(str(s), a, r, str(s_))
+
+ # use a model to output (r, s_) by inputting (s, a)
+            # in this Dyna-Q version, the model is just a memory replay buffer
+ env_model.store_transition(str(s), a, r, s_)
+ for n in range(10): # learn 10 more times using the env_model
+ ms, ma = env_model.sample_s_a() # ms in here is a str
+ mr, ms_ = env_model.get_r_s_(ms, ma)
+ RL.learn(ms, ma, mr, str(ms_))
+
+ s = s_
+ if done:
+ break
+
+ # end of game
+ print('game over')
+ env.destroy()
+
+
+if __name__ == "__main__":
+ env = Maze()
+ RL = QLearningTable(actions=list(range(env.n_actions)))
+ env_model = EnvModel(actions=list(range(env.n_actions)))
+
+ env.after(0, update)
+ env.mainloop()
\ No newline at end of file
diff --git a/contents/1_command_line_reinforcement_learning/treasure_on_right.py b/contents/1_command_line_reinforcement_learning/treasure_on_right.py
new file mode 100644
index 0000000..5970860
--- /dev/null
+++ b/contents/1_command_line_reinforcement_learning/treasure_on_right.py
@@ -0,0 +1,107 @@
+"""
+A simple example of Reinforcement Learning using the table-lookup Q-learning method.
+An agent "o" starts on the left of a one-dimensional world; the treasure is at the rightmost location.
+Run this program to see how the agent improves its strategy for finding the treasure.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+import numpy as np
+import pandas as pd
+import time
+
+np.random.seed(2) # reproducible
+
+
+N_STATES = 6 # the length of the 1 dimensional world
+ACTIONS = ['left', 'right'] # available actions
+EPSILON = 0.9   # greedy policy (probability of choosing the greedy action)
+ALPHA = 0.1 # learning rate
+GAMMA = 0.9 # discount factor
+MAX_EPISODES = 13 # maximum episodes
+FRESH_TIME = 0.3 # fresh time for one move
+
+
+def build_q_table(n_states, actions):
+ table = pd.DataFrame(
+ np.zeros((n_states, len(actions))), # q_table initial values
+        columns=actions,    # action names
+ )
+ # print(table) # show table
+ return table
+
+
+def choose_action(state, q_table):
+ # This is how to choose an action
+ state_actions = q_table.iloc[state, :]
+    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):  # act non-greedily, or explore when this state's actions all have zero value
+ action_name = np.random.choice(ACTIONS)
+ else: # act greedy
+ action_name = state_actions.argmax()
+ return action_name
+
+
+def get_env_feedback(S, A):
+    # This is how the agent interacts with the environment
+ if A == 'right': # move right
+ if S == N_STATES - 2: # terminate
+ S_ = 'terminal'
+ R = 1
+ else:
+ S_ = S + 1
+ R = 0
+ else: # move left
+ R = 0
+ if S == 0:
+ S_ = S # reach the wall
+ else:
+ S_ = S - 1
+ return S_, R
+
+
+def update_env(S, episode, step_counter):
+    # This is how the environment is updated
+ env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment
+ if S == 'terminal':
+ interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
+ print('\r{}'.format(interaction), end='')
+ time.sleep(2)
+ print('\r ', end='')
+ else:
+ env_list[S] = 'o'
+ interaction = ''.join(env_list)
+ print('\r{}'.format(interaction), end='')
+ time.sleep(FRESH_TIME)
+
+
+def rl():
+ # main part of RL loop
+ q_table = build_q_table(N_STATES, ACTIONS)
+ for episode in range(MAX_EPISODES):
+ step_counter = 0
+ S = 0
+ is_terminated = False
+ update_env(S, episode, step_counter)
+ while not is_terminated:
+
+ A = choose_action(S, q_table)
+ S_, R = get_env_feedback(S, A) # take action & get next state and reward
+ q_predict = q_table.ix[S, A]
+ if S_ != 'terminal':
+ q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal
+ else:
+ q_target = R # next state is terminal
+ is_terminated = True # terminate this episode
+
+ q_table.ix[S, A] += ALPHA * (q_target - q_predict) # update
+ S = S_ # move to next state
+
+ update_env(S, episode, step_counter+1)
+ step_counter += 1
+ return q_table
+
+
+if __name__ == "__main__":
+ q_table = rl()
+ print('\r\nQ-table:\n')
+ print(q_table)
diff --git a/contents/2_Q_Learning_maze/RL_brain.py b/contents/2_Q_Learning_maze/RL_brain.py
new file mode 100644
index 0000000..844c475
--- /dev/null
+++ b/contents/2_Q_Learning_maze/RL_brain.py
@@ -0,0 +1,51 @@
+"""
+This part of code is the Q learning brain, which is a brain of the agent.
+All decisions are made in here.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+import numpy as np
+import pandas as pd
+
+
+class QLearningTable:
+ def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
+ self.actions = actions # a list
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon = e_greedy
+ self.q_table = pd.DataFrame(columns=self.actions)
+
+ def choose_action(self, observation):
+ self.check_state_exist(observation)
+ # action selection
+ if np.random.uniform() < self.epsilon:
+ # choose best action
+ state_action = self.q_table.ix[observation, :]
+            state_action = state_action.reindex(np.random.permutation(state_action.index))     # shuffle to break ties between actions with the same value
+ action = state_action.argmax()
+ else:
+ # choose random action
+ action = np.random.choice(self.actions)
+ return action
+
+ def learn(self, s, a, r, s_):
+ self.check_state_exist(s_)
+ q_predict = self.q_table.ix[s, a]
+ if s_ != 'terminal':
+ q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal
+ else:
+ q_target = r # next state is terminal
+ self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update
+
+ def check_state_exist(self, state):
+ if state not in self.q_table.index:
+ # append new state to q table
+ self.q_table = self.q_table.append(
+ pd.Series(
+ [0]*len(self.actions),
+ index=self.q_table.columns,
+ name=state,
+ )
+ )
\ No newline at end of file
diff --git a/contents/2_Q_Learning_maze/maze_env.py b/contents/2_Q_Learning_maze/maze_env.py
new file mode 100644
index 0000000..d7b8b0a
--- /dev/null
+++ b/contents/2_Q_Learning_maze/maze_env.py
@@ -0,0 +1,129 @@
+"""
+Reinforcement learning maze example.
+
+Red rectangle: explorer.
+Black rectangles: hells [reward = -1].
+Yellow circle: paradise [reward = +1].
+All other states: ground [reward = 0].
+
+This script is the environment part of this example. The RL is in RL_brain.py.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+
+import numpy as np
+np.random.seed(1)
+import tkinter as tk
+import time
+
+
+UNIT = 40 # pixels
+MAZE_H = 4 # grid height
+MAZE_W = 4 # grid width
+
+
+class Maze(tk.Tk, object):
+ def __init__(self):
+ super(Maze, self).__init__()
+ self.action_space = ['u', 'd', 'l', 'r']
+ self.n_actions = len(self.action_space)
+ self.title('maze')
+ self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
+ self._build_maze()
+
+ def _build_maze(self):
+ self.canvas = tk.Canvas(self, bg='white',
+ height=MAZE_H * UNIT,
+ width=MAZE_W * UNIT)
+
+ # create grids
+ for c in range(0, MAZE_W * UNIT, UNIT):
+ x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
+ self.canvas.create_line(x0, y0, x1, y1)
+ for r in range(0, MAZE_H * UNIT, UNIT):
+ x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r
+ self.canvas.create_line(x0, y0, x1, y1)
+
+ # create origin
+ origin = np.array([20, 20])
+
+ # hell
+ hell1_center = origin + np.array([UNIT * 2, UNIT])
+ self.hell1 = self.canvas.create_rectangle(
+ hell1_center[0] - 15, hell1_center[1] - 15,
+ hell1_center[0] + 15, hell1_center[1] + 15,
+ fill='black')
+ # hell
+ hell2_center = origin + np.array([UNIT, UNIT * 2])
+ self.hell2 = self.canvas.create_rectangle(
+ hell2_center[0] - 15, hell2_center[1] - 15,
+ hell2_center[0] + 15, hell2_center[1] + 15,
+ fill='black')
+
+ # create oval
+ oval_center = origin + UNIT * 2
+ self.oval = self.canvas.create_oval(
+ oval_center[0] - 15, oval_center[1] - 15,
+ oval_center[0] + 15, oval_center[1] + 15,
+ fill='yellow')
+
+ # create red rect
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+
+ # pack all
+ self.canvas.pack()
+
+ def reset(self):
+ self.update()
+ time.sleep(0.5)
+ self.canvas.delete(self.rect)
+ origin = np.array([20, 20])
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+ # return observation
+ return self.canvas.coords(self.rect)
+
+ def step(self, action):
+ s = self.canvas.coords(self.rect)
+ base_action = np.array([0, 0])
+ if action == 0: # up
+ if s[1] > UNIT:
+ base_action[1] -= UNIT
+ elif action == 1: # down
+ if s[1] < (MAZE_H - 1) * UNIT:
+ base_action[1] += UNIT
+ elif action == 2: # right
+ if s[0] < (MAZE_W - 1) * UNIT:
+ base_action[0] += UNIT
+ elif action == 3: # left
+ if s[0] > UNIT:
+ base_action[0] -= UNIT
+
+ self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
+
+ s_ = self.canvas.coords(self.rect) # next state
+
+ # reward function
+ if s_ == self.canvas.coords(self.oval):
+ reward = 1
+ done = True
+ elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
+ reward = -1
+ done = True
+ else:
+ reward = 0
+ done = False
+
+ return s_, reward, done
+
+ def render(self):
+ time.sleep(0.1)
+ self.update()
+
+
diff --git a/contents/2_Q_Learning_maze/run_this.py b/contents/2_Q_Learning_maze/run_this.py
new file mode 100644
index 0000000..f817d1e
--- /dev/null
+++ b/contents/2_Q_Learning_maze/run_this.py
@@ -0,0 +1,53 @@
+"""
+Reinforcement learning maze example.
+
+Red rectangle: explorer.
+Black rectangles: hells [reward = -1].
+Yellow circle: paradise [reward = +1].
+All other states: ground [reward = 0].
+
+This script is the main part which controls the update method of this example.
+The RL is in RL_brain.py.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+from maze_env import Maze
+from RL_brain import QLearningTable
+
+
+def update():
+ for episode in range(100):
+ # initial observation
+ observation = env.reset()
+
+ while True:
+ # fresh env
+ env.render()
+
+ # RL choose action based on observation
+ action = RL.choose_action(str(observation))
+
+ # RL take action and get next observation and reward
+ observation_, reward, done = env.step(action)
+
+ # RL learn from this transition
+ RL.learn(str(observation), action, reward, str(observation_))
+
+ # swap observation
+ observation = observation_
+
+ # break while loop when end of this episode
+ if done:
+ break
+
+ # end of game
+ print('game over')
+ env.destroy()
+
+if __name__ == "__main__":
+ env = Maze()
+ RL = QLearningTable(actions=list(range(env.n_actions)))
+
+ env.after(100, update)
+ env.mainloop()
\ No newline at end of file
diff --git a/contents/3_Sarsa_maze/RL_brain.py b/contents/3_Sarsa_maze/RL_brain.py
new file mode 100644
index 0000000..3b8b5da
--- /dev/null
+++ b/contents/3_Sarsa_maze/RL_brain.py
@@ -0,0 +1,77 @@
+"""
+This part of code is the Q learning brain, which is a brain of the agent.
+All decisions are made in here.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+import numpy as np
+import pandas as pd
+
+
+class RL(object):
+ def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
+ self.actions = action_space # a list
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon = e_greedy
+
+ self.q_table = pd.DataFrame(columns=self.actions)
+
+ def check_state_exist(self, state):
+ if state not in self.q_table.index:
+ # append new state to q table
+ self.q_table = self.q_table.append(
+ pd.Series(
+ [0]*len(self.actions),
+ index=self.q_table.columns,
+ name=state,
+ )
+ )
+
+ def choose_action(self, observation):
+ self.check_state_exist(observation)
+ # action selection
+ if np.random.rand() < self.epsilon:
+ # choose best action
+ state_action = self.q_table.ix[observation, :]
+            state_action = state_action.reindex(np.random.permutation(state_action.index))     # shuffle to break ties between actions with the same value
+ action = state_action.argmax()
+ else:
+ # choose random action
+ action = np.random.choice(self.actions)
+ return action
+
+ def learn(self, *args):
+ pass
+
+
+# off-policy
+class QLearningTable(RL):
+ def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
+ super(QLearningTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
+
+ def learn(self, s, a, r, s_):
+ self.check_state_exist(s_)
+ q_predict = self.q_table.ix[s, a]
+ if s_ != 'terminal':
+ q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal
+ else:
+ q_target = r # next state is terminal
+ self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update
+
+
+# on-policy
+class SarsaTable(RL):
+
+ def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
+ super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
+
+ def learn(self, s, a, r, s_, a_):
+ self.check_state_exist(s_)
+ q_predict = self.q_table.ix[s, a]
+ if s_ != 'terminal':
+ q_target = r + self.gamma * self.q_table.ix[s_, a_] # next state is not terminal
+ else:
+ q_target = r # next state is terminal
+ self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update
diff --git a/contents/3_Sarsa_maze/maze_env.py b/contents/3_Sarsa_maze/maze_env.py
new file mode 100644
index 0000000..fc31521
--- /dev/null
+++ b/contents/3_Sarsa_maze/maze_env.py
@@ -0,0 +1,130 @@
+"""
+Reinforcement learning maze example.
+
+Red rectangle: explorer.
+Black rectangles: hells [reward = -1].
+Yellow circle: paradise [reward = +1].
+All other states: ground [reward = 0].
+
+This script is the environment part of this example.
+The RL is in RL_brain.py.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+
+import numpy as np
+np.random.seed(1)
+import tkinter as tk
+import time
+
+
+UNIT = 40 # pixels
+MAZE_H = 4 # grid height
+MAZE_W = 4 # grid width
+
+
+class Maze(tk.Tk):
+ def __init__(self):
+ super(Maze, self).__init__()
+ self.action_space = ['u', 'd', 'l', 'r']
+ self.n_actions = len(self.action_space)
+ self.title('maze')
+ self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
+ self._build_maze()
+
+ def _build_maze(self):
+ self.canvas = tk.Canvas(self, bg='white',
+ height=MAZE_H * UNIT,
+ width=MAZE_W * UNIT)
+
+ # create grids
+ for c in range(0, MAZE_W * UNIT, UNIT):
+ x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
+ self.canvas.create_line(x0, y0, x1, y1)
+ for r in range(0, MAZE_H * UNIT, UNIT):
+ x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r
+ self.canvas.create_line(x0, y0, x1, y1)
+
+ # create origin
+ origin = np.array([20, 20])
+
+ # hell
+ hell1_center = origin + np.array([UNIT * 2, UNIT])
+ self.hell1 = self.canvas.create_rectangle(
+ hell1_center[0] - 15, hell1_center[1] - 15,
+ hell1_center[0] + 15, hell1_center[1] + 15,
+ fill='black')
+ # hell
+ hell2_center = origin + np.array([UNIT, UNIT * 2])
+ self.hell2 = self.canvas.create_rectangle(
+ hell2_center[0] - 15, hell2_center[1] - 15,
+ hell2_center[0] + 15, hell2_center[1] + 15,
+ fill='black')
+
+ # create oval
+ oval_center = origin + UNIT * 2
+ self.oval = self.canvas.create_oval(
+ oval_center[0] - 15, oval_center[1] - 15,
+ oval_center[0] + 15, oval_center[1] + 15,
+ fill='yellow')
+
+ # create red rect
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+
+ # pack all
+ self.canvas.pack()
+
+ def reset(self):
+ self.update()
+ time.sleep(0.5)
+ self.canvas.delete(self.rect)
+ origin = np.array([20, 20])
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+ # return observation
+ return self.canvas.coords(self.rect)
+
+ def step(self, action):
+ s = self.canvas.coords(self.rect)
+ base_action = np.array([0, 0])
+ if action == 0: # up
+ if s[1] > UNIT:
+ base_action[1] -= UNIT
+ elif action == 1: # down
+ if s[1] < (MAZE_H - 1) * UNIT:
+ base_action[1] += UNIT
+ elif action == 2: # right
+ if s[0] < (MAZE_W - 1) * UNIT:
+ base_action[0] += UNIT
+ elif action == 3: # left
+ if s[0] > UNIT:
+ base_action[0] -= UNIT
+
+ self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
+
+ s_ = self.canvas.coords(self.rect) # next state
+
+ # reward function
+ if s_ == self.canvas.coords(self.oval):
+ reward = 1
+ done = True
+ elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
+ reward = -1
+ done = True
+ else:
+ reward = 0
+ done = False
+
+ return s_, reward, done
+
+ def render(self):
+ time.sleep(0.1)
+ self.update()
+
+
diff --git a/contents/3_Sarsa_maze/run_this.py b/contents/3_Sarsa_maze/run_this.py
new file mode 100644
index 0000000..fc2bd1a
--- /dev/null
+++ b/contents/3_Sarsa_maze/run_this.py
@@ -0,0 +1,52 @@
+"""
+Sarsa is an on-policy updating method for reinforcement learning.
+
+Unlike Q-learning, which is an off-policy method, Sarsa updates using the action actually taken in the current trajectory.
+
+You will see that Sarsa is more cautious when punishment is nearby, because it accounts for every behaviour it might take,
+while Q-learning is bolder because it only cares about the maximum-value behaviour.
+"""
+
+from maze_env import Maze
+from RL_brain import SarsaTable
+
+
+def update():
+ for episode in range(100):
+ # initial observation
+ observation = env.reset()
+
+ # RL choose action based on observation
+ action = RL.choose_action(str(observation))
+
+ while True:
+ # fresh env
+ env.render()
+
+ # RL take action and get next observation and reward
+ observation_, reward, done = env.step(action)
+
+ # RL choose action based on next observation
+ action_ = RL.choose_action(str(observation_))
+
+ # RL learn from this transition (s, a, r, s, a) ==> Sarsa
+ RL.learn(str(observation), action, reward, str(observation_), action_)
+
+ # swap observation and action
+ observation = observation_
+ action = action_
+
+ # break while loop when end of this episode
+ if done:
+ break
+
+ # end of game
+ print('game over')
+ env.destroy()
+
+if __name__ == "__main__":
+ env = Maze()
+ RL = SarsaTable(actions=list(range(env.n_actions)))
+
+ env.after(100, update)
+ env.mainloop()
\ No newline at end of file
diff --git a/contents/4_Sarsa_lambda_maze/RL_brain.py b/contents/4_Sarsa_lambda_maze/RL_brain.py
new file mode 100644
index 0000000..6ad65a9
--- /dev/null
+++ b/contents/4_Sarsa_lambda_maze/RL_brain.py
@@ -0,0 +1,93 @@
+"""
+This part of code is the Q learning brain, which is a brain of the agent.
+All decisions are made in here.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+import numpy as np
+import pandas as pd
+
+
+class RL(object):
+ def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
+ self.actions = action_space # a list
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon = e_greedy
+
+ self.q_table = pd.DataFrame(columns=self.actions)
+
+ def check_state_exist(self, state):
+ if state not in self.q_table.index:
+ # append new state to q table
+ self.q_table = self.q_table.append(
+ pd.Series(
+ [0]*len(self.actions),
+ index=self.q_table.columns,
+ name=state,
+ )
+ )
+
+ def choose_action(self, observation):
+ self.check_state_exist(observation)
+ # action selection
+ if np.random.rand() < self.epsilon:
+ # choose best action
+ state_action = self.q_table.ix[observation, :]
+            state_action = state_action.reindex(np.random.permutation(state_action.index))     # shuffle to break ties between actions with the same value
+ action = state_action.argmax()
+ else:
+ # choose random action
+ action = np.random.choice(self.actions)
+ return action
+
+ def learn(self, *args):
+ pass
+
+
+# backward eligibility traces
+class SarsaLambdaTable(RL):
+ def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9):
+ super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
+
+ # backward view, eligibility trace.
+ self.lambda_ = trace_decay
+ self.eligibility_trace = self.q_table.copy()
+
+ def check_state_exist(self, state):
+ if state not in self.q_table.index:
+ # append new state to q table
+ to_be_append = pd.Series(
+ [0] * len(self.actions),
+ index=self.q_table.columns,
+ name=state,
+ )
+ self.q_table = self.q_table.append(to_be_append)
+
+ # also update eligibility trace
+ self.eligibility_trace = self.eligibility_trace.append(to_be_append)
+
+ def learn(self, s, a, r, s_, a_):
+ self.check_state_exist(s_)
+ q_predict = self.q_table.ix[s, a]
+ if s_ != 'terminal':
+ q_target = r + self.gamma * self.q_table.ix[s_, a_] # next state is not terminal
+ else:
+ q_target = r # next state is terminal
+ error = q_target - q_predict
+
+ # increase trace amount for visited state-action pair
+
+        # Method 1: accumulating trace
+        # self.eligibility_trace.ix[s, a] += 1
+
+        # Method 2: replacing trace (clear the row, then mark the visited state-action pair)
+        self.eligibility_trace.ix[s, :] *= 0
+        self.eligibility_trace.ix[s, a] = 1
+
+ # Q update
+ self.q_table += self.lr * error * self.eligibility_trace
+
+ # decay eligibility trace after update
+ self.eligibility_trace *= self.gamma*self.lambda_
diff --git a/contents/4_Sarsa_lambda_maze/maze_env.py b/contents/4_Sarsa_lambda_maze/maze_env.py
new file mode 100644
index 0000000..9fe6acb
--- /dev/null
+++ b/contents/4_Sarsa_lambda_maze/maze_env.py
@@ -0,0 +1,130 @@
+"""
+Reinforcement learning maze example.
+
+Red rectangle: explorer.
+Black rectangles: hells [reward = -1].
+Yellow circle: paradise [reward = +1].
+All other states: ground [reward = 0].
+
+This script is the environment part of this example.
+The RL is in RL_brain.py.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+
+import numpy as np
+np.random.seed(1)
+import tkinter as tk
+import time
+
+
+UNIT = 40 # pixels
+MAZE_H = 4 # grid height
+MAZE_W = 4 # grid width
+
+
+class Maze(tk.Tk):
+ def __init__(self):
+ super(Maze, self).__init__()
+ self.action_space = ['u', 'd', 'l', 'r']
+ self.n_actions = len(self.action_space)
+ self.title('maze')
+ self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
+ self._build_maze()
+
+ def _build_maze(self):
+ self.canvas = tk.Canvas(self, bg='white',
+ height=MAZE_H * UNIT,
+ width=MAZE_W * UNIT)
+
+ # create grids
+ for c in range(0, MAZE_W * UNIT, UNIT):
+ x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
+ self.canvas.create_line(x0, y0, x1, y1)
+ for r in range(0, MAZE_H * UNIT, UNIT):
+ x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r
+ self.canvas.create_line(x0, y0, x1, y1)
+
+ # create origin
+ origin = np.array([20, 20])
+
+ # hell
+ hell1_center = origin + np.array([UNIT * 2, UNIT])
+ self.hell1 = self.canvas.create_rectangle(
+ hell1_center[0] - 15, hell1_center[1] - 15,
+ hell1_center[0] + 15, hell1_center[1] + 15,
+ fill='black')
+ # hell
+ hell2_center = origin + np.array([UNIT, UNIT * 2])
+ self.hell2 = self.canvas.create_rectangle(
+ hell2_center[0] - 15, hell2_center[1] - 15,
+ hell2_center[0] + 15, hell2_center[1] + 15,
+ fill='black')
+
+ # create oval
+ oval_center = origin + UNIT * 2
+ self.oval = self.canvas.create_oval(
+ oval_center[0] - 15, oval_center[1] - 15,
+ oval_center[0] + 15, oval_center[1] + 15,
+ fill='yellow')
+
+ # create red rect
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+
+ # pack all
+ self.canvas.pack()
+
+ def reset(self):
+ self.update()
+ time.sleep(0.5)
+ self.canvas.delete(self.rect)
+ origin = np.array([20, 20])
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+ # return observation
+ return self.canvas.coords(self.rect)
+
+ def step(self, action):
+ s = self.canvas.coords(self.rect)
+ base_action = np.array([0, 0])
+ if action == 0: # up
+ if s[1] > UNIT:
+ base_action[1] -= UNIT
+ elif action == 1: # down
+ if s[1] < (MAZE_H - 1) * UNIT:
+ base_action[1] += UNIT
+ elif action == 2: # right
+ if s[0] < (MAZE_W - 1) * UNIT:
+ base_action[0] += UNIT
+ elif action == 3: # left
+ if s[0] > UNIT:
+ base_action[0] -= UNIT
+
+ self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
+
+ s_ = self.canvas.coords(self.rect) # next state
+
+ # reward function
+ if s_ == self.canvas.coords(self.oval):
+ reward = 1
+ done = True
+ elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
+ reward = -1
+ done = True
+ else:
+ reward = 0
+ done = False
+
+ return s_, reward, done
+
+ def render(self):
+ time.sleep(0.05)
+ self.update()
+
+
diff --git a/contents/4_Sarsa_lambda_maze/run_this.py b/contents/4_Sarsa_lambda_maze/run_this.py
new file mode 100644
index 0000000..a0c5afc
--- /dev/null
+++ b/contents/4_Sarsa_lambda_maze/run_this.py
@@ -0,0 +1,52 @@
+"""
+Sarsa(lambda) is an on-policy updating method for reinforcement learning; this example uses the backward-view eligibility trace.
+
+Unlike Q-learning, which is an off-policy method, Sarsa updates using the action actually taken in the current trajectory.
+
+You will see that Sarsa is more cautious when punishment is nearby, because it accounts for every behaviour it might take,
+while Q-learning is bolder because it only cares about the maximum-value behaviour.
+"""
+
+from maze_env import Maze
+from RL_brain import SarsaLambdaTable
+
+
+def update():
+ for episode in range(100):
+ # initial observation
+ observation = env.reset()
+
+ # RL choose action based on observation
+ action = RL.choose_action(str(observation))
+
+ while True:
+ # fresh env
+ env.render()
+
+ # RL take action and get next observation and reward
+ observation_, reward, done = env.step(action)
+
+ # RL choose action based on next observation
+ action_ = RL.choose_action(str(observation_))
+
+ # RL learn from this transition (s, a, r, s, a) ==> Sarsa
+ RL.learn(str(observation), action, reward, str(observation_), action_)
+
+ # swap observation and action
+ observation = observation_
+ action = action_
+
+ # break while loop when end of this episode
+ if done:
+ break
+
+ # end of game
+ print('game over')
+ env.destroy()
+
+if __name__ == "__main__":
+ env = Maze()
+ RL = SarsaLambdaTable(actions=list(range(env.n_actions)))
+
+ env.after(100, update)
+ env.mainloop()
\ No newline at end of file
diff --git a/contents/5.1_Double_DQN/RL_brain.py b/contents/5.1_Double_DQN/RL_brain.py
new file mode 100644
index 0000000..15053eb
--- /dev/null
+++ b/contents/5.1_Double_DQN/RL_brain.py
@@ -0,0 +1,163 @@
+"""
+The double DQN based on this paper: https://arxiv.org/abs/1509.06461
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+import numpy as np
+import tensorflow as tf
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+class DoubleDQN:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.005,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=200,
+ memory_size=3000,
+ batch_size=32,
+ e_greedy_increment=None,
+ output_graph=False,
+ double_q=True,
+ sess=None,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon_max = e_greedy
+ self.replace_target_iter = replace_target_iter
+ self.memory_size = memory_size
+ self.batch_size = batch_size
+ self.epsilon_increment = e_greedy_increment
+ self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
+
+ self.double_q = double_q    # whether to use the Double DQN target
+
+ self.learn_step_counter = 0
+ self.memory = np.zeros((self.memory_size, n_features*2+2))
+ self._build_net()
+ if sess is None:
+ self.sess = tf.Session()
+ self.sess.run(tf.global_variables_initializer())
+ else:
+ self.sess = sess
+ if output_graph:
+ tf.summary.FileWriter("logs/", self.sess.graph)
+ self.cost_his = []
+
+ def _build_net(self):
+ def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
+
+ with tf.variable_scope('l2'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ out = tf.matmul(l1, w2) + b2
+ return out
+ # ------------------ build evaluate_net ------------------
+ self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
+ self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
+
+ with tf.variable_scope('eval_net'):
+ c_names, n_l1, w_initializer, b_initializer = \
+ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
+ tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
+
+ self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
+
+ with tf.variable_scope('loss'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
+ with tf.variable_scope('train'):
+ self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ # ------------------ build target_net ------------------
+ self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
+ with tf.variable_scope('target_net'):
+ c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+
+ self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
+
+ def store_transition(self, s, a, r, s_):
+ if not hasattr(self, 'memory_counter'):
+ self.memory_counter = 0
+ transition = np.hstack((s, [a, r], s_))
+ index = self.memory_counter % self.memory_size
+ self.memory[index, :] = transition
+ self.memory_counter += 1
+
+ def choose_action(self, observation):
+ observation = observation[np.newaxis, :]
+ actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+ action = np.argmax(actions_value)
+
+ if not hasattr(self, 'q'):  # keep a running average of the max Q value, used for the comparison plot
+ self.q = []
+ self.running_q = 0
+ self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value)
+ self.q.append(self.running_q)
+
+ if np.random.uniform() > self.epsilon:  # epsilon-greedy: explore with probability (1 - epsilon)
+ action = np.random.randint(0, self.n_actions)
+ return action
+
+ def _replace_target_params(self):
+ t_params = tf.get_collection('target_net_params')
+ e_params = tf.get_collection('eval_net_params')
+ self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+ def learn(self):
+ if self.learn_step_counter % self.replace_target_iter == 0:
+ self._replace_target_params()
+ print('\ntarget_params_replaced\n')
+
+ if self.memory_counter > self.memory_size:
+ sample_index = np.random.choice(self.memory_size, size=self.batch_size)
+ else:
+ sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
+ batch_memory = self.memory[sample_index, :]
+
+ q_next, q_eval4next = self.sess.run(
+ [self.q_next, self.q_eval],
+ feed_dict={self.s_: batch_memory[:, -self.n_features:], # target net evaluates the next observation
+ self.s: batch_memory[:, -self.n_features:]}) # eval net also gets the next observation, to pick the Double DQN action
+ q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
+
+ q_target = q_eval.copy()
+
+ batch_index = np.arange(self.batch_size, dtype=np.int32)
+ eval_act_index = batch_memory[:, self.n_features].astype(int)
+ reward = batch_memory[:, self.n_features + 1]
+
+ if self.double_q:
+ max_act4next = np.argmax(q_eval4next, axis=1) # the eval net selects the highest-value action in the next state
+ selected_q_next = q_next[batch_index, max_act4next] # Double DQN: the target net then evaluates that selected action
+ else:
+ selected_q_next = np.max(q_next, axis=1) # the natural DQN
+
+ q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
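+ # Illustrative note: written out, the two targets built above are
+ #   Double DQN:  y = r + gamma * Q_target(s_, argmax_a Q_eval(s_, a))
+ #   Natural DQN: y = r + gamma * max_a Q_target(s_, a)
+ # Decoupling action selection (eval net) from action evaluation (target net) is what
+ # reduces the overestimation bias analysed in the paper cited at the top of this file.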
+
+ _, self.cost = self.sess.run([self._train_op, self.loss],
+ feed_dict={self.s: batch_memory[:, :self.n_features],
+ self.q_target: q_target})
+ self.cost_his.append(self.cost)
+
+ self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+ self.learn_step_counter += 1
+
+
+
+
diff --git a/contents/5.1_Double_DQN/run_Pendulum.py b/contents/5.1_Double_DQN/run_Pendulum.py
new file mode 100644
index 0000000..d60a362
--- /dev/null
+++ b/contents/5.1_Double_DQN/run_Pendulum.py
@@ -0,0 +1,77 @@
+"""
+Double DQN & Natural DQN comparison,
+The Pendulum example.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+
+import gym
+from RL_brain import DoubleDQN
+import numpy as np
+import matplotlib.pyplot as plt
+import tensorflow as tf
+
+
+env = gym.make('Pendulum-v0')
+env = env.unwrapped
+env.seed(1)
+MEMORY_SIZE = 3000
+ACTION_SPACE = 11
+
+sess = tf.Session()
+with tf.variable_scope('Natural_DQN'):
+ natural_DQN = DoubleDQN(
+ n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
+ e_greedy_increment=0.001, double_q=False, sess=sess
+ )
+
+with tf.variable_scope('Double_DQN'):
+ double_DQN = DoubleDQN(
+ n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
+ e_greedy_increment=0.001, double_q=True, sess=sess, output_graph=True)
+
+sess.run(tf.global_variables_initializer())
+
+
+def train(RL):
+ total_steps = 0
+ observation = env.reset()
+ while True:
+ # if total_steps - MEMORY_SIZE > 8000: env.render()
+
+ action = RL.choose_action(observation)
+
+ f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # convert to [-2 ~ 2] float actions
+ observation_, reward, done, info = env.step(np.array([f_action]))
+
+ reward /= 10 # scale rewards down (they are always <= 0); r = 0 when the pendulum is upright
+ # the Q target at the upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0,
+ # so whenever the learned Q at this state is greater than 0 the agent is overestimating Q; see the final plot.
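+ # Worked example of the discretisation above (ACTION_SPACE = 11), added for clarity:
+ #   action 0  -> (0 - 5) / 2.5 = -2.0   (full torque one way)
+ #   action 5  -> (5 - 5) / 2.5 =  0.0   (no torque)
+ #   action 10 -> (10 - 5) / 2.5 = +2.0  (full torque the other way)
+ # Since every per-step reward is <= 0, the true discounted return is <= 0 as well,
+ # so any learned Q value above 0 is direct evidence of overestimation.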
+
+ RL.store_transition(observation, action, reward, observation_)
+
+ if total_steps > MEMORY_SIZE: # learning
+ RL.learn()
+
+ if total_steps - MEMORY_SIZE > 20000: # stop game
+ break
+
+ observation = observation_
+ total_steps += 1
+ return RL.q
+
+q_natural = train(natural_DQN)
+q_double = train(double_DQN)
+
+plt.plot(np.array(q_natural), c='r', label='natural')
+plt.plot(np.array(q_double), c='b', label='double')
+plt.legend(loc='best')
+plt.ylabel('Q eval')
+plt.xlabel('training steps')
+plt.grid()
+plt.show()
diff --git a/contents/5.2_Prioritized_Replay_DQN/RL_brain.py b/contents/5.2_Prioritized_Replay_DQN/RL_brain.py
new file mode 100644
index 0000000..27d0e50
--- /dev/null
+++ b/contents/5.2_Prioritized_Replay_DQN/RL_brain.py
@@ -0,0 +1,300 @@
+"""
+The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+import numpy as np
+import tensorflow as tf
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+class SumTree(object):
+ """
+ This SumTree code is a modified version of the original from:
+ https://github.com/jaara/AI-blog/blob/master/SumTree.py
+
+ It stores the transitions together with their priorities in a tree and a data array.
+ """
+ data_pointer = 0
+
+ def __init__(self, capacity):
+ self.capacity = capacity # for all priority values
+ self.tree = np.zeros(2*capacity - 1)
+ # [--------------Parent nodes-------------][-------leaves to record priority-------]
+ # size: capacity - 1 size: capacity
+ self.data = np.zeros(capacity, dtype=object) # for all transitions
+ # [--------------data frame-------------]
+ # size: capacity
+
+ def add_new_priority(self, p, data):
+ leaf_idx = self.data_pointer + self.capacity - 1
+
+ self.data[self.data_pointer] = data # update data_frame
+ self.update(leaf_idx, p) # update tree_frame
+
+ self.data_pointer += 1
+ if self.data_pointer >= self.capacity:  # overwrite the oldest entry once capacity is exceeded
+ self.data_pointer = 0
+
+ def update(self, tree_idx, p):
+ change = p - self.tree[tree_idx]
+
+ self.tree[tree_idx] = p
+ self._propagate_change(tree_idx, change)
+
+ def _propagate_change(self, tree_idx, change):
+ """change the sum of priority value in all parent nodes"""
+ parent_idx = (tree_idx - 1) // 2
+ self.tree[parent_idx] += change
+ if parent_idx != 0:
+ self._propagate_change(parent_idx, change)
+
+ def get_leaf(self, lower_bound):
+ leaf_idx = self._retrieve(lower_bound)  # find the leaf whose cumulative-priority segment contains lower_bound
+ data_idx = leaf_idx - self.capacity + 1
+ return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
+
+ def _retrieve(self, lower_bound, parent_idx=0):
+ """
+ Tree structure and array storage:
+
+ Tree index:
+ 0 -> storing priority sum
+ / \
+ 1 2
+ / \ / \
+ 3 4 5 6 -> storing priority for transitions
+
+ Array type for storing:
+ [0,1,2,3,4,5,6]
+ """
+ left_child_idx = 2 * parent_idx + 1
+ right_child_idx = left_child_idx + 1
+
+ if left_child_idx >= len(self.tree): # end search when no more child
+ return parent_idx
+
+ if self.tree[left_child_idx] == self.tree[right_child_idx]:
+ return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
+ if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
+ return self._retrieve(lower_bound, left_child_idx)
+ else:
+ return self._retrieve(lower_bound-self.tree[left_child_idx], right_child_idx)
+
+ @property
+ def root_priority(self):
+ return self.tree[0] # the root
+
+
+class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
+ """
+ This Memory class (built on the SumTree) is a modified version of the original from:
+ https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
+ """
+ epsilon = 0.01 # small amount to avoid zero priority
+ alpha = 0.6 # [0~1] convert the importance of TD error to priority
+ beta = 0.4 # importance-sampling, from initial value increasing to 1
+ beta_increment_per_sampling = 0.001
+ abs_err_upper = 1 # clipped abs error
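+ # For reference, these constants implement the prioritisation from the paper cited above:
+ #   priority       p_i  = (|TD error_i| + epsilon) ** alpha     (see _get_priority)
+ #   sample prob.   P(i) = p_i / sum_k p_k                       (realised via SumTree segments)
+ #   IS weight      w_i  = (N * P(i)) ** -beta, normalised by the largest weight in sample() (N = tree capacity)
+ # beta is annealed from 0.4 towards 1, so the importance-sampling correction strengthens over training.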
+
+ def __init__(self, capacity):
+ self.tree = SumTree(capacity)
+
+ def store(self, error, transition):
+ p = self._get_priority(error)
+ self.tree.add_new_priority(p, transition)
+
+ def sample(self, n):
+ batch_idx, batch_memory, ISWeights = [], [], []
+ segment = self.tree.root_priority / n
+ self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
+
+ min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
+ maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
+ for i in range(n):
+ a = segment * i
+ b = segment * (i + 1)
+ lower_bound = np.random.uniform(a, b)
+ idx, p, data = self.tree.get_leaf(lower_bound)
+ prob = p / self.tree.root_priority
+ ISWeights.append(self.tree.capacity * prob)
+ batch_idx.append(idx)
+ batch_memory.append(data)
+
+ ISWeights = np.vstack(ISWeights)
+ ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
+ return batch_idx, np.vstack(batch_memory), ISWeights
+
+ def update(self, idx, error):
+ p = self._get_priority(error)
+ self.tree.update(idx, p)
+
+ def _get_priority(self, error):
+ error += self.epsilon # avoid 0
+ clipped_error = np.clip(error, 0, self.abs_err_upper)
+ return np.power(clipped_error, self.alpha)
+
+
+class DQNPrioritizedReplay:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.005,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=500,
+ memory_size=10000,
+ batch_size=32,
+ e_greedy_increment=None,
+ output_graph=False,
+ prioritized=True,
+ sess=None,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon_max = e_greedy
+ self.replace_target_iter = replace_target_iter
+ self.memory_size = memory_size
+ self.batch_size = batch_size
+ self.epsilon_increment = e_greedy_increment
+ self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
+
+ self.prioritized = prioritized    # whether to use prioritized experience replay
+
+ self.learn_step_counter = 0
+
+ self._build_net()
+
+ if self.prioritized:
+ self.memory = Memory(capacity=memory_size)
+ else:
+ self.memory = np.zeros((self.memory_size, n_features*2+2))
+
+ if sess is None:
+ self.sess = tf.Session()
+ self.sess.run(tf.global_variables_initializer())
+ else:
+ self.sess = sess
+
+ if output_graph:
+ tf.summary.FileWriter("logs/", self.sess.graph)
+
+ self.cost_his = []
+
+ def _build_net(self):
+ def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
+
+ with tf.variable_scope('l2'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ out = tf.matmul(l1, w2) + b2
+ return out
+
+ # ------------------ build evaluate_net ------------------
+ self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
+ self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
+ if self.prioritized:
+ self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
+ with tf.variable_scope('eval_net'):
+ c_names, n_l1, w_initializer, b_initializer = \
+ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
+ tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
+
+ self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
+
+ with tf.variable_scope('loss'):
+ if self.prioritized:
+ self.abs_errors = tf.reduce_sum(tf.abs(self.q_target - self.q_eval), axis=1) # for updating Sumtree
+ self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
+ else:
+ self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
+ with tf.variable_scope('train'):
+ self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ # ------------------ build target_net ------------------
+ self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
+ with tf.variable_scope('target_net'):
+ c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+ self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
+
+ def store_transition(self, s, a, r, s_):
+ if self.prioritized: # prioritized replay
+ transition = np.hstack((s, [a, r], s_))
+ max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])  # give the new transition the current max priority so it is replayed at least once
+ self.memory.store(max_p, transition)
+ else: # random replay
+ if not hasattr(self, 'memory_counter'):
+ self.memory_counter = 0
+ transition = np.hstack((s, [a, r], s_))
+ index = self.memory_counter % self.memory_size
+ self.memory[index, :] = transition
+ self.memory_counter += 1
+
+ def choose_action(self, observation):
+ observation = observation[np.newaxis, :]
+ if np.random.uniform() < self.epsilon:
+ actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+ action = np.argmax(actions_value)
+ else:
+ action = np.random.randint(0, self.n_actions)
+ return action
+
+ def _replace_target_params(self):
+ t_params = tf.get_collection('target_net_params')
+ e_params = tf.get_collection('eval_net_params')
+ self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+ def learn(self):
+ if self.learn_step_counter % self.replace_target_iter == 0:
+ self._replace_target_params()
+ print('\ntarget_params_replaced\n')
+
+ if self.prioritized:
+ tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
+ else:
+ sample_index = np.random.choice(self.memory_size, size=self.batch_size)
+ batch_memory = self.memory[sample_index, :]
+
+ q_next, q_eval = self.sess.run(
+ [self.q_next, self.q_eval],
+ feed_dict={self.s_: batch_memory[:, -self.n_features:],
+ self.s: batch_memory[:, :self.n_features]})
+
+ q_target = q_eval.copy()
+ batch_index = np.arange(self.batch_size, dtype=np.int32)
+ eval_act_index = batch_memory[:, self.n_features].astype(int)
+ reward = batch_memory[:, self.n_features + 1]
+
+ q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
+ if self.prioritized:
+ _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
+ feed_dict={self.s: batch_memory[:, :self.n_features],
+ self.q_target: q_target,
+ self.ISWeights: ISWeights})
+ for i in range(len(tree_idx)): # update priority
+ idx = tree_idx[i]
+ self.memory.update(idx, abs_errors[i])
+ else:
+ _, self.cost = self.sess.run([self._train_op, self.loss],
+ feed_dict={self.s: batch_memory[:, :self.n_features],
+ self.q_target: q_target})
+
+ self.cost_his.append(self.cost)
+
+ self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+ self.learn_step_counter += 1
diff --git a/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py b/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py
new file mode 100644
index 0000000..08c2562
--- /dev/null
+++ b/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py
@@ -0,0 +1,80 @@
+"""
+The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+
+import gym
+from RL_brain import DQNPrioritizedReplay
+import matplotlib.pyplot as plt
+import tensorflow as tf
+import numpy as np
+
+env = gym.make('MountainCar-v0')
+env = env.unwrapped
+env.seed(21)
+MEMORY_SIZE = 10000
+
+sess = tf.Session()
+with tf.variable_scope('natural_DQN'):
+ RL_natural = DQNPrioritizedReplay(
+ n_actions=3, n_features=2, memory_size=MEMORY_SIZE,
+ e_greedy_increment=0.00005, sess=sess, prioritized=False,
+ )
+
+with tf.variable_scope('DQN_with_prioritized_replay'):
+ RL_prio = DQNPrioritizedReplay(
+ n_actions=3, n_features=2, memory_size=MEMORY_SIZE,
+ e_greedy_increment=0.00005, sess=sess, prioritized=True, output_graph=True,
+ )
+sess.run(tf.global_variables_initializer())
+
+
+def train(RL):
+ total_steps = 0
+ steps = []
+ episodes = []
+ for i_episode in range(20):
+ observation = env.reset()
+ while True:
+ # env.render()
+
+ action = RL.choose_action(observation)
+
+ observation_, reward, done, info = env.step(action)
+
+ if done: reward = 10
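+ # Note: with the unwrapped env, done only fires when the car reaches the flag, so this is
+ # the single positive-reward event on top of gym's default -1 per step. It creates the rare,
+ # large-TD-error transitions that prioritized replay is designed to revisit more often.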
+
+ RL.store_transition(observation, action, reward, observation_)
+
+ if total_steps > MEMORY_SIZE:
+ RL.learn()
+
+ if done:
+ print('episode ', i_episode, ' finished')
+ steps.append(total_steps)
+ episodes.append(i_episode)
+ break
+
+ observation = observation_
+ total_steps += 1
+ return np.vstack((episodes, steps))
+
+his_natural = train(RL_natural)
+his_prio = train(RL_prio)
+
+# compare based on first success
+plt.plot(his_natural[0, :], his_natural[1, :] - his_natural[1, 0], c='b', label='natural DQN')
+plt.plot(his_prio[0, :], his_prio[1, :] - his_prio[1, 0], c='r', label='DQN with prioritized replay')
+plt.legend(loc='best')
+plt.ylabel('total training time')
+plt.xlabel('episode')
+plt.grid()
+plt.show()
+
+
diff --git a/contents/5.3_Dueling_DQN/RL_brain.py b/contents/5.3_Dueling_DQN/RL_brain.py
new file mode 100644
index 0000000..fec458f
--- /dev/null
+++ b/contents/5.3_Dueling_DQN/RL_brain.py
@@ -0,0 +1,165 @@
+"""
+The Dueling DQN based on this paper: https://arxiv.org/abs/1511.06581
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+import numpy as np
+import tensorflow as tf
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+class DuelingDQN:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.001,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=200,
+ memory_size=500,
+ batch_size=32,
+ e_greedy_increment=None,
+ output_graph=False,
+ dueling=True,
+ sess=None,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon_max = e_greedy
+ self.replace_target_iter = replace_target_iter
+ self.memory_size = memory_size
+ self.batch_size = batch_size
+ self.epsilon_increment = e_greedy_increment
+ self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
+
+ self.dueling = dueling      # whether to use the dueling network architecture
+
+ self.learn_step_counter = 0
+ self.memory = np.zeros((self.memory_size, n_features*2+2))
+ self._build_net()
+ if sess is None:
+ self.sess = tf.Session()
+ self.sess.run(tf.global_variables_initializer())
+ else:
+ self.sess = sess
+ if output_graph:
+ tf.summary.FileWriter("logs/", self.sess.graph)
+ self.cost_his = []
+
+ def _build_net(self):
+ def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
+
+ if self.dueling:
+ # Dueling DQN
+ with tf.variable_scope('Value'):
+ w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names)
+ self.V = tf.matmul(l1, w2) + b2
+
+ with tf.variable_scope('Advantage'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ self.A = tf.matmul(l1, w2) + b2
+
+ with tf.variable_scope('Q'):
+ out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a)
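+ # Note: subtracting the mean advantage keeps the V/A decomposition identifiable;
+ # written out, Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')), which is the
+ # aggregation module proposed in the dueling paper linked at the top of this file.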
+ else:
+ with tf.variable_scope('Q'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ out = tf.matmul(l1, w2) + b2
+
+ return out
+
+ # ------------------ build evaluate_net ------------------
+ self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
+ self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
+ with tf.variable_scope('eval_net'):
+ c_names, n_l1, w_initializer, b_initializer = \
+ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
+ tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
+
+ self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
+
+ with tf.variable_scope('loss'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
+ with tf.variable_scope('train'):
+ self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ # ------------------ build target_net ------------------
+ self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
+ with tf.variable_scope('target_net'):
+ c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+
+ self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
+
+ def store_transition(self, s, a, r, s_):
+ if not hasattr(self, 'memory_counter'):
+ self.memory_counter = 0
+ transition = np.hstack((s, [a, r], s_))
+ index = self.memory_counter % self.memory_size
+ self.memory[index, :] = transition
+ self.memory_counter += 1
+
+ def choose_action(self, observation):
+ observation = observation[np.newaxis, :]
+ if np.random.uniform() < self.epsilon: # choosing action
+ actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+ action = np.argmax(actions_value)
+ else:
+ action = np.random.randint(0, self.n_actions)
+ return action
+
+ def _replace_target_params(self):
+ t_params = tf.get_collection('target_net_params')
+ e_params = tf.get_collection('eval_net_params')
+ self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+ def learn(self):
+ if self.learn_step_counter % self.replace_target_iter == 0:
+ self._replace_target_params()
+ print('\ntarget_params_replaced\n')
+
+ sample_index = np.random.choice(self.memory_size, size=self.batch_size)
+ batch_memory = self.memory[sample_index, :]
+
+ q_next = self.sess.run(self.q_next, feed_dict={self.s_: batch_memory[:, -self.n_features:]})  # target net on the next observation
+ q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})  # eval net on the current observation
+
+ q_target = q_eval.copy()
+
+ batch_index = np.arange(self.batch_size, dtype=np.int32)
+ eval_act_index = batch_memory[:, self.n_features].astype(int)
+ reward = batch_memory[:, self.n_features + 1]
+
+ q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
+ _, self.cost = self.sess.run([self._train_op, self.loss],
+ feed_dict={self.s: batch_memory[:, :self.n_features],
+ self.q_target: q_target})
+ self.cost_his.append(self.cost)
+
+ self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+ self.learn_step_counter += 1
+
+
+
+
+
diff --git a/contents/5.3_Dueling_DQN/run_Pendulum.py b/contents/5.3_Dueling_DQN/run_Pendulum.py
new file mode 100644
index 0000000..d7b2e70
--- /dev/null
+++ b/contents/5.3_Dueling_DQN/run_Pendulum.py
@@ -0,0 +1,86 @@
+"""
+Dueling DQN & Natural DQN comparison
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+
+import gym
+from RL_brain import DuelingDQN
+import numpy as np
+import matplotlib.pyplot as plt
+import tensorflow as tf
+
+
+env = gym.make('Pendulum-v0')
+env = env.unwrapped
+env.seed(1)
+MEMORY_SIZE = 3000
+ACTION_SPACE = 25
+
+sess = tf.Session()
+with tf.variable_scope('natural'):
+ natural_DQN = DuelingDQN(
+ n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
+ e_greedy_increment=0.001, sess=sess, dueling=False)
+
+with tf.variable_scope('dueling'):
+ dueling_DQN = DuelingDQN(
+ n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
+ e_greedy_increment=0.001, sess=sess, dueling=True, output_graph=True)
+
+sess.run(tf.global_variables_initializer())
+
+
+def train(RL):
+ acc_r = [0]
+ total_steps = 0
+ observation = env.reset()
+ while True:
+ # if total_steps-MEMORY_SIZE > 9000: env.render()
+
+ action = RL.choose_action(observation)
+
+ f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # [-2 ~ 2] float actions
+ observation_, reward, done, info = env.step(np.array([f_action]))
+
+ reward /= 10 # normalize to a range of (-1, 0)
+ acc_r.append(reward + acc_r[-1]) # accumulated reward
+
+ RL.store_transition(observation, action, reward, observation_)
+
+ if total_steps > MEMORY_SIZE:
+ RL.learn()
+
+ if total_steps-MEMORY_SIZE > 15000:
+ break
+
+ observation = observation_
+ total_steps += 1
+ return RL.cost_his, acc_r
+
+c_natural, r_natural = train(natural_DQN)
+c_dueling, r_dueling = train(dueling_DQN)
+
+plt.figure(1)
+plt.plot(np.array(c_natural), c='r', label='natural')
+plt.plot(np.array(c_dueling), c='b', label='dueling')
+plt.legend(loc='best')
+plt.ylabel('cost')
+plt.xlabel('training steps')
+plt.grid()
+
+plt.figure(2)
+plt.plot(np.array(r_natural), c='r', label='natural')
+plt.plot(np.array(r_dueling), c='b', label='dueling')
+plt.legend(loc='best')
+plt.ylabel('accumulated reward')
+plt.xlabel('training steps')
+plt.grid()
+
+plt.show()
+
diff --git a/contents/5_Deep_Q_Network/DQN_modified.py b/contents/5_Deep_Q_Network/DQN_modified.py
new file mode 100644
index 0000000..16712ec
--- /dev/null
+++ b/contents/5_Deep_Q_Network/DQN_modified.py
@@ -0,0 +1,172 @@
+"""
+This part of the code is the DQN brain.
+
+View the TensorBoard graph of this DQN structure at: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.7.3
+"""
+
+import numpy as np
+import tensorflow as tf
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+# Deep Q Network off-policy
+class DeepQNetwork:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.01,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=300,
+ memory_size=500,
+ batch_size=32,
+ e_greedy_increment=None,
+ output_graph=False,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon_max = e_greedy
+ self.replace_target_iter = replace_target_iter
+ self.memory_size = memory_size
+ self.batch_size = batch_size
+ self.epsilon_increment = e_greedy_increment
+ self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
+
+ # total learning step
+ self.learn_step_counter = 0
+
+ # initialize zero memory [s, a, r, s_]
+ self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
+
+ # consist of [target_net, evaluate_net]
+ self._build_net()
+
+ self.sess = tf.Session()
+
+ if output_graph:
+ # $ tensorboard --logdir=logs
+ # tf.train.SummaryWriter will soon be deprecated; use the following instead
+ tf.summary.FileWriter("logs/", self.sess.graph)
+
+ self.sess.run(tf.global_variables_initializer())
+ self.cost_his = []
+
+ def _build_net(self):
+ def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
+
+ with tf.variable_scope('l2'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ out = tf.matmul(l1, w2) + b2
+ return out
+
+ # ------------------ all inputs ------------------------
+ self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input State
+ self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input Next State
+ self.r = tf.placeholder(tf.float32, [None, ], name='r') # input Reward
+ self.a = tf.placeholder(tf.int32, [None, ], name='a') # input Action
+
+ # ------------------ build evaluate_net ------------------
+ with tf.variable_scope('eval_net'):
+ # c_names(collections_names) are the collections to store variables
+ c_names, n_l1, w_initializer, b_initializer = \
+ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
+ tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
+ self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
+
+ # ------------------ build target_net ------------------
+ with tf.variable_scope('target_net'):
+ # c_names(collections_names) are the collections to store variables
+ c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+ self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
+
+ with tf.variable_scope('q_target'):
+ self.q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_') # shape=(None, )
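+ # Note: unlike RL_brain.py in this folder, which assembles q_target in numpy and feeds it
+ # back through a placeholder, this modified version builds the whole target inside the
+ # TensorFlow graph, so learn() only needs a single sess.run call.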
+
+ with tf.variable_scope('q_eval'):
+ a_one_hot = tf.one_hot(self.a, depth=self.n_actions, dtype=tf.float32)
+ self.q_eval_wrt_a = tf.reduce_sum(self.q_eval * a_one_hot, axis=1) # shape=(None, )
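+ # Illustrative example (made-up numbers): if q_eval = [[1., 2., 3.]] and a = [1],
+ # then a_one_hot = [[0., 1., 0.]] and q_eval_wrt_a = [2.] -- only the Q value of the
+ # stored action enters the TD error below.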
+
+ with tf.variable_scope('loss'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))
+ with tf.variable_scope('train'):
+ self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ def store_transition(self, s, a, r, s_):
+ if not hasattr(self, 'memory_counter'):
+ self.memory_counter = 0
+ transition = np.hstack((s, [a, r], s_))
+ # replace the old memory with new memory
+ index = self.memory_counter % self.memory_size
+ self.memory[index, :] = transition
+ self.memory_counter += 1
+
+ def choose_action(self, observation):
+ # to have batch dimension when feed into tf placeholder
+ observation = observation[np.newaxis, :]
+
+ if np.random.uniform() < self.epsilon:
+ # forward feed the observation and get the Q value for every action
+ actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+ action = np.argmax(actions_value)
+ else:
+ action = np.random.randint(0, self.n_actions)
+ return action
+
+ def _replace_target_params(self):
+ t_params = tf.get_collection('target_net_params')
+ e_params = tf.get_collection('eval_net_params')
+ self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+ def learn(self):
+ # check to replace target parameters
+ if self.learn_step_counter % self.replace_target_iter == 0:
+ self._replace_target_params()
+ print('\ntarget_params_replaced\n')
+
+ # sample batch memory from all memory
+ if self.memory_counter > self.memory_size:
+ sample_index = np.random.choice(self.memory_size, size=self.batch_size)
+ else:
+ sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
+ batch_memory = self.memory[sample_index, :]
+
+ _, cost = self.sess.run(
+ [self._train_op, self.loss],
+ feed_dict={
+ self.s: batch_memory[:, :self.n_features],
+ self.a: batch_memory[:, self.n_features],
+ self.r: batch_memory[:, self.n_features + 1],
+ self.s_: batch_memory[:, -self.n_features:],
+ })
+
+ self.cost_his.append(cost)
+
+ # increasing epsilon
+ self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+ self.learn_step_counter += 1
+
+ def plot_cost(self):
+ import matplotlib.pyplot as plt
+ plt.plot(np.arange(len(self.cost_his)), self.cost_his)
+ plt.ylabel('Cost')
+ plt.xlabel('training steps')
+ plt.show()
+
+if __name__ == '__main__':
+ DQN = DeepQNetwork(3,4, output_graph=True)
\ No newline at end of file
diff --git a/contents/5_Deep_Q_Network/RL_brain.py b/contents/5_Deep_Q_Network/RL_brain.py
new file mode 100644
index 0000000..f2d0ef7
--- /dev/null
+++ b/contents/5_Deep_Q_Network/RL_brain.py
@@ -0,0 +1,213 @@
+"""
+This part of the code is the DQN brain of the agent.
+All decisions are made here.
+Tensorflow is used to build the neural network.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.7.3
+"""
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+# Deep Q Network off-policy
+class DeepQNetwork:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.01,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=300,
+ memory_size=500,
+ batch_size=32,
+ e_greedy_increment=None,
+ output_graph=False,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon_max = e_greedy
+ self.replace_target_iter = replace_target_iter
+ self.memory_size = memory_size
+ self.batch_size = batch_size
+ self.epsilon_increment = e_greedy_increment
+ self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
+
+ # total learning step
+ self.learn_step_counter = 0
+
+ # initialize zero memory [s, a, r, s_]
+ self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
+
+ # consist of [target_net, evaluate_net]
+ self._build_net()
+
+ self.sess = tf.Session()
+
+ if output_graph:
+ # $ tensorboard --logdir=logs
+ # tf.train.SummaryWriter will soon be deprecated; use the following instead
+ tf.summary.FileWriter("logs/", self.sess.graph)
+
+ self.sess.run(tf.global_variables_initializer())
+ self.cost_his = []
+
+ def _build_net(self):
+ # ------------------ build evaluate_net ------------------
+ self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
+ self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
+ with tf.variable_scope('eval_net'):
+ # c_names(collections_names) are the collections to store variables
+ c_names, n_l1, w_initializer, b_initializer = \
+ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
+ tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
+
+ # first layer. collections is used later when assign to target net
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)
+
+ # second layer. collections is used later when assign to target net
+ with tf.variable_scope('l2'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ self.q_eval = tf.matmul(l1, w2) + b2
+
+ with tf.variable_scope('loss'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
+ with tf.variable_scope('train'):
+ self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ # ------------------ build target_net ------------------
+ self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
+ with tf.variable_scope('target_net'):
+ # c_names(collections_names) are the collections to store variables
+ c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+
+ # first layer. collections is used later when assign to target net
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)
+
+ # second layer. collections is used later when assign to target net
+ with tf.variable_scope('l2'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ self.q_next = tf.matmul(l1, w2) + b2
+
+ def store_transition(self, s, a, r, s_):
+ if not hasattr(self, 'memory_counter'):
+ self.memory_counter = 0
+
+ transition = np.hstack((s, [a, r], s_))
+
+ # replace the old memory with new memory
+ index = self.memory_counter % self.memory_size
+ self.memory[index, :] = transition
+
+ self.memory_counter += 1
+
+ def choose_action(self, observation):
+ # to have batch dimension when feed into tf placeholder
+ observation = observation[np.newaxis, :]
+
+ if np.random.uniform() < self.epsilon:
+ # forward feed the observation and get the Q value for every action
+ actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+ action = np.argmax(actions_value)
+ else:
+ action = np.random.randint(0, self.n_actions)
+ return action
+
+ def _replace_target_params(self):
+ t_params = tf.get_collection('target_net_params')
+ e_params = tf.get_collection('eval_net_params')
+ self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+ def learn(self):
+ # check to replace target parameters
+ if self.learn_step_counter % self.replace_target_iter == 0:
+ self._replace_target_params()
+ print('\ntarget_params_replaced\n')
+
+ # sample batch memory from all memory
+ if self.memory_counter > self.memory_size:
+ sample_index = np.random.choice(self.memory_size, size=self.batch_size)
+ else:
+ sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
+ batch_memory = self.memory[sample_index, :]
+
+ q_next, q_eval = self.sess.run(
+ [self.q_next, self.q_eval],
+ feed_dict={
+ self.s_: batch_memory[:, -self.n_features:], # fixed params
+ self.s: batch_memory[:, :self.n_features], # newest params
+ })
+
+ # change q_target w.r.t q_eval's action
+ q_target = q_eval.copy()
+
+ batch_index = np.arange(self.batch_size, dtype=np.int32)
+ eval_act_index = batch_memory[:, self.n_features].astype(int)
+ reward = batch_memory[:, self.n_features + 1]
+
+ q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
+ """
+ For example in this batch I have 2 samples and 3 actions:
+ q_eval =
+ [[1, 2, 3],
+ [4, 5, 6]]
+
+ q_target = q_eval =
+ [[1, 2, 3],
+ [4, 5, 6]]
+
+ Then change q_target with the real q_target value w.r.t the q_eval's action.
+ For example in:
+ sample 0, I took action 0, and the max q_target value is -1;
+ sample 1, I took action 2, and the max q_target value is -2:
+ q_target =
+ [[-1, 2, 3],
+ [4, 5, -2]]
+
+ So the (q_target - q_eval) becomes:
+ [[(-1)-(1), 0, 0],
+ [0, 0, (-2)-(6)]]
+
+ We then backpropagate this error w.r.t. the corresponding action through the network,
+ leaving the other actions with error = 0 because we didn't choose them.
+ """
+
+ # train eval network
+ _, self.cost = self.sess.run([self._train_op, self.loss],
+ feed_dict={self.s: batch_memory[:, :self.n_features],
+ self.q_target: q_target})
+ self.cost_his.append(self.cost)
+
+ # increasing epsilon
+ self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+ self.learn_step_counter += 1
+
+ def plot_cost(self):
+ import matplotlib.pyplot as plt
+ plt.plot(np.arange(len(self.cost_his)), self.cost_his)
+ plt.ylabel('Cost')
+ plt.xlabel('training steps')
+ plt.show()
+
+
+
diff --git a/contents/5_Deep_Q_Network/maze_env.py b/contents/5_Deep_Q_Network/maze_env.py
new file mode 100644
index 0000000..5134df0
--- /dev/null
+++ b/contents/5_Deep_Q_Network/maze_env.py
@@ -0,0 +1,130 @@
+"""
+Reinforcement learning maze example.
+
+Red rectangle: explorer.
+Black rectangles: hells [reward = -1].
+Yellow bin circle: paradise [reward = +1].
+All other states: ground [reward = 0].
+
+This script is the environment part of this example.
+The RL is in RL_brain.py.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+"""
+
+
+import numpy as np
+import tkinter as tk
+import time
+
+
+UNIT = 40 # pixels
+MAZE_H = 4 # grid height
+MAZE_W = 4 # grid width
+
+
+class Maze(tk.Tk):
+ def __init__(self):
+ super(Maze, self).__init__()
+ self.action_space = ['u', 'd', 'l', 'r']
+ self.n_actions = len(self.action_space)
+ self.n_features = 2
+ self.title('maze')
+ self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
+ self._build_maze()
+
+ def _build_maze(self):
+ self.canvas = tk.Canvas(self, bg='white',
+ height=MAZE_H * UNIT,
+ width=MAZE_W * UNIT)
+
+ # create grids
+ for c in range(0, MAZE_W * UNIT, UNIT):
+ x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
+ self.canvas.create_line(x0, y0, x1, y1)
+ for r in range(0, MAZE_H * UNIT, UNIT):
+ x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r
+ self.canvas.create_line(x0, y0, x1, y1)
+
+ # create origin
+ origin = np.array([20, 20])
+
+ # hell
+ hell1_center = origin + np.array([UNIT * 2, UNIT])
+ self.hell1 = self.canvas.create_rectangle(
+ hell1_center[0] - 15, hell1_center[1] - 15,
+ hell1_center[0] + 15, hell1_center[1] + 15,
+ fill='black')
+ # hell
+ # hell2_center = origin + np.array([UNIT, UNIT * 2])
+ # self.hell2 = self.canvas.create_rectangle(
+ # hell2_center[0] - 15, hell2_center[1] - 15,
+ # hell2_center[0] + 15, hell2_center[1] + 15,
+ # fill='black')
+
+ # create oval
+ oval_center = origin + UNIT * 2
+ self.oval = self.canvas.create_oval(
+ oval_center[0] - 15, oval_center[1] - 15,
+ oval_center[0] + 15, oval_center[1] + 15,
+ fill='yellow')
+
+ # create red rect
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+
+ # pack all
+ self.canvas.pack()
+
+ def reset(self):
+ self.update()
+ time.sleep(0.1)
+ self.canvas.delete(self.rect)
+ origin = np.array([20, 20])
+ self.rect = self.canvas.create_rectangle(
+ origin[0] - 15, origin[1] - 15,
+ origin[0] + 15, origin[1] + 15,
+ fill='red')
+ # return observation
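+ # the observation is the agent's (x, y) offset from the goal, scaled by the maze size,
+ # giving the DQN a 2-feature continuous input instead of raw canvas coordinates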
+ return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
+
+ def step(self, action):
+ s = self.canvas.coords(self.rect)
+ base_action = np.array([0, 0])
+ if action == 0: # up
+ if s[1] > UNIT:
+ base_action[1] -= UNIT
+ elif action == 1: # down
+ if s[1] < (MAZE_H - 1) * UNIT:
+ base_action[1] += UNIT
+ elif action == 2: # right
+ if s[0] < (MAZE_W - 1) * UNIT:
+ base_action[0] += UNIT
+ elif action == 3: # left
+ if s[0] > UNIT:
+ base_action[0] -= UNIT
+
+ self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
+
+ next_coords = self.canvas.coords(self.rect) # next state
+
+ # reward function
+ if next_coords == self.canvas.coords(self.oval):
+ reward = 1
+ done = True
+ elif next_coords in [self.canvas.coords(self.hell1)]:
+ reward = -1
+ done = True
+ else:
+ reward = 0
+ done = False
+ s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
+ return s_, reward, done
+
+ def render(self):
+ # time.sleep(0.01)
+ self.update()
+
+
diff --git a/contents/5_Deep_Q_Network/run_this.py b/contents/5_Deep_Q_Network/run_this.py
new file mode 100644
index 0000000..cec116b
--- /dev/null
+++ b/contents/5_Deep_Q_Network/run_this.py
@@ -0,0 +1,61 @@
+"""
+Deep Q Network example on the maze environment.
+
+This script runs the maze from maze_env.py with the DeepQNetwork brain in RL_brain.py.
+Transitions are stored in replay memory, and the network starts learning after 200 warm-up steps,
+updating once every 5 steps thereafter.
+"""
+
+from maze_env import Maze
+from RL_brain import DeepQNetwork
+
+
+def run_maze():
+ step = 0
+ for episode in range(300):
+ # initial observation
+ observation = env.reset()
+
+ while True:
+ # fresh env
+ env.render()
+
+ # RL choose action based on observation
+ action = RL.choose_action(observation)
+
+ # RL take action and get next observation and reward
+ observation_, reward, done = env.step(action)
+
+ RL.store_transition(observation, action, reward, observation_)
+
+ if (step > 200) and (step % 5 == 0):
+ RL.learn()
+
+ # swap observation
+ observation = observation_
+
+ # break while loop when end of this episode
+ if done:
+ break
+ step += 1
+
+ # end of game
+ print('game over')
+ env.destroy()
+
+
+if __name__ == "__main__":
+ # maze game
+ env = Maze()
+ RL = DeepQNetwork(env.n_actions, env.n_features,
+ learning_rate=0.01,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=200,
+ memory_size=2000,
+ # output_graph=True
+ )
+ env.after(100, run_maze)
+ env.mainloop()
+ RL.plot_cost()
\ No newline at end of file
diff --git a/contents/6_OpenAI_gym/RL_brain.py b/contents/6_OpenAI_gym/RL_brain.py
new file mode 100644
index 0000000..bc3796c
--- /dev/null
+++ b/contents/6_OpenAI_gym/RL_brain.py
@@ -0,0 +1,213 @@
+"""
+This part of the code is the DQN brain of the agent.
+All decisions are made here.
+Tensorflow is used to build the neural network.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+# Deep Q Network off-policy
+class DeepQNetwork:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.01,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=300,
+ memory_size=500,
+ batch_size=32,
+ e_greedy_increment=None,
+ output_graph=False,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon_max = e_greedy
+ self.replace_target_iter = replace_target_iter
+ self.memory_size = memory_size
+ self.batch_size = batch_size
+ self.epsilon_increment = e_greedy_increment
+ self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
+
+ # total learning step
+ self.learn_step_counter = 0
+
+ # initialize zero memory [s, a, r, s_]
+ self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
+
+ # consist of [target_net, evaluate_net]
+ self._build_net()
+
+ self.sess = tf.Session()
+
+ if output_graph:
+ # $ tensorboard --logdir=logs
+ # tf.train.SummaryWriter will soon be deprecated; use the following instead
+ tf.summary.FileWriter("logs/", self.sess.graph)
+
+ self.sess.run(tf.global_variables_initializer())
+ self.cost_his = []
+
+ def _build_net(self):
+ # ------------------ build evaluate_net ------------------
+ self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
+ self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
+ with tf.variable_scope('eval_net'):
+ # c_names(collections_names) are the collections to store variables
+ c_names, n_l1, w_initializer, b_initializer = \
+ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
+ tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
+
+ # first layer. collections is used later when assign to target net
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)
+
+ # second layer. collections is used later when assign to target net
+ with tf.variable_scope('l2'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ self.q_eval = tf.matmul(l1, w2) + b2
+
+ with tf.variable_scope('loss'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
+ with tf.variable_scope('train'):
+ self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ # ------------------ build target_net ------------------
+ self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
+ with tf.variable_scope('target_net'):
+ # c_names(collections_names) are the collections to store variables
+ c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+
+ # first layer. collections is used later when assign to target net
+ with tf.variable_scope('l1'):
+ w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+ l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)
+
+ # second layer. collections is used later when assign to target net
+ with tf.variable_scope('l2'):
+ w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+ b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ self.q_next = tf.matmul(l1, w2) + b2
+
+ def store_transition(self, s, a, r, s_):
+ if not hasattr(self, 'memory_counter'):
+ self.memory_counter = 0
+
+ transition = np.hstack((s, [a, r], s_))
+
+ # replace the old memory with new memory
+ index = self.memory_counter % self.memory_size
+ self.memory[index, :] = transition
+
+ self.memory_counter += 1
+
+ def choose_action(self, observation):
+ # to have batch dimension when feed into tf placeholder
+ observation = observation[np.newaxis, :]
+
+ if np.random.uniform() < self.epsilon:
+ # forward feed the observation and get the Q value for every action
+ actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+ action = np.argmax(actions_value)
+ else:
+ action = np.random.randint(0, self.n_actions)
+ return action
+
+ def _replace_target_params(self):
+ t_params = tf.get_collection('target_net_params')
+ e_params = tf.get_collection('eval_net_params')
+ self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+ def learn(self):
+ # check to replace target parameters
+ if self.learn_step_counter % self.replace_target_iter == 0:
+ self._replace_target_params()
+ print('\ntarget_params_replaced\n')
+
+ # sample batch memory from all memory
+ if self.memory_counter > self.memory_size:
+ sample_index = np.random.choice(self.memory_size, size=self.batch_size)
+ else:
+ sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
+ batch_memory = self.memory[sample_index, :]
+
+ q_next, q_eval = self.sess.run(
+ [self.q_next, self.q_eval],
+ feed_dict={
+ self.s_: batch_memory[:, -self.n_features:], # fixed params
+ self.s: batch_memory[:, :self.n_features], # newest params
+ })
+
+ # change q_target w.r.t q_eval's action
+ q_target = q_eval.copy()
+
+ batch_index = np.arange(self.batch_size, dtype=np.int32)
+ eval_act_index = batch_memory[:, self.n_features].astype(int)
+ reward = batch_memory[:, self.n_features + 1]
+
+ q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
+ """
+ For example in this batch I have 2 samples and 3 actions:
+ q_eval =
+ [[1, 2, 3],
+ [4, 5, 6]]
+
+ q_target = q_eval =
+ [[1, 2, 3],
+ [4, 5, 6]]
+
+ Then change q_target with the real q_target value w.r.t the q_eval's action.
+ For example in:
+ sample 0, I took action 0, and the max q_target value is -1;
+ sample 1, I took action 2, and the max q_target value is -2:
+ q_target =
+ [[-1, 2, 3],
+ [4, 5, -2]]
+
+ So the (q_target - q_eval) becomes:
+ [[(-1)-(1), 0, 0],
+ [0, 0, (-2)-(6)]]
+
+ We then backpropagate this error w.r.t. the corresponding action through the network,
+ leaving the other actions with error = 0 because we didn't choose them.
+ """
+
+ # train eval network
+ _, self.cost = self.sess.run([self._train_op, self.loss],
+ feed_dict={self.s: batch_memory[:, :self.n_features],
+ self.q_target: q_target})
+ self.cost_his.append(self.cost)
+
+ # increasing epsilon
+ self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+ self.learn_step_counter += 1
+
+ def plot_cost(self):
+ import matplotlib.pyplot as plt
+ plt.plot(np.arange(len(self.cost_his)), self.cost_his)
+ plt.ylabel('Cost')
+ plt.xlabel('training steps')
+ plt.show()
+
+
+
diff --git a/contents/6_OpenAI_gym/run_CartPole.py b/contents/6_OpenAI_gym/run_CartPole.py
new file mode 100644
index 0000000..104bde4
--- /dev/null
+++ b/contents/6_OpenAI_gym/run_CartPole.py
@@ -0,0 +1,62 @@
+"""
+Deep Q network,
+
+Using:
+Tensorflow: 1.0
+gym: 0.7.3
+"""
+
+
+import gym
+from RL_brain import DeepQNetwork
+
+env = gym.make('CartPole-v0')
+env = env.unwrapped
+
+print(env.action_space)
+print(env.observation_space)
+print(env.observation_space.high)
+print(env.observation_space.low)
+
+RL = DeepQNetwork(n_actions=env.action_space.n,
+ n_features=env.observation_space.shape[0],
+ learning_rate=0.01, e_greedy=0.9,
+ replace_target_iter=100, memory_size=2000,
+ e_greedy_increment=0.001,)
+
+total_steps = 0
+
+
+for i_episode in range(100):
+
+ observation = env.reset()
+ ep_r = 0
+ while True:
+ env.render()
+
+ action = RL.choose_action(observation)
+
+ observation_, reward, done, info = env.step(action)
+
+ # the smaller the pole angle and the closer the cart is to the center, the better
+ x, x_dot, theta, theta_dot = observation_
+ r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
+ r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
+ reward = r1 + r2
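+ # note: this shaped reward replaces CartPole's default +1 per step, giving the agent a
+ # denser signal about how centred the cart is and how upright the pole is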
+
+ RL.store_transition(observation, action, reward, observation_)
+
+ ep_r += reward
+ if total_steps > 1000:
+ RL.learn()
+
+ if done:
+ print('episode: ', i_episode,
+ 'ep_r: ', round(ep_r, 2),
+ ' epsilon: ', round(RL.epsilon, 2))
+ break
+
+ observation = observation_
+ total_steps += 1
+
+RL.plot_cost()
diff --git a/contents/6_OpenAI_gym/run_MountainCar.py b/contents/6_OpenAI_gym/run_MountainCar.py
new file mode 100644
index 0000000..cdda953
--- /dev/null
+++ b/contents/6_OpenAI_gym/run_MountainCar.py
@@ -0,0 +1,61 @@
+"""
+Deep Q network,
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+
+import gym
+from RL_brain import DeepQNetwork
+
+env = gym.make('MountainCar-v0')
+env = env.unwrapped
+
+print(env.action_space)
+print(env.observation_space)
+print(env.observation_space.high)
+print(env.observation_space.low)
+
+RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, e_greedy=0.9,
+ replace_target_iter=300, memory_size=3000,
+ e_greedy_increment=0.0002,)
+
+total_steps = 0
+
+
+for i_episode in range(10):
+
+ observation = env.reset()
+ ep_r = 0
+ while True:
+ env.render()
+
+ action = RL.choose_action(observation)
+
+ observation_, reward, done, info = env.step(action)
+
+ position, velocity = observation_
+
+ # the higher the better
+ reward = abs(position - (-0.5)) # r in [0, 1]
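+ # note: -0.5 is roughly the bottom of the valley, so this shaped reward grows as the car
+ # swings farther away from it in either direction (instead of the default -1 per step)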
+
+ RL.store_transition(observation, action, reward, observation_)
+
+ if total_steps > 1000:
+ RL.learn()
+
+ ep_r += reward
+ if done:
+ get = '| Get' if observation_[0] >= env.unwrapped.goal_position else '| ----'
+ print('Epi: ', i_episode,
+ get,
+ '| Ep_r: ', round(ep_r, 4),
+ '| Epsilon: ', round(RL.epsilon, 2))
+ break
+
+ observation = observation_
+ total_steps += 1
+
+RL.plot_cost()
diff --git a/contents/7_Policy_gradient_softmax/RL_brain.py b/contents/7_Policy_gradient_softmax/RL_brain.py
new file mode 100644
index 0000000..e76b234
--- /dev/null
+++ b/contents/7_Policy_gradient_softmax/RL_brain.py
@@ -0,0 +1,124 @@
+"""
+This part of the code is the reinforcement learning brain, which is the brain of the agent.
+All decisions are made here.
+
+Policy Gradient, Reinforcement Learning.
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+import numpy as np
+import tensorflow as tf
+
+# reproducible
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+class PolicyGradient:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.01,
+ reward_decay=0.95,
+ output_graph=False,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+
+ self.ep_obs, self.ep_as, self.ep_rs = [], [], []
+
+ self._build_net()
+
+ self.sess = tf.Session()
+
+ if output_graph:
+ # $ tensorboard --logdir=logs
+ # http://0.0.0.0:6006/
+ # tf.train.SummaryWriter will soon be deprecated, use the following
+ tf.summary.FileWriter("logs/", self.sess.graph)
+
+ self.sess.run(tf.global_variables_initializer())
+
+ def _build_net(self):
+ with tf.name_scope('inputs'):
+ self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
+ self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
+ self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
+ # fc1
+ layer = tf.layers.dense(
+ inputs=self.tf_obs,
+ units=10,
+ activation=tf.nn.tanh, # tanh activation
+ kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
+ bias_initializer=tf.constant_initializer(0.1),
+ name='fc1'
+ )
+ # fc2
+ all_act = tf.layers.dense(
+ inputs=layer,
+ units=self.n_actions,
+ activation=None,
+ kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
+ bias_initializer=tf.constant_initializer(0.1),
+ name='fc2'
+ )
+
+ self.all_act_prob = tf.nn.softmax(all_act, name='act_prob') # use softmax to convert to probability
+
+ with tf.name_scope('loss'):
+ # to maximize total reward (log_p * R) is to minimize -(log_p * R), and TensorFlow only provides minimize(loss)
+ neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts) # this is the negative log-probability of the chosen action
+ # or in this way:
+ # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1)
+ loss = tf.reduce_mean(neg_log_prob * self.tf_vt) # reward guided loss
+
+ with tf.name_scope('train'):
+ self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
+
+ def choose_action(self, observation):
+ prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})
+ action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel()) # sample an action w.r.t. the action probabilities
+ return action
+
+ def store_transition(self, s, a, r):
+ self.ep_obs.append(s)
+ self.ep_as.append(a)
+ self.ep_rs.append(r)
+
+ def learn(self):
+ # discount and normalize episode reward
+ discounted_ep_rs_norm = self._discount_and_norm_rewards()
+
+ # train on episode
+ self.sess.run(self.train_op, feed_dict={
+ self.tf_obs: np.vstack(self.ep_obs), # shape=[None, n_obs]
+ self.tf_acts: np.array(self.ep_as), # shape=[None, ]
+ self.tf_vt: discounted_ep_rs_norm, # shape=[None, ]
+ })
+
+ self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data
+ return discounted_ep_rs_norm
+
+ def _discount_and_norm_rewards(self):
+ # discount episode rewards
+ discounted_ep_rs = np.zeros_like(self.ep_rs)
+ running_add = 0
+ for t in reversed(range(0, len(self.ep_rs))):
+ running_add = running_add * self.gamma + self.ep_rs[t]
+ discounted_ep_rs[t] = running_add
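+ # e.g. with gamma = 0.9 and ep_rs = [1, 1, 1] this loop yields [2.71, 1.9, 1.0]:
+ # each entry is the discounted return from that step to the end of the episode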
+
+ # normalize episode rewards
+ discounted_ep_rs -= np.mean(discounted_ep_rs)
+ discounted_ep_rs /= np.std(discounted_ep_rs)
+ return discounted_ep_rs
+
+
+
diff --git a/contents/7_Policy_gradient_softmax/run_CartPole.py b/contents/7_Policy_gradient_softmax/run_CartPole.py
new file mode 100644
index 0000000..7d46aee
--- /dev/null
+++ b/contents/7_Policy_gradient_softmax/run_CartPole.py
@@ -0,0 +1,69 @@
+"""
+Policy Gradient, Reinforcement Learning.
+
+The cart pole example
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+import gym
+from RL_brain import PolicyGradient
+import matplotlib.pyplot as plt
+
+DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater than this threshold
+RENDER = False # rendering wastes time
+
+env = gym.make('CartPole-v0')
+env.seed(1) # reproducible; the vanilla policy gradient has high variance
+env = env.unwrapped
+
+print(env.action_space)
+print(env.observation_space)
+print(env.observation_space.high)
+print(env.observation_space.low)
+
+RL = PolicyGradient(
+ n_actions=env.action_space.n,
+ n_features=env.observation_space.shape[0],
+ learning_rate=0.02,
+ reward_decay=0.99,
+ # output_graph=True,
+)
+
+for i_episode in range(3000):
+
+ observation = env.reset()
+
+ while True:
+ if RENDER: env.render()
+
+ action = RL.choose_action(observation)
+
+ observation_, reward, done, info = env.step(action)
+
+ RL.store_transition(observation, action, reward)
+
+ if done:
+ ep_rs_sum = sum(RL.ep_rs)
+
+ if 'running_reward' not in globals():
+ running_reward = ep_rs_sum
+ else:
+ running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
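+ # running_reward is an exponential moving average of the episode return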
+ if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
+ print("episode:", i_episode, " reward:", int(running_reward))
+
+ vt = RL.learn()
+
+ if i_episode == 0:
+ plt.plot(vt) # plot the episode vt
+ plt.xlabel('episode steps')
+ plt.ylabel('normalized state-action value')
+ plt.show()
+ break
+
+ observation = observation_
diff --git a/contents/7_Policy_gradient_softmax/run_MountainCar.py b/contents/7_Policy_gradient_softmax/run_MountainCar.py
new file mode 100644
index 0000000..926269d
--- /dev/null
+++ b/contents/7_Policy_gradient_softmax/run_MountainCar.py
@@ -0,0 +1,76 @@
+"""
+Policy Gradient, Reinforcement Learning.
+
+The MountainCar example
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+import gym
+from RL_brain import PolicyGradient
+import matplotlib.pyplot as plt
+
+DISPLAY_REWARD_THRESHOLD = -2000 # renders environment if total episode reward is greater than this threshold
+# episode: 154 reward: -10667
+# episode: 387 reward: -2009
+# episode: 489 reward: -1006
+# episode: 628 reward: -502
+
+RENDER = False # rendering wastes time
+
+env = gym.make('MountainCar-v0')
+env.seed(1) # reproducible; the vanilla policy gradient has high variance
+env = env.unwrapped
+
+print(env.action_space)
+print(env.observation_space)
+print(env.observation_space.high)
+print(env.observation_space.low)
+
+RL = PolicyGradient(
+ n_actions=env.action_space.n,
+ n_features=env.observation_space.shape[0],
+ learning_rate=0.02,
+ reward_decay=0.995,
+ # output_graph=True,
+)
+
+for i_episode in range(1000):
+
+ observation = env.reset()
+
+ while True:
+ if RENDER: env.render()
+
+ action = RL.choose_action(observation)
+
+ observation_, reward, done, info = env.step(action) # reward = -1 in all cases
+
+ RL.store_transition(observation, action, reward)
+
+ if done:
+ # calculate running reward
+ ep_rs_sum = sum(RL.ep_rs)
+ if 'running_reward' not in globals():
+ running_reward = ep_rs_sum
+ else:
+ running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
+ if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
+
+ print("episode:", i_episode, " reward:", int(running_reward))
+
+ vt = RL.learn() # train
+
+ if i_episode == 30:
+ plt.plot(vt) # plot the episode vt
+ plt.xlabel('episode steps')
+ plt.ylabel('normalized state-action value')
+ plt.show()
+
+ break
+
+ observation = observation_
diff --git a/contents/8_Actor_Critic_Advantage/AC_CartPole.py b/contents/8_Actor_Critic_Advantage/AC_CartPole.py
new file mode 100644
index 0000000..b01a4b9
--- /dev/null
+++ b/contents/8_Actor_Critic_Advantage/AC_CartPole.py
@@ -0,0 +1,169 @@
+"""
+Actor-Critic using TD-error as the Advantage, Reinforcement Learning.
+
+The cart pole example. The policy oscillates.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import numpy as np
+import tensorflow as tf
+import gym
+
+np.random.seed(2)
+tf.set_random_seed(2) # reproducible
+
+# Hyperparameters
+OUTPUT_GRAPH = False
+MAX_EPISODE = 3000
+DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater than this threshold
+MAX_EP_STEPS = 1000 # maximum time step in one episode
+RENDER = False # rendering wastes time
+GAMMA = 0.9 # reward discount in TD error
+LR_A = 0.001 # learning rate for actor
+LR_C = 0.01 # learning rate for critic
+
+env = gym.make('CartPole-v0')
+env.seed(1) # reproducible
+env = env.unwrapped
+
+N_F = env.observation_space.shape[0]
+N_A = env.action_space.n
+
+
+class Actor(object):
+ def __init__(self, sess, n_features, n_actions, lr=0.001):
+ self.sess = sess
+
+ self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+ self.a = tf.placeholder(tf.int32, None, "act")
+ self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error
+
+ with tf.variable_scope('Actor'):
+ l1 = tf.layers.dense(
+ inputs=self.s,
+ units=20, # number of hidden units
+ activation=tf.nn.relu,
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='l1'
+ )
+
+ self.acts_prob = tf.layers.dense(
+ inputs=l1,
+ units=n_actions, # output units
+ activation=tf.nn.softmax, # get action probabilities
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='acts_prob'
+ )
+
+ with tf.variable_scope('exp_v'):
+ log_prob = tf.log(self.acts_prob[0, self.a])
+ self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss
+
+ with tf.variable_scope('train'):
+ self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v)
+
+ def learn(self, s, a, td):
+ s = s[np.newaxis, :]
+ feed_dict = {self.s: s, self.a: a, self.td_error: td}
+ _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
+ return exp_v
+
+ def choose_action(self, s):
+ s = s[np.newaxis, :]
+ probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions
+ return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return an int
+
+
+class Critic(object):
+ def __init__(self, sess, n_features, lr=0.01):
+ self.sess = sess
+
+ self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+ self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
+ self.r = tf.placeholder(tf.float32, None, 'r')
+
+ with tf.variable_scope('Critic'):
+ l1 = tf.layers.dense(
+ inputs=self.s,
+ units=20, # number of hidden units
+ activation=tf.nn.relu, # can also be None
+ # it has to be linear to guarantee the convergence of the actor,
+ # but a linear approximator hardly seems to learn the correct Q.
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='l1'
+ )
+
+ self.v = tf.layers.dense(
+ inputs=l1,
+ units=1, # output units
+ activation=None,
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='V'
+ )
+
+ with tf.variable_scope('squared_TD_error'):
+ self.td_error = self.r + GAMMA * self.v_ - self.v
+ self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
+ with tf.variable_scope('train'):
+ self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
+
+ def learn(self, s, r, s_):
+ s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
+
+ v_ = self.sess.run(self.v, {self.s: s_})
+ td_error, _ = self.sess.run([self.td_error, self.train_op],
+ {self.s: s, self.v_: v_, self.r: r})
+ return td_error
+
+
+sess = tf.Session()
+
+actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
+critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor
+
+sess.run(tf.global_variables_initializer())
+
+if OUTPUT_GRAPH:
+ tf.summary.FileWriter("logs/", sess.graph)
+
+for i_episode in range(MAX_EPISODE):
+ s = env.reset()
+ t = 0
+ track_r = []
+ while True:
+ if RENDER: env.render()
+
+ a = actor.choose_action(s)
+
+ s_, r, done, info = env.step(a)
+
+ if done: r = -20
+
+ track_r.append(r)
+
+ td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)]
+ actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error]
+
+ s = s_
+ t += 1
+
+ if done or t >= MAX_EP_STEPS:
+ ep_rs_sum = sum(track_r)
+
+ if 'running_reward' not in globals():
+ running_reward = ep_rs_sum
+ else:
+ running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
+ if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
+ print("episode:", i_episode, " reward:", int(running_reward))
+ break
+
diff --git a/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py
new file mode 100644
index 0000000..07fc378
--- /dev/null
+++ b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py
@@ -0,0 +1,179 @@
+"""
+Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
+
+The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)
+
+Cannot converge!!! It oscillates!!!
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import tensorflow as tf
+import numpy as np
+import gym
+
+np.random.seed(2)
+tf.set_random_seed(2) # reproducible
+
+
+class Actor(object):
+ def __init__(self, sess, n_features, action_bound, lr=0.0001):
+ self.sess = sess
+
+ self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+ self.a = tf.placeholder(tf.float32, None, name="act")
+ self.td_error = tf.placeholder(tf.float32, None, name="td_error") # TD_error
+
+ l1 = tf.layers.dense(
+ inputs=self.s,
+ units=30, # number of hidden units
+ activation=tf.nn.relu,
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='l1'
+ )
+
+ mu = tf.layers.dense(
+ inputs=l1,
+ units=1, # output units
+ activation=tf.nn.tanh,
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='mu'
+ )
+
+ sigma = tf.layers.dense(
+ inputs=l1,
+ units=1, # output units
+ activation=tf.nn.softplus, # softplus keeps sigma positive
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(1.), # biases
+ name='sigma'
+ )
+ global_step = tf.Variable(0, trainable=False)
+ # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
+ self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
+ self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)
+
+ self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])
+
+ with tf.name_scope('exp_v'):
+ log_prob = self.normal_dist.log_prob(self.a) # loss without advantage
+ self.exp_v = log_prob * self.td_error # advantage (TD_error) guided loss
+ # add the distribution's entropy to encourage exploration
+ self.exp_v += self.normal_dist.entropy()
+
+ with tf.name_scope('train'):
+ self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step) # minimize(-exp_v) = maximize(exp_v)
+
+ def learn(self, s, a, td):
+ s = s[np.newaxis, :]
+ feed_dict = {self.s: s, self.a: a, self.td_error: td}
+ _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
+ return exp_v
+
+ def choose_action(self, s):
+ s = s[np.newaxis, :]
+ return self.sess.run(self.action, {self.s: s}) # sample a single continuous action
+
+
+class Critic(object):
+ def __init__(self, sess, n_features, lr=0.01):
+ self.sess = sess
+ with tf.name_scope('inputs'):
+ self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+ self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
+ self.r = tf.placeholder(tf.float32, name='r')
+
+ with tf.variable_scope('Critic'):
+ l1 = tf.layers.dense(
+ inputs=self.s,
+ units=30, # number of hidden units
+ activation=tf.nn.relu,
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='l1'
+ )
+
+ self.v = tf.layers.dense(
+ inputs=l1,
+ units=1, # output units
+ activation=None,
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights
+ bias_initializer=tf.constant_initializer(0.1), # biases
+ name='V'
+ )
+
+ with tf.variable_scope('squared_TD_error'):
+ self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
+ self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
+ with tf.variable_scope('train'):
+ self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
+
+ def learn(self, s, r, s_):
+ s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
+
+ v_ = self.sess.run(self.v, {self.s: s_})
+ td_error, _ = self.sess.run([self.td_error, self.train_op],
+ {self.s: s, self.v_: v_, self.r: r})
+ return td_error
+
+
+OUTPUT_GRAPH = False
+MAX_EPISODE = 1000
+MAX_EP_STEPS = 300
+DISPLAY_REWARD_THRESHOLD = -550 # renders environment if total episode reward is greater than this threshold
+RENDER = False # rendering wastes time
+GAMMA = 0.9
+LR_A = 0.001 # learning rate for actor
+LR_C = 0.01 # learning rate for critic
+
+env = gym.make('Pendulum-v0')
+env.seed(1) # reproducible
+env = env.unwrapped
+
+N_S = env.observation_space.shape[0]
+A_BOUND = env.action_space.high
+
+sess = tf.Session()
+
+actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
+critic = Critic(sess, n_features=N_S, lr=LR_C)
+
+sess.run(tf.global_variables_initializer())
+
+if OUTPUT_GRAPH:
+ tf.summary.FileWriter("logs/", sess.graph)
+
+for i_episode in range(MAX_EPISODE):
+ s = env.reset()
+ t = 0
+ ep_rs = []
+ while True:
+ # if RENDER:
+ env.render()
+ a = actor.choose_action(s)
+
+ s_, r, done, info = env.step(a)
+ r /= 10
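+ # Pendulum rewards are roughly in [-16, 0]; scaling them down keeps the TD targets small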
+
+ td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)]
+ actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error]
+
+ s = s_
+ t += 1
+ ep_rs.append(r)
+ if t > MAX_EP_STEPS:
+ ep_rs_sum = sum(ep_rs)
+ if 'running_reward' not in globals():
+ running_reward = ep_rs_sum
+ else:
+ running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
+ if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
+ print("episode:", i_episode, " reward:", int(running_reward))
+ break
+
diff --git a/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py b/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py
new file mode 100644
index 0000000..629eb6e
--- /dev/null
+++ b/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py
@@ -0,0 +1,252 @@
+"""
+Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
+DDPG is Actor Critic based algorithm.
+Pendulum example.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import tensorflow as tf
+import numpy as np
+import gym
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+##################### hyper parameters ####################
+
+MAX_EPISODES = 70
+MAX_EP_STEPS = 400
+LR_A = 0.01 # learning rate for actor
+LR_C = 0.01 # learning rate for critic
+GAMMA = 0.9 # reward discount
+TAU = 0.01 # Soft update for target params, but this is computationally expensive
+# so we use replace_iter instead
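+# (for reference) the soft update would be: target_params = (1 - TAU) * target_params + TAU * eval_params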
+REPLACE_ITER_A = 500
+REPLACE_ITER_C = 300
+MEMORY_CAPACITY = 7000
+BATCH_SIZE = 32
+
+RENDER = False
+OUTPUT_GRAPH = True
+ENV_NAME = 'Pendulum-v0'
+
+############################### Actor ####################################
+
+class Actor(object):
+ def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
+ self.sess = sess
+ self.a_dim = action_dim
+ self.action_bound = action_bound
+ self.lr = learning_rate
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Actor'):
+ # input s, output a
+ self.a = self._build_net(S, scope='eval_net', trainable=True)
+
+ # input s_, output a, get a_ for critic
+ self.a_ = self._build_net(S_, scope='target_net', trainable=False)
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
+
+ def _build_net(self, s, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.random_normal_initializer(0., 0.3)
+ init_b = tf.constant_initializer(0.1)
+ net = tf.layers.dense(s, 30, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l1',
+ trainable=trainable)
+ with tf.variable_scope('a'):
+ actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
+ bias_initializer=init_b, name='a', trainable=trainable)
+ scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
+ return scaled_a
+
+ def learn(self, s, a): # batch update
+ self.sess.run(self.train_op, feed_dict={S: s, A: a})
+ # the following soft replacement of target params is computationally expensive:
+ # target_params = (1-tau) * target_params + tau * eval_params
+ # self.sess.run([tf.assign(t, (1 - self.tau) * t + self.tau * e) for t, e in zip(self.t_params, self.e_params)])
+
+ # instead of above method, I use a hard replacement here
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+
+ def choose_action(self, s):
+ s = s[np.newaxis, :] # single state
+ return self.sess.run(self.a, feed_dict={S: s})[0] # single action
+
+ def add_grad_to_graph(self, a_grads):
+ with tf.variable_scope('policy_grads'):
+ # ys = policy;
+ # xs = policy's parameters;
+ # self.a_grads = the gradients of the policy to get more Q
+ # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
+ self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
+
+ with tf.variable_scope('A_train'):
+ opt = tf.train.AdamOptimizer(-self.lr / BATCH_SIZE) # negative learning rate for gradient ascent; divide by BATCH_SIZE to take the mean
+ self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
+
+
+############################### Critic ####################################
+
+class Critic(object):
+ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+ self.sess = sess
+ self.s_dim = state_dim
+ self.a_dim = action_dim
+ self.lr = learning_rate
+ self.gamma = gamma
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Critic'):
+ # Input (s, a), output q
+ self.q = self._build_net(S, A, 'eval_net', trainable=True)
+
+ # Input (s_, a_), output q_ for q_target
+ self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
+
+ with tf.variable_scope('target_q'):
+ self.target_q = R + self.gamma * self.q_
+
+ with tf.variable_scope('TD_error'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
+
+ with tf.variable_scope('C_train'):
+ self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
+
+ with tf.variable_scope('a_grad'):
+ self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim)
+
+ def _build_net(self, s, a, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.random_normal_initializer(0., 0.1)
+ init_b = tf.constant_initializer(0.1)
+
+ with tf.variable_scope('l1'):
+ n_l1 = 30
+ w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
+ w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
+ net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
+
+ with tf.variable_scope('q'):
+ q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
+ return q
+
+ def learn(self, s, a, r, s_):
+ self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_})
+ # the following soft replacement of target params is computationally expensive:
+ # target_params = (1-tau) * target_params + tau * eval_params
+ # self.sess.run([tf.assign(t, (1 - self.tau) * t + self.tau * e) for t, e in zip(self.t_params, self.e_params)])
+
+ # instead of above method, we use a hard replacement here
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+
+
+##################### Memory ####################
+
+class Memory(object):
+ def __init__(self, capacity, dims):
+ self.capacity = capacity
+ self.data = np.zeros((capacity, dims))
+ self.pointer = 0
+
+ def store_transition(self, s, a, r, s_):
+ transition = np.hstack((s, a, [r], s_))
+ index = self.pointer % self.capacity # replace the old memory with new memory
+ self.data[index, :] = transition
+ self.pointer += 1
+
+ def sample(self, n):
+ assert self.pointer >= self.capacity, 'Memory has not been filled yet'
+ indices = np.random.choice(self.capacity, size=n)
+ return self.data[indices, :]
+
+
+env = gym.make(ENV_NAME)
+env = env.unwrapped
+env.seed(1)
+
+state_dim = env.observation_space.shape[0]
+action_dim = env.action_space.shape[0]
+action_bound = env.action_space.high
+
+# all placeholder for tf
+with tf.name_scope('S'):
+ S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
+with tf.name_scope('A'):
+ A = tf.placeholder(tf.float32, shape=[None, action_dim], name='a')
+with tf.name_scope('R'):
+ R = tf.placeholder(tf.float32, [None, 1], name='r')
+with tf.name_scope('S_'):
+ S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_')
+
+
+sess = tf.Session()
+
+# Create actor and critic.
+# They are actually connected to each other; details can be seen in TensorBoard.
+actor = Actor(sess, action_dim, action_bound, LR_A, REPLACE_ITER_A)
+critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+actor.add_grad_to_graph(critic.a_grads)
+
+sess.run(tf.global_variables_initializer())
+
+M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1)
+
+if OUTPUT_GRAPH:
+ tf.summary.FileWriter("logs/", sess.graph)
+
+var = 3 # control exploration
+
+for i in range(MAX_EPISODES):
+ s = env.reset()
+ ep_reward = 0
+
+ for j in range(MAX_EP_STEPS):
+
+ if RENDER:
+ env.render()
+
+ # Added exploration noise
+ a = actor.choose_action(s)
+ a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
+ s_, r, done, info = env.step(a)
+
+ M.store_transition(s, a, r / 10, s_)
+
+ if M.pointer > MEMORY_CAPACITY:
+ var *= .9995 # decay the action randomness
+ b_M = M.sample(BATCH_SIZE)
+ b_s = b_M[:, :state_dim]
+ b_a = b_M[:, state_dim: state_dim + action_dim]
+ b_r = b_M[:, -state_dim - 1: -state_dim]
+ b_s_ = b_M[:, -state_dim:]
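+ # each memory row is laid out as [s (state_dim) | a (action_dim) | r (1) | s_ (state_dim)],
+ # matching the hstack order in Memory.store_transition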
+
+ critic.learn(b_s, b_a, b_r, b_s_)
+ actor.learn(b_s, b_a)
+
+ s = s_
+ ep_reward += r
+
+ if j == MAX_EP_STEPS-1:
+ print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
+ if ep_reward > -1000:
+ RENDER = True
+ break
\ No newline at end of file
diff --git a/experiments/2D_car/DDPG.py b/experiments/2D_car/DDPG.py
new file mode 100644
index 0000000..f91bfe0
--- /dev/null
+++ b/experiments/2D_car/DDPG.py
@@ -0,0 +1,269 @@
+"""
+Environment is a 2D car.
+Car has 5 sensors to obtain distance information.
+
+Car collision => reward = -1, otherwise => reward = 0.
+
+You can train this RL by setting LOAD = False; after training, the model will be stored in a local folder.
+Set LOAD = True to reload the trained model for playing.
+
+You can customize this script in any way you want.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Requirement:
+pyglet >= 1.2.4
+numpy >= 1.12.1
+tensorflow >= 1.0.1
+"""
+
+import tensorflow as tf
+import numpy as np
+import os
+import shutil
+from car_env import CarEnv
+
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+MAX_EPISODES = 225
+MAX_EP_STEPS = 600
+LR_A = 1e-3 # learning rate for actor
+LR_C = 1e-3 # learning rate for critic
+GAMMA = 0.95 # reward discount
+REPLACE_ITER_A = 800
+REPLACE_ITER_C = 700
+MEMORY_CAPACITY = 5000
+BATCH_SIZE = 16
+VAR_MIN = 0.1
+RENDER = True
+LOAD = False
+DISCRETE_ACTION = False
+
+env = CarEnv(discrete_action=DISCRETE_ACTION)
+STATE_DIM = env.state_dim
+ACTION_DIM = env.action_dim
+ACTION_BOUND = env.action_bound
+
+# all placeholder for tf
+with tf.name_scope('S'):
+ S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
+with tf.name_scope('A'):
+ A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
+with tf.name_scope('R'):
+ R = tf.placeholder(tf.float32, [None, 1], name='r')
+with tf.name_scope('S_'):
+ S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
+
+
+class Actor(object):
+ def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
+ self.sess = sess
+ self.a_dim = action_dim
+ self.action_bound = action_bound
+ self.lr = learning_rate
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Actor'):
+ # input s, output a
+ self.a = self._build_net(S, scope='eval_net', trainable=True)
+
+ # input s_, output a, get a_ for critic
+ self.a_ = self._build_net(S_, scope='target_net', trainable=False)
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
+
+ def _build_net(self, s, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.contrib.layers.xavier_initializer()
+ init_b = tf.constant_initializer(0.001)
+ net = tf.layers.dense(s, 100, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l1',
+ trainable=trainable)
+ net = tf.layers.dense(net, 20, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l2',
+ trainable=trainable)
+ with tf.variable_scope('a'):
+ actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
+ name='a', trainable=trainable)
+ scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
+ return scaled_a
+
+ def learn(self, s, a): # batch update
+ self.sess.run(self.train_op, feed_dict={S: s, A: a})
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+
+ def choose_action(self, s):
+ s = s[np.newaxis, :] # single state
+ return self.sess.run(self.a, feed_dict={S: s})[0] # single action
+
+ def add_grad_to_graph(self, a_grads):
+ with tf.variable_scope('policy_grads'):
+ self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
+
+ with tf.variable_scope('A_train'):
+ opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE) # negative learning rate for gradient ascent; divide by BATCH_SIZE to take the mean
+ self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
+
+
+class Critic(object):
+ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+ self.sess = sess
+ self.s_dim = state_dim
+ self.a_dim = action_dim
+ self.lr = learning_rate
+ self.gamma = gamma
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Critic'):
+ # Input (s, a), output q
+ self.q = self._build_net(S, A, 'eval_net', trainable=True)
+
+ # Input (s_, a_), output q_ for q_target
+ self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
+
+ with tf.variable_scope('target_q'):
+ self.target_q = R + self.gamma * self.q_
+
+ with tf.variable_scope('TD_error'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
+
+ with tf.variable_scope('C_train'):
+ self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ with tf.variable_scope('a_grad'):
+ self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim)
+
+ def _build_net(self, s, a, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.contrib.layers.xavier_initializer()
+ init_b = tf.constant_initializer(0.01)
+
+ with tf.variable_scope('l1'):
+ n_l1 = 100
+ w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
+ w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
+ net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
+ net = tf.layers.dense(net, 20, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l2',
+ trainable=trainable)
+ with tf.variable_scope('q'):
+ q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
+ return q
+
+ def learn(self, s, a, r, s_):
+ self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_})
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+
+
+class Memory(object):
+ def __init__(self, capacity, dims):
+ self.capacity = capacity
+ self.data = np.zeros((capacity, dims))
+ self.pointer = 0
+
+ def store_transition(self, s, a, r, s_):
+ transition = np.hstack((s, a, [r], s_))
+ index = self.pointer % self.capacity # replace the old memory with new memory
+ self.data[index, :] = transition
+ self.pointer += 1
+
+ def sample(self, n):
+ assert self.pointer >= self.capacity, 'Memory has not been filled yet'
+ indices = np.random.choice(self.capacity, size=n)
+ return self.data[indices, :]
+
+
+sess = tf.Session()
+
+# Create actor and critic.
+actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
+critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+actor.add_grad_to_graph(critic.a_grads)
+
+M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)
+
+saver = tf.train.Saver()
+path = './discrete' if DISCRETE_ACTION else './continuous'
+
+if LOAD:
+ saver.restore(sess, tf.train.latest_checkpoint(path))
+else:
+ sess.run(tf.global_variables_initializer())
+
+
+def train():
+ var = 2. # control exploration
+ for ep in range(MAX_EPISODES):
+ s = env.reset()
+ ep_step = 0
+
+ for t in range(MAX_EP_STEPS):
+ # while True:
+ if RENDER:
+ env.render()
+
+ # Added exploration noise
+ a = actor.choose_action(s)
+ a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration
+ s_, r, done = env.step(a)
+ M.store_transition(s, a, r, s_)
+
+ if M.pointer > MEMORY_CAPACITY:
+ var = max([var*.9995, VAR_MIN]) # decay the action randomness
+ b_M = M.sample(BATCH_SIZE)
+ b_s = b_M[:, :STATE_DIM]
+ b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
+ b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
+ b_s_ = b_M[:, -STATE_DIM:]
+
+ critic.learn(b_s, b_a, b_r, b_s_)
+ actor.learn(b_s, b_a)
+
+ s = s_
+ ep_step += 1
+
+ if done or t == MAX_EP_STEPS - 1:
+ # if done:
+ print('Ep:', ep,
+ '| Steps: %i' % int(ep_step),
+ '| Explore: %.2f' % var,
+ )
+ break
+
+ if os.path.isdir(path): shutil.rmtree(path)
+ os.mkdir(path)
+ ckpt_path = os.path.join(path, 'DDPG.ckpt')
+ save_path = saver.save(sess, ckpt_path, write_meta_graph=False)
+ print("\nSave Model %s\n" % save_path)
+
+
+def eval():
+ env.set_fps(30)
+ while True:
+ s = env.reset()
+ while True:
+ env.render()
+ a = actor.choose_action(s)
+ s_, r, done = env.step(a)
+ s = s_
+ if done:
+ break
+
+if __name__ == '__main__':
+ if LOAD:
+ eval()
+ else:
+ train()
\ No newline at end of file
diff --git a/experiments/2D_car/car_env.py b/experiments/2D_car/car_env.py
new file mode 100644
index 0000000..8a8ed2b
--- /dev/null
+++ b/experiments/2D_car/car_env.py
@@ -0,0 +1,234 @@
+"""
+Environment for 2D car driving.
+You can customize this script in any way you want.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+
+Requirement:
+pyglet >= 1.2.4
+numpy >= 1.12.1
+"""
+import numpy as np
+import pyglet
+
+
+pyglet.clock.set_fps_limit(10000)
+
+
+class CarEnv(object):
+ n_sensor = 5
+ action_dim = 1
+ state_dim = n_sensor
+ viewer = None
+ viewer_xy = (500, 500)
+ sensor_max = 150.
+ start_point = [450, 300]
+ speed = 50.
+ dt = 0.1
+
+ def __init__(self, discrete_action=False):
+ self.is_discrete_action = discrete_action
+ if discrete_action:
+ self.actions = [-1, 0, 1]
+ else:
+ self.action_bound = [-1, 1]
+
+ self.terminal = False
+ # node1 (x, y, r, w, l),
+ self.car_info = np.array([0, 0, 0, 20, 40], dtype=np.float64) # car coordination
+ self.obstacle_coords = np.array([
+ [120, 120],
+ [380, 120],
+ [380, 380],
+ [120, 380],
+ ])
+ self.sensor_info = self.sensor_max + np.zeros((self.n_sensor, 3)) # n sensors, (distance, end_x, end_y)
+
+ def step(self, action):
+ if self.is_discrete_action:
+ action = self.actions[action]
+ else:
+ action = np.clip(action, *self.action_bound)[0]
+ self.car_info[2] += action * np.pi/30 # max rotation = 6 degrees per step
+ self.car_info[:2] = self.car_info[:2] + \
+ self.speed * self.dt * np.array([np.cos(self.car_info[2]), np.sin(self.car_info[2])])
+
+ self._update_sensor()
+ s = self._get_state()
+ r = -1 if self.terminal else 0
+ return s, r, self.terminal
+
+ def reset(self):
+ self.terminal = False
+ self.car_info[:3] = np.array([*self.start_point, -np.pi/2])
+ self._update_sensor()
+ return self._get_state()
+
+ def render(self):
+ if self.viewer is None:
+ self.viewer = Viewer(*self.viewer_xy, self.car_info, self.sensor_info, self.obstacle_coords)
+ self.viewer.render()
+
+ def sample_action(self):
+ if self.is_discrete_action:
+ a = np.random.choice(list(range(3)))
+ else:
+ a = np.random.uniform(*self.action_bound, size=self.action_dim)
+ return a
+
+ def set_fps(self, fps=30):
+ pyglet.clock.set_fps_limit(fps)
+
+ def _get_state(self):
+ s = self.sensor_info[:, 0].flatten()/self.sensor_max
+ return s
+
+ def _update_sensor(self):
+ cx, cy, rotation = self.car_info[:3]
+
+ n_sensors = len(self.sensor_info)
+ sensor_theta = np.linspace(-np.pi / 2, np.pi / 2, n_sensors)
+ xs = cx + (np.zeros((n_sensors, ))+self.sensor_max) * np.cos(sensor_theta)
+ ys = cy + (np.zeros((n_sensors, ))+self.sensor_max) * np.sin(sensor_theta)
+ xys = np.array([[x, y] for x, y in zip(xs, ys)]) # shape (5 sensors, 2)
+
+ # sensors
+ tmp_x = xys[:, 0] - cx
+ tmp_y = xys[:, 1] - cy
+ # apply rotation
+ rotated_x = tmp_x * np.cos(rotation) - tmp_y * np.sin(rotation)
+ rotated_y = tmp_x * np.sin(rotation) + tmp_y * np.cos(rotation)
+ # rotated x y
+ self.sensor_info[:, -2:] = np.vstack([rotated_x+cx, rotated_y+cy]).T
+
+ q = np.array([cx, cy])
+ for si in range(len(self.sensor_info)):
+ s = self.sensor_info[si, -2:] - q
+ possible_sensor_distance = [self.sensor_max]
+ possible_intersections = [self.sensor_info[si, -2:]]
+
+ # obstacle collision
+ for oi in range(len(self.obstacle_coords)):
+ p = self.obstacle_coords[oi]
+ r = self.obstacle_coords[(oi + 1) % len(self.obstacle_coords)] - self.obstacle_coords[oi]
+ if np.cross(r, s) != 0: # possible intersection
+ t = np.cross((q - p), s) / np.cross(r, s)
+ u = np.cross((q - p), r) / np.cross(r, s)
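+ # p + t*r parameterizes the obstacle edge and q + u*s parameterizes the sensor ray;
+ # t and u both in [0, 1] means the two segments really intersect (see collision.py)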
+ if 0 <= t <= 1 and 0 <= u <= 1:
+ intersection = q + u * s
+ possible_intersections.append(intersection)
+ possible_sensor_distance.append(np.linalg.norm(u*s))
+
+ # window collision
+ win_coord = np.array([
+ [0, 0],
+ [self.viewer_xy[0], 0],
+ [*self.viewer_xy],
+ [0, self.viewer_xy[1]],
+ [0, 0],
+ ])
+ for oi in range(4):
+ p = win_coord[oi]
+ r = win_coord[(oi + 1) % len(win_coord)] - win_coord[oi]
+ if np.cross(r, s) != 0: # possible intersection
+ t = np.cross((q - p), s) / np.cross(r, s)
+ u = np.cross((q - p), r) / np.cross(r, s)
+ if 0 <= t <= 1 and 0 <= u <= 1:
+ intersection = p + t * r
+ possible_intersections.append(intersection)
+ possible_sensor_distance.append(np.linalg.norm(intersection - q))
+
+ distance = np.min(possible_sensor_distance)
+ distance_index = np.argmin(possible_sensor_distance)
+ self.sensor_info[si, 0] = distance
+ self.sensor_info[si, -2:] = possible_intersections[distance_index]
+ if distance < self.car_info[-1]/2:
+ self.terminal = True
+
+
+class Viewer(pyglet.window.Window):
+ color = {
+ 'background': [1]*3 + [1]
+ }
+ fps_display = pyglet.clock.ClockDisplay()
+ bar_thc = 5
+
+ def __init__(self, width, height, car_info, sensor_info, obstacle_coords):
+ super(Viewer, self).__init__(width, height, resizable=False, caption='2D car', vsync=False) # vsync=False to not use the monitor FPS
+ self.set_location(x=80, y=10)
+ pyglet.gl.glClearColor(*self.color['background'])
+
+ self.car_info = car_info
+ self.sensor_info = sensor_info
+
+ self.batch = pyglet.graphics.Batch()
+ background = pyglet.graphics.OrderedGroup(0)
+ foreground = pyglet.graphics.OrderedGroup(1)
+
+ self.sensors = []
+ line_coord = [0, 0] * 2
+ c = (73, 73, 73) * 2
+ for i in range(len(self.sensor_info)):
+ self.sensors.append(self.batch.add(2, pyglet.gl.GL_LINES, foreground, ('v2f', line_coord), ('c3B', c)))
+
+ car_box = [0, 0] * 4
+ c = (249, 86, 86) * 4
+ self.car = self.batch.add(4, pyglet.gl.GL_QUADS, foreground, ('v2f', car_box), ('c3B', c))
+
+ c = (134, 181, 244) * 4
+ self.obstacle = self.batch.add(4, pyglet.gl.GL_QUADS, background, ('v2f', obstacle_coords.flatten()), ('c3B', c))
+
+ def render(self):
+ pyglet.clock.tick()
+ self._update()
+ self.switch_to()
+ self.dispatch_events()
+ self.dispatch_event('on_draw')
+ self.flip()
+
+ def on_draw(self):
+ self.clear()
+ self.batch.draw()
+ # self.fps_display.draw()
+
+ def _update(self):
+ cx, cy, r, w, l = self.car_info
+
+ # sensors
+ for i, sensor in enumerate(self.sensors):
+ sensor.vertices = [cx, cy, *self.sensor_info[i, -2:]]
+
+ # car
+ xys = [
+ [cx + l / 2, cy + w / 2],
+ [cx - l / 2, cy + w / 2],
+ [cx - l / 2, cy - w / 2],
+ [cx + l / 2, cy - w / 2],
+ ]
+ r_xys = []
+ for x, y in xys:
+ tempX = x - cx
+ tempY = y - cy
+ # apply rotation
+ rotatedX = tempX * np.cos(r) - tempY * np.sin(r)
+ rotatedY = tempX * np.sin(r) + tempY * np.cos(r)
+ # rotated x y
+ x = rotatedX + cx
+ y = rotatedY + cy
+ r_xys += [x, y]
+ self.car.vertices = r_xys
+
+
+if __name__ == '__main__':
+ np.random.seed(1)
+ env = CarEnv()
+ env.set_fps(30)
+ for ep in range(20):
+ s = env.reset()
+ # for t in range(100):
+ while True:
+ env.render()
+ s, r, done = env.step(env.sample_action())
+ if done:
+ break
\ No newline at end of file
diff --git a/experiments/2D_car/collision.py b/experiments/2D_car/collision.py
new file mode 100644
index 0000000..1b77643
--- /dev/null
+++ b/experiments/2D_car/collision.py
@@ -0,0 +1,57 @@
+import numpy as np
+
+def intersection():
+ p = np.array([0, 0])
+ r = np.array([1, 1])
+ q = np.array([0.1, 0.1])
+ s = np.array([.1, .1])
+
+ if np.cross(r, s) == 0 and np.cross((q-p), r) == 0: # collinear
+ # t0 = (q − p) · r / (r · r)
+ # t1 = (q + s − p) · r / (r · r) = t0 + s · r / (r · r)
+ t0 = np.dot(q-p, r)/np.dot(r, r)
+ t1 = t0 + np.dot(s, r)/np.dot(r, r)
+ print(t1, t0)
+ if ((np.dot(s, r) > 0) and (0 <= t1 - t0 <= 1)) or ((np.dot(s, r) <= 0) and (0 <= t0 - t1 <= 1)):
+ print('collinear and overlapping, q_s in p_r')
+ else:
+ print('collinear and disjoint')
+ elif np.cross(r, s) == 0 and np.cross((q-p), r) != 0: # parallel r × s = 0 and (q − p) × r ≠ 0,
+ print('parallel')
+ else:
+ t = np.cross((q - p), s) / np.cross(r, s)
+ u = np.cross((q - p), r) / np.cross(r, s)
+ if 0 <= t <= 1 and 0 <= u <= 1:
+ # If r × s ≠ 0 and 0 ≤ t ≤ 1 and 0 ≤ u ≤ 1, the two line segments meet at the point p + t r = q + u s
+ print('intersection: ', p + t*r)
+ else:
+ print('not parallel and do not intersect')
+
+
+def point2segment():
+ p = np.array([-1, 1]) # coordination of point
+ a = np.array([0, 1]) # coordination of line segment end 1
+ b = np.array([1, 0]) # coordination of line segment end 2
+ ab = b-a # line ab
+ ap = p-a
+ distance = np.abs(np.cross(ab, ap)/np.linalg.norm(ab)) # d = |AB x AP| / |AB|
+ print(distance)
+
+ # angle Cos(θ) = A dot B /(|A||B|)
+ bp = p-b
+ cosTheta1 = np.dot(ap, ab) / (np.linalg.norm(ap) * np.linalg.norm(ab))
+ theta1 = np.arccos(cosTheta1)
+ cosTheta2 = np.dot(bp, ab) / (np.linalg.norm(bp) * np.linalg.norm(ab))
+ theta2 = np.arccos(cosTheta2)
+ if np.pi/2 <= (theta1 % (np.pi*2)) <= 3/2 * np.pi:
+ print('out of a')
+ elif -np.pi/2 <= (theta2 % (np.pi*2)) <= np.pi/2:
+ print('out of b')
+ else:
+ print('between a and b')
+
+
+
+if __name__ == '__main__':
+ point2segment()
+ # intersection()
diff --git a/experiments/Robot_arm/A3C.py b/experiments/Robot_arm/A3C.py
new file mode 100644
index 0000000..89150db
--- /dev/null
+++ b/experiments/Robot_arm/A3C.py
@@ -0,0 +1,214 @@
+"""
+Environment is a Robot Arm. The arm tries to get to the blue point.
+The environment returns geometric (distance) information for the arm to learn from.
+
+The farther the arm is from the blue point, the less reward; touching the blue point gives r += 1; staying on the blue point for a while gives r = +10.
+
+You can train this RL by setting LOAD = False; after training, the model will be stored in a local folder.
+Set LOAD = True to reload the trained model for playing.
+
+You can customize this script in any way you want.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+
+Requirement:
+pyglet >= 1.2.4
+numpy >= 1.12.1
+tensorflow >= 1.0.1
+"""
+
+import multiprocessing
+import threading
+import tensorflow as tf
+import numpy as np
+from arm_env import ArmEnv
+
+
+# np.random.seed(1)
+# tf.set_random_seed(1)
+
+MAX_GLOBAL_EP = 2000
+MAX_EP_STEP = 300
+UPDATE_GLOBAL_ITER = 5
+N_WORKERS = multiprocessing.cpu_count()
+LR_A = 1e-4 # learning rate for actor
+LR_C = 2e-4 # learning rate for critic
+GAMMA = 0.9 # reward discount
+MODE = ['easy', 'hard']
+n_model = 1
+GLOBAL_NET_SCOPE = 'Global_Net'
+ENTROPY_BETA = 0.01
+GLOBAL_RUNNING_R = []
+GLOBAL_EP = 0
+
+
+env = ArmEnv(mode=MODE[n_model])
+N_S = env.state_dim
+N_A = env.action_dim
+A_BOUND = env.action_bound
+del env
+
+
+class ACNet(object):
+ def __init__(self, scope, globalAC=None):
+
+ if scope == GLOBAL_NET_SCOPE: # get global network
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self._build_net()
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ else: # local net, calculate losses
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
+ self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
+
+ mu, sigma, self.v = self._build_net()
+
+ td = tf.subtract(self.v_target, self.v, name='TD_error')
+ with tf.name_scope('c_loss'):
+ self.c_loss = tf.reduce_mean(tf.square(td))
+
+ with tf.name_scope('wrap_a_out'):
+ self.test = sigma[0]
+ mu, sigma = mu * A_BOUND[1], sigma + 1e-5
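+ # scale the tanh mean into the action range and add a small constant so sigma stays > 0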
+
+ normal_dist = tf.contrib.distributions.Normal(mu, sigma)
+
+ with tf.name_scope('a_loss'):
+ log_prob = normal_dist.log_prob(self.a_his)
+ exp_v = log_prob * td
+ entropy = normal_dist.entropy() # encourage exploration
+ self.exp_v = ENTROPY_BETA * entropy + exp_v
+ self.a_loss = tf.reduce_mean(-self.exp_v)
+
+ with tf.name_scope('choose_a'): # use local params to choose action
+ self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND)
+ with tf.name_scope('local_grad'):
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ self.a_grads = tf.gradients(self.a_loss, self.a_params)
+ self.c_grads = tf.gradients(self.c_loss, self.c_params)
+
+ with tf.name_scope('sync'):
+ with tf.name_scope('pull'):
+ self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
+ self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
+ with tf.name_scope('push'):
+ self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
+ self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
+
+ def _build_net(self):
+ w_init = tf.contrib.layers.xavier_initializer()
+ with tf.variable_scope('actor'):
+ l_a = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='la')
+ l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2')
+ mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
+ sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
+ with tf.variable_scope('critic'):
+ l_c = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+ l_c = tf.layers.dense(l_c, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc2')
+ v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
+ return mu, sigma, v
+
+ def update_global(self, feed_dict): # run by a local
+ _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net
+ return t
+
+ def pull_global(self): # run by a local
+ SESS.run([self.pull_a_params_op, self.pull_c_params_op])
+
+ def choose_action(self, s): # run by a local
+ s = s[np.newaxis, :]
+ return SESS.run(self.A, {self.s: s})[0]
+
+
+class Worker(object):
+ def __init__(self, name, globalAC):
+ self.env = ArmEnv(mode=MODE[n_model])
+ self.name = name
+ self.AC = ACNet(name, globalAC)
+
+ def work(self):
+ global GLOBAL_RUNNING_R, GLOBAL_EP
+ total_step = 1
+ buffer_s, buffer_a, buffer_r = [], [], []
+ while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
+ s = self.env.reset()
+ ep_r = 0
+ for ep_t in range(MAX_EP_STEP):
+ if self.name == 'W_0':
+ self.env.render()
+ a = self.AC.choose_action(s)
+ s_, r, done = self.env.step(a)
+ if ep_t == MAX_EP_STEP - 1: done = True
+ ep_r += r
+ buffer_s.append(s)
+ buffer_a.append(a)
+ buffer_r.append(r)
+
+ if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
+ if done:
+ v_s_ = 0 # terminal
+ else:
+ v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
+ buffer_v_target = []
+ for r in buffer_r[::-1]: # reverse buffer r
+ v_s_ = r + GAMMA * v_s_
+ buffer_v_target.append(v_s_)
+ buffer_v_target.reverse()
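+ # buffer_v_target now holds discounted n-step returns bootstrapped from V(s_)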
+
+ buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
+ feed_dict = {
+ self.AC.s: buffer_s,
+ self.AC.a_his: buffer_a,
+ self.AC.v_target: buffer_v_target,
+ }
+ test = self.AC.update_global(feed_dict)
+ buffer_s, buffer_a, buffer_r = [], [], []
+ self.AC.pull_global()
+
+ s = s_
+ total_step += 1
+ if done:
+ if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
+ GLOBAL_RUNNING_R.append(ep_r)
+ else:
+ GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
+ print(
+ self.name,
+ "Ep:", GLOBAL_EP,
+ "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
+ '| Var:', test,
+
+ )
+ GLOBAL_EP += 1
+ break
+
+if __name__ == "__main__":
+ SESS = tf.Session()
+
+ with tf.device("/cpu:0"):
+ OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
+ OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
+ GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
+ workers = []
+ # Create worker
+ for i in range(N_WORKERS):
+ i_name = 'W_%i' % i # worker name
+ workers.append(Worker(i_name, GLOBAL_AC))
+
+ COORD = tf.train.Coordinator()
+ SESS.run(tf.global_variables_initializer())
+
+ worker_threads = []
+ for worker in workers:
+ job = lambda: worker.work()
+ t = threading.Thread(target=job)
+ t.start()
+ worker_threads.append(t)
+ COORD.join(worker_threads)
+
+
diff --git a/experiments/Robot_arm/DDPG.py b/experiments/Robot_arm/DDPG.py
new file mode 100644
index 0000000..0eb1b8a
--- /dev/null
+++ b/experiments/Robot_arm/DDPG.py
@@ -0,0 +1,277 @@
+"""
+Environment is a Robot Arm. The arm tries to get to the blue point.
+The environment returns geometric (distance) information for the arm to learn from.
+
+The farther the arm is from the blue point, the less reward; touching the blue point gives r += 1; staying on the blue point for a while gives r = +10.
+
+You can train this RL by setting LOAD = False; after training, the model will be stored in a local folder.
+Set LOAD = True to reload the trained model for playing.
+
+You can customize this script in any way you want.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Requirement:
+pyglet >= 1.2.4
+numpy >= 1.12.1
+tensorflow >= 1.0.1
+"""
+
+import tensorflow as tf
+import numpy as np
+import os
+import shutil
+from arm_env import ArmEnv
+
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+MAX_EPISODES = 600
+MAX_EP_STEPS = 200
+LR_A = 1e-4 # learning rate for actor
+LR_C = 1e-4 # learning rate for critic
+GAMMA = 0.999 # reward discount
+REPLACE_ITER_A = 1100
+REPLACE_ITER_C = 1000
+MEMORY_CAPACITY = 10000
+BATCH_SIZE = 16
+VAR_MIN = 0.1
+RENDER = True
+LOAD = False
+MODE = ['easy', 'hard']
+n_model = 1
+
+env = ArmEnv(mode=MODE[n_model])
+STATE_DIM = env.state_dim
+ACTION_DIM = env.action_dim
+ACTION_BOUND = env.action_bound
+
+# all placeholder for tf
+with tf.name_scope('S'):
+ S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
+with tf.name_scope('A'):
+ A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
+with tf.name_scope('R'):
+ R = tf.placeholder(tf.float32, [None, 1], name='r')
+with tf.name_scope('S_'):
+ S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
+
+
+class Actor(object):
+ def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
+ self.sess = sess
+ self.a_dim = action_dim
+ self.action_bound = action_bound
+ self.lr = learning_rate
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Actor'):
+ # input s, output a
+ self.a = self._build_net(S, scope='eval_net', trainable=True)
+
+ # input s_, output a, get a_ for critic
+ self.a_ = self._build_net(S_, scope='target_net', trainable=False)
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
+
+ def _build_net(self, s, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.contrib.layers.xavier_initializer()
+ init_b = tf.constant_initializer(0.001)
+ net = tf.layers.dense(s, 200, activation=tf.nn.relu6,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l1',
+ trainable=trainable)
+ net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l2',
+ trainable=trainable)
+ net = tf.layers.dense(net, 10, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l3',
+ trainable=trainable)
+ with tf.variable_scope('a'):
+ actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
+ name='a', trainable=trainable)
+                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')   # scale output to [-action_bound, action_bound]
+ return scaled_a
+
+ def learn(self, s, a): # batch update
+ self.sess.run(self.train_op, feed_dict={S: s, A: a})
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+
+ def choose_action(self, s):
+ s = s[np.newaxis, :] # single state
+ return self.sess.run(self.a, feed_dict={S: s})[0] # single action
+
+ def add_grad_to_graph(self, a_grads):
+ with tf.variable_scope('policy_grads'):
+ self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
+
+ with tf.variable_scope('A_train'):
+ opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE) # (- learning rate) for ascent policy, div to take mean
+ self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
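+
+        # Note on the update above (explanatory comment): tf.gradients(ys=self.a, xs=self.e_params,
+        # grad_ys=a_grads) weights da/d(params) by dQ/da, i.e. it forms the deterministic policy
+        # gradient dQ/d(params) = dQ/da * da/d(params). The optimizer is created with a negative
+        # learning rate so that applying these "gradients" performs gradient ascent on Q, and the
+        # division by BATCH_SIZE averages the summed per-sample gradients.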
+
+
+class Critic(object):
+ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+ self.sess = sess
+ self.s_dim = state_dim
+ self.a_dim = action_dim
+ self.lr = learning_rate
+ self.gamma = gamma
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Critic'):
+ # Input (s, a), output q
+ self.q = self._build_net(S, A, 'eval_net', trainable=True)
+
+ # Input (s_, a_), output q_ for q_target
+ self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
+
+ with tf.variable_scope('target_q'):
+ self.target_q = R + self.gamma * self.q_
+
+ with tf.variable_scope('TD_error'):
+ self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
+
+ with tf.variable_scope('C_train'):
+ self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+ with tf.variable_scope('a_grad'):
+ self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim)
+
+ def _build_net(self, s, a, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.contrib.layers.xavier_initializer()
+ init_b = tf.constant_initializer(0.01)
+
+ with tf.variable_scope('l1'):
+ n_l1 = 200
+ w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
+ w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
+ net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
+ net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l2',
+ trainable=trainable)
+ net = tf.layers.dense(net, 10, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l3',
+ trainable=trainable)
+ with tf.variable_scope('q'):
+ q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
+ return q
+
+ def learn(self, s, a, r, s_):
+ self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_})
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+
+
+class Memory(object):
+ def __init__(self, capacity, dims):
+ self.capacity = capacity
+ self.data = np.zeros((capacity, dims))
+ self.pointer = 0
+
+ def store_transition(self, s, a, r, s_):
+ transition = np.hstack((s, a, [r], s_))
+ index = self.pointer % self.capacity # replace the old memory with new memory
+ self.data[index, :] = transition
+ self.pointer += 1
+
+ def sample(self, n):
+        assert self.pointer >= self.capacity, 'Memory has not been filled yet'
+ indices = np.random.choice(self.capacity, size=n)
+ return self.data[indices, :]
+
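+# Layout of one stored transition (explanatory note): np.hstack((s, a, [r], s_)) packs a row as
+#   [ s (STATE_DIM) | a (ACTION_DIM) | r (1) | s_ (STATE_DIM) ]
+# which is why the training loop below slices sampled batches as
+#   b_s  = b_M[:, :STATE_DIM]
+#   b_a  = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
+#   b_r  = b_M[:, -STATE_DIM - 1: -STATE_DIM]   # kept two-dimensional, shape (batch, 1)
+#   b_s_ = b_M[:, -STATE_DIM:]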
+
+sess = tf.Session()
+
+# Create actor and critic.
+actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
+critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+actor.add_grad_to_graph(critic.a_grads)
+
+M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)
+
+saver = tf.train.Saver()
+path = './'+MODE[n_model]
+
+if LOAD:
+ saver.restore(sess, tf.train.latest_checkpoint(path))
+else:
+ sess.run(tf.global_variables_initializer())
+
+
+def train():
+ var = 2. # control exploration
+
+ for ep in range(MAX_EPISODES):
+ s = env.reset()
+ ep_reward = 0
+
+ for t in range(MAX_EP_STEPS):
+ # while True:
+ if RENDER:
+ env.render()
+
+ # Added exploration noise
+ a = actor.choose_action(s)
+ a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration
+ s_, r, done = env.step(a)
+ M.store_transition(s, a, r, s_)
+
+ if M.pointer > MEMORY_CAPACITY:
+ var = max([var*.99995, VAR_MIN]) # decay the action randomness
+ b_M = M.sample(BATCH_SIZE)
+ b_s = b_M[:, :STATE_DIM]
+ b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
+ b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
+ b_s_ = b_M[:, -STATE_DIM:]
+
+ critic.learn(b_s, b_a, b_r, b_s_)
+ actor.learn(b_s, b_a)
+
+ s = s_
+ ep_reward += r
+
+ if t == MAX_EP_STEPS-1 or done:
+ # if done:
+ result = '| done' if done else '| ----'
+ print('Ep:', ep,
+ result,
+ '| R: %i' % int(ep_reward),
+ '| Explore: %.2f' % var,
+ )
+ break
+
+ if os.path.isdir(path): shutil.rmtree(path)
+ os.mkdir(path)
+ ckpt_path = os.path.join('./'+MODE[n_model], 'DDPG.ckpt')
+ save_path = saver.save(sess, ckpt_path, write_meta_graph=False)
+ print("\nSave Model %s\n" % save_path)
+
+
+def eval():
+ env.set_fps(30)
+ s = env.reset()
+ while True:
+ if RENDER:
+ env.render()
+ a = actor.choose_action(s)
+ s_, r, done = env.step(a)
+ s = s_
+
+if __name__ == '__main__':
+ if LOAD:
+ eval()
+ else:
+ train()
\ No newline at end of file
diff --git a/experiments/Robot_arm/arm_env.py b/experiments/Robot_arm/arm_env.py
new file mode 100644
index 0000000..a0eb0fd
--- /dev/null
+++ b/experiments/Robot_arm/arm_env.py
@@ -0,0 +1,218 @@
+"""
+Environment for Robot Arm.
+You can customize this script however you want.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+
+Requirement:
+pyglet >= 1.2.4
+numpy >= 1.12.1
+"""
+import numpy as np
+import pyglet
+
+
+pyglet.clock.set_fps_limit(10000)
+
+
+class ArmEnv(object):
+ action_bound = [-1, 1]
+ action_dim = 2
+ state_dim = 7
+ dt = .1 # refresh rate
+ arm1l = 100
+ arm2l = 100
+ viewer = None
+ viewer_xy = (400, 400)
+ get_point = False
+ mouse_in = np.array([False])
+ point_l = 15
+ grab_counter = 0
+
+ def __init__(self, mode='easy'):
+ # node1 (l, d_rad, x, y),
+ # node2 (l, d_rad, x, y)
+ self.mode = mode
+ self.arm_info = np.zeros((2, 4))
+ self.arm_info[0, 0] = self.arm1l
+ self.arm_info[1, 0] = self.arm2l
+ self.point_info = np.array([250, 303])
+ self.point_info_init = self.point_info.copy()
+ self.center_coord = np.array(self.viewer_xy)/2
+
+ def step(self, action):
+ # action = (node1 angular v, node2 angular v)
+ action = np.clip(action, *self.action_bound)
+ self.arm_info[:, 1] += action * self.dt
+ self.arm_info[:, 1] %= np.pi * 2
+
+ arm1rad = self.arm_info[0, 1]
+ arm2rad = self.arm_info[1, 1]
+ arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)])
+ arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)])
+ self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1)
+ self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2)
+
+ s, arm2_distance = self._get_state()
+ r = self._r_func(arm2_distance)
+
+ return s, r, self.get_point
+
+ def reset(self):
+ self.get_point = False
+ self.grab_counter = 0
+
+ if self.mode == 'hard':
+ pxy = np.clip(np.random.rand(2) * self.viewer_xy[0], 100, 300)
+ self.point_info[:] = pxy
+ else:
+ arm1rad, arm2rad = np.random.rand(2) * np.pi * 2
+ self.arm_info[0, 1] = arm1rad
+ self.arm_info[1, 1] = arm2rad
+ arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)])
+ arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)])
+ self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1)
+ self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2)
+
+ self.point_info[:] = self.point_info_init
+ return self._get_state()[0]
+
+ def render(self):
+ if self.viewer is None:
+ self.viewer = Viewer(*self.viewer_xy, self.arm_info, self.point_info, self.point_l, self.mouse_in)
+ self.viewer.render()
+
+ def sample_action(self):
+ return np.random.uniform(*self.action_bound, size=self.action_dim)
+
+ def set_fps(self, fps=30):
+ pyglet.clock.set_fps_limit(fps)
+
+ def _get_state(self):
+        # return the state vector and the raw (dx, dy) distance between the arm's finger tip and the blue point
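+        # Breakdown of the 7-dim state (explanatory note):
+        #   [0]    in_point flag: 1 if the finger is currently on the blue point, else 0
+        #   [1:5]  (dx, dy) of both arm segments' end points relative to the blue point, divided by 200
+        #   [5:7]  (dx, dy) of the window center relative to the blue point, divided by 200
+        # The raw t_arms[-2:] returned alongside the state is used by _r_func to compute the reward.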
+ arm_end = self.arm_info[:, 2:4]
+ t_arms = np.ravel(arm_end - self.point_info)
+ center_dis = (self.center_coord - self.point_info)/200
+ in_point = 1 if self.grab_counter > 0 else 0
+ return np.hstack([in_point, t_arms/200, center_dis,
+ # arm1_distance_p, arm1_distance_b,
+ ]), t_arms[-2:]
+
+ def _r_func(self, distance):
+ t = 50
+ abs_distance = np.sqrt(np.sum(np.square(distance)))
+ r = -abs_distance/200
+ if abs_distance < self.point_l and (not self.get_point):
+ r += 1.
+ self.grab_counter += 1
+ if self.grab_counter > t:
+ r += 10.
+ self.get_point = True
+ elif abs_distance > self.point_l:
+ self.grab_counter = 0
+ self.get_point = False
+ return r
+
+
+class Viewer(pyglet.window.Window):
+ color = {
+ 'background': [1]*3 + [1]
+ }
+ fps_display = pyglet.clock.ClockDisplay()
+ bar_thc = 5
+
+ def __init__(self, width, height, arm_info, point_info, point_l, mouse_in):
+ super(Viewer, self).__init__(width, height, resizable=False, caption='Arm', vsync=False) # vsync=False to not use the monitor FPS
+ self.set_location(x=80, y=10)
+ pyglet.gl.glClearColor(*self.color['background'])
+
+ self.arm_info = arm_info
+ self.point_info = point_info
+ self.mouse_in = mouse_in
+ self.point_l = point_l
+
+ self.center_coord = np.array((min(width, height)/2, ) * 2)
+ self.batch = pyglet.graphics.Batch()
+
+ arm1_box, arm2_box, point_box = [0]*8, [0]*8, [0]*8
+ c1, c2, c3 = (249, 86, 86)*4, (86, 109, 249)*4, (249, 39, 65)*4
+ self.point = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', point_box), ('c3B', c2))
+ self.arm1 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm1_box), ('c3B', c1))
+ self.arm2 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm2_box), ('c3B', c1))
+
+ def render(self):
+ pyglet.clock.tick()
+ self._update_arm()
+ self.switch_to()
+ self.dispatch_events()
+ self.dispatch_event('on_draw')
+ self.flip()
+
+ def on_draw(self):
+ self.clear()
+ self.batch.draw()
+ # self.fps_display.draw()
+
+ def _update_arm(self):
+ point_l = self.point_l
+ point_box = (self.point_info[0] - point_l, self.point_info[1] - point_l,
+ self.point_info[0] + point_l, self.point_info[1] - point_l,
+ self.point_info[0] + point_l, self.point_info[1] + point_l,
+ self.point_info[0] - point_l, self.point_info[1] + point_l)
+ self.point.vertices = point_box
+
+ arm1_coord = (*self.center_coord, *(self.arm_info[0, 2:4])) # (x0, y0, x1, y1)
+ arm2_coord = (*(self.arm_info[0, 2:4]), *(self.arm_info[1, 2:4])) # (x1, y1, x2, y2)
+ arm1_thick_rad = np.pi / 2 - self.arm_info[0, 1]
+ x01, y01 = arm1_coord[0] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] + np.sin(
+ arm1_thick_rad) * self.bar_thc
+ x02, y02 = arm1_coord[0] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] - np.sin(
+ arm1_thick_rad) * self.bar_thc
+ x11, y11 = arm1_coord[2] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] - np.sin(
+ arm1_thick_rad) * self.bar_thc
+ x12, y12 = arm1_coord[2] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] + np.sin(
+ arm1_thick_rad) * self.bar_thc
+ arm1_box = (x01, y01, x02, y02, x11, y11, x12, y12)
+ arm2_thick_rad = np.pi / 2 - self.arm_info[1, 1]
+ x11_, y11_ = arm2_coord[0] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] - np.sin(
+ arm2_thick_rad) * self.bar_thc
+ x12_, y12_ = arm2_coord[0] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] + np.sin(
+ arm2_thick_rad) * self.bar_thc
+ x21, y21 = arm2_coord[2] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] + np.sin(
+ arm2_thick_rad) * self.bar_thc
+ x22, y22 = arm2_coord[2] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] - np.sin(
+ arm2_thick_rad) * self.bar_thc
+ arm2_box = (x11_, y11_, x12_, y12_, x21, y21, x22, y22)
+ self.arm1.vertices = arm1_box
+ self.arm2.vertices = arm2_box
+
+ def on_key_press(self, symbol, modifiers):
+ if symbol == pyglet.window.key.UP:
+ self.arm_info[0, 1] += .1
+ print(self.arm_info[:, 2:4] - self.point_info)
+ elif symbol == pyglet.window.key.DOWN:
+ self.arm_info[0, 1] -= .1
+ print(self.arm_info[:, 2:4] - self.point_info)
+ elif symbol == pyglet.window.key.LEFT:
+ self.arm_info[1, 1] += .1
+ print(self.arm_info[:, 2:4] - self.point_info)
+ elif symbol == pyglet.window.key.RIGHT:
+ self.arm_info[1, 1] -= .1
+ print(self.arm_info[:, 2:4] - self.point_info)
+ elif symbol == pyglet.window.key.Q:
+ pyglet.clock.set_fps_limit(1000)
+ elif symbol == pyglet.window.key.A:
+ pyglet.clock.set_fps_limit(30)
+
+ def on_mouse_motion(self, x, y, dx, dy):
+ self.point_info[:] = [x, y]
+
+ def on_mouse_enter(self, x, y):
+ self.mouse_in[0] = True
+
+ def on_mouse_leave(self, x, y):
+ self.mouse_in[0] = False
+
+
+
diff --git a/experiments/Solve_BipedalWalker/A3C.py b/experiments/Solve_BipedalWalker/A3C.py
new file mode 100644
index 0000000..7f4bc45
--- /dev/null
+++ b/experiments/Solve_BipedalWalker/A3C.py
@@ -0,0 +1,209 @@
+"""
+Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning.
+
+The BipedalWalker example.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import multiprocessing
+import threading
+import tensorflow as tf
+import numpy as np
+import gym
+import os
+import shutil
+
+
+GAME = 'BipedalWalker-v2'
+OUTPUT_GRAPH = False
+LOG_DIR = './log'
+N_WORKERS = multiprocessing.cpu_count()
+MAX_GLOBAL_EP = 8000
+GLOBAL_NET_SCOPE = 'Global_Net'
+UPDATE_GLOBAL_ITER = 10
+GAMMA = 0.999
+ENTROPY_BETA = 0.005
+LR_A = 0.00002 # learning rate for actor
+LR_C = 0.0001 # learning rate for critic
+GLOBAL_RUNNING_R = []
+GLOBAL_EP = 0
+
+env = gym.make(GAME)
+
+N_S = env.observation_space.shape[0]
+N_A = env.action_space.shape[0]
+A_BOUND = [env.action_space.low, env.action_space.high]
+del env
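+
+# For reference (informational note, not used by the code): in BipedalWalker-v2 the observation has
+# 24 dimensions (hull angle and velocities, joint positions and speeds, leg ground contacts, and
+# 10 lidar rangefinder readings), the action has 4 dimensions (torques for the four joints), and
+# A_BOUND is [-1, 1] per action dimension.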
+
+
+class ACNet(object):
+ def __init__(self, scope, globalAC=None):
+
+ if scope == GLOBAL_NET_SCOPE: # get global network
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self._build_net()
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ else: # local net, calculate losses
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
+ self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
+
+ mu, sigma, self.v = self._build_net()
+
+ td = tf.subtract(self.v_target, self.v, name='TD_error')
+ with tf.name_scope('c_loss'):
+ self.c_loss = tf.reduce_mean(tf.square(td))
+
+ with tf.name_scope('wrap_a_out'):
+ self.test = sigma[0]
+ mu, sigma = mu * A_BOUND[1], sigma + 1e-5
+
+ normal_dist = tf.contrib.distributions.Normal(mu, sigma)
+
+ with tf.name_scope('a_loss'):
+ log_prob = normal_dist.log_prob(self.a_his)
+ exp_v = log_prob * td
+ entropy = normal_dist.entropy() # encourage exploration
+ self.exp_v = ENTROPY_BETA * entropy + exp_v
+ self.a_loss = tf.reduce_mean(-self.exp_v)
+
+ with tf.name_scope('choose_a'): # use local params to choose action
+ self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND)
+ with tf.name_scope('local_grad'):
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ self.a_grads = tf.gradients(self.a_loss, self.a_params)
+ self.c_grads = tf.gradients(self.c_loss, self.c_params)
+
+ with tf.name_scope('sync'):
+ with tf.name_scope('pull'):
+ self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
+ self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
+ with tf.name_scope('push'):
+ self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
+ self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
+
+ def _build_net(self):
+ w_init = tf.contrib.layers.xavier_initializer()
+ with tf.variable_scope('actor'):
+ l_a = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='la')
+ l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2')
+ mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
+ sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
+ with tf.variable_scope('critic'):
+ l_c = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+ l_c = tf.layers.dense(l_c, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc2')
+ v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
+ return mu, sigma, v
+
+ def update_global(self, feed_dict): # run by a local
+        _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict)  # local grads applied to the global net
+ return t
+
+ def pull_global(self): # run by a local
+ SESS.run([self.pull_a_params_op, self.pull_c_params_op])
+
+ def choose_action(self, s): # run by a local
+ s = s[np.newaxis, :]
+ return SESS.run(self.A, {self.s: s})[0]
+
+
+class Worker(object):
+ def __init__(self, name, globalAC):
+ self.env = gym.make(GAME)
+ self.name = name
+ self.AC = ACNet(name, globalAC)
+
+ def work(self):
+ global GLOBAL_RUNNING_R, GLOBAL_EP
+ total_step = 1
+ buffer_s, buffer_a, buffer_r = [], [], []
+ while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
+ s = self.env.reset()
+ ep_r = 0
+ while True:
+ if self.name == 'W_0' and total_step % 30 == 0:
+ self.env.render()
+ a = self.AC.choose_action(s)
+ s_, r, done, info = self.env.step(a)
+ if r == -100: r = -2
+
+ ep_r += r
+ buffer_s.append(s)
+ buffer_a.append(a)
+ buffer_r.append(r)
+
+ if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
+ if done:
+ v_s_ = 0 # terminal
+ else:
+ v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
+ buffer_v_target = []
+ for r in buffer_r[::-1]: # reverse buffer r
+ v_s_ = r + GAMMA * v_s_
+ buffer_v_target.append(v_s_)
+ buffer_v_target.reverse()
+
+ buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
+ feed_dict = {
+ self.AC.s: buffer_s,
+ self.AC.a_his: buffer_a,
+ self.AC.v_target: buffer_v_target,
+ }
+ test = self.AC.update_global(feed_dict)
+ buffer_s, buffer_a, buffer_r = [], [], []
+ self.AC.pull_global()
+
+ s = s_
+ total_step += 1
+ if done:
+ achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------'
+ if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
+ GLOBAL_RUNNING_R.append(ep_r)
+ else:
+ GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
+ print(
+ self.name,
+ "Ep:", GLOBAL_EP,
+ achieve,
+ "| Pos: %i" % self.env.unwrapped.hull.position[0],
+ "| RR: %.1f" % GLOBAL_RUNNING_R[-1],
+ '| EpR: %.1f' % ep_r,
+ '| var:', test,
+ )
+ GLOBAL_EP += 1
+ break
+
+if __name__ == "__main__":
+ SESS = tf.Session()
+
+ with tf.device("/cpu:0"):
+ OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
+ OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
+ GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
+ workers = []
+ # Create worker
+ for i in range(N_WORKERS):
+ i_name = 'W_%i' % i # worker name
+ workers.append(Worker(i_name, GLOBAL_AC))
+
+ COORD = tf.train.Coordinator()
+ SESS.run(tf.global_variables_initializer())
+
+ worker_threads = []
+ for worker in workers:
+ job = lambda: worker.work()
+ t = threading.Thread(target=job)
+ t.start()
+ worker_threads.append(t)
+ COORD.join(worker_threads)
+
+
diff --git a/experiments/Solve_BipedalWalker/A3C_rnn.py b/experiments/Solve_BipedalWalker/A3C_rnn.py
new file mode 100644
index 0000000..acdc951
--- /dev/null
+++ b/experiments/Solve_BipedalWalker/A3C_rnn.py
@@ -0,0 +1,235 @@
+"""
+Asynchronous Advantage Actor Critic (A3C) + RNN, Reinforcement Learning.
+
+The BipedalWalker example.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import multiprocessing
+import threading
+import tensorflow as tf
+import numpy as np
+import gym
+import os
+import shutil
+
+
+GAME = 'BipedalWalker-v2'
+OUTPUT_GRAPH = False
+LOG_DIR = './log'
+N_WORKERS = multiprocessing.cpu_count()
+MAX_GLOBAL_EP = 8000
+GLOBAL_NET_SCOPE = 'Global_Net'
+UPDATE_GLOBAL_ITER = 10
+GAMMA = 0.99
+ENTROPY_BETA = 0.005
+LR_A = 0.00001 # learning rate for actor
+LR_C = 0.0001 # learning rate for critic
+GLOBAL_RUNNING_R = []
+GLOBAL_EP = 0
+
+env = gym.make(GAME)
+
+N_S = env.observation_space.shape[0]
+N_A = env.action_space.shape[0]
+A_BOUND = [env.action_space.low, env.action_space.high]
+del env
+
+
+class ACNet(object):
+ def __init__(self, scope, globalAC=None):
+
+ if scope == GLOBAL_NET_SCOPE: # get global network
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self._build_net(N_A)
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ else: # local net, calculate losses
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
+ self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
+
+ mu, sigma, self.v = self._build_net(N_A)
+
+ td = tf.subtract(self.v_target, self.v, name='TD_error')
+ with tf.name_scope('c_loss'):
+ self.c_loss = tf.reduce_mean(tf.square(td))
+
+ with tf.name_scope('wrap_a_out'):
+ self.test = sigma[0]
+ mu, sigma = mu * A_BOUND[1], sigma + 1e-5
+
+ normal_dist = tf.contrib.distributions.Normal(mu, sigma)
+
+ with tf.name_scope('a_loss'):
+ log_prob = normal_dist.log_prob(self.a_his)
+ exp_v = log_prob * td
+ entropy = normal_dist.entropy() # encourage exploration
+ self.exp_v = ENTROPY_BETA * entropy + exp_v
+ self.a_loss = tf.reduce_mean(-self.exp_v)
+
+ with tf.name_scope('choose_a'): # use local params to choose action
+ self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1])
+
+ with tf.name_scope('local_grad'):
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ self.a_grads = tf.gradients(self.a_loss, self.a_params)
+ self.c_grads = tf.gradients(self.c_loss, self.c_params)
+
+ with tf.name_scope('sync'):
+ with tf.name_scope('pull'):
+ self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in
+ zip(self.a_params, globalAC.a_params)]
+ self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in
+ zip(self.c_params, globalAC.c_params)]
+ with tf.name_scope('push'):
+ self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
+ self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
+
+ def _build_net(self, n_a):
+ w_init = tf.random_normal_initializer(0., .01)
+ with tf.variable_scope('critic'): # only critic controls the rnn update
+ cell_size = 128
+ s = tf.expand_dims(self.s, axis=1,
+ name='timely_input') # [time_step, feature] => [time_step, batch, feature]
+ rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
+ self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32)
+ outputs, self.final_state = tf.nn.dynamic_rnn(
+ cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
+ cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation
+ l_c = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+ v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
+
+ with tf.variable_scope('actor'): # state representation is based on critic
+ cell_out = tf.stop_gradient(cell_out, name='c_cell_out') # from what critic think it is
+ l_a = tf.layers.dense(cell_out, 400, tf.nn.relu6, kernel_initializer=w_init, name='la')
+ mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
+ sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') # restrict variance
+ return mu, sigma, v
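+
+        # Design note (explanatory comment): the RNN cell is built inside the 'critic' scope, so only
+        # critic gradients update it. The actor reads the RNN output through tf.stop_gradient, sharing
+        # the critic's learned state representation without pushing its own gradients into the RNN.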
+
+ def update_global(self, feed_dict): # run by a local
+        _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict)  # local grads applied to the global net
+ return t
+
+ def pull_global(self): # run by a local
+ SESS.run([self.pull_a_params_op, self.pull_c_params_op])
+
+ def choose_action(self, s, cell_state): # run by a local
+ s = s[np.newaxis, :]
+ a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state})
+ return a[0], cell_state
+
+
+class Worker(object):
+ def __init__(self, name, globalAC):
+ self.env = gym.make(GAME)
+ self.name = name
+ self.AC = ACNet(name, globalAC)
+
+ def work(self):
+ global GLOBAL_RUNNING_R, GLOBAL_EP
+ total_step = 1
+ buffer_s, buffer_a, buffer_r = [], [], []
+ while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
+ s = self.env.reset()
+ ep_r = 0
+ rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning
+ keep_state = rnn_state.copy() # keep rnn state for updating global net
+ while True:
+ if self.name == 'W_0' and total_step % 30 == 0:
+ self.env.render()
+
+ a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state
+ s_, r, done, info = self.env.step(a)
+ if r == -100: r = -2
+
+ ep_r += r
+ buffer_s.append(s)
+ buffer_a.append(a)
+ buffer_r.append(r)
+
+ if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
+ if done:
+ v_s_ = 0 # terminal
+ else:
+ v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[
+ 0, 0]
+ buffer_v_target = []
+ for r in buffer_r[::-1]: # reverse buffer r
+ v_s_ = r + GAMMA * v_s_
+ buffer_v_target.append(v_s_)
+ buffer_v_target.reverse()
+
+ buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(
+ buffer_v_target)
+
+ feed_dict = {
+ self.AC.s: buffer_s,
+ self.AC.a_his: buffer_a,
+ self.AC.v_target: buffer_v_target,
+ self.AC.init_state: keep_state,
+ }
+
+ test = self.AC.update_global(feed_dict)
+ buffer_s, buffer_a, buffer_r = [], [], []
+ self.AC.pull_global()
+ keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_
+
+ s = s_
+ rnn_state = rnn_state_ # renew rnn state
+ total_step += 1
+
+ if done:
+ achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------'
+ if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
+ GLOBAL_RUNNING_R.append(ep_r)
+ else:
+ GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
+ print(
+ self.name,
+ "Ep:", GLOBAL_EP,
+ achieve,
+ "| Pos: %i" % self.env.unwrapped.hull.position[0],
+ "| RR: %.1f" % GLOBAL_RUNNING_R[-1],
+ '| EpR: %.1f' % ep_r,
+ '| var:', test,
+ )
+ GLOBAL_EP += 1
+ break
+
+
+if __name__ == "__main__":
+ SESS = tf.Session()
+
+ with tf.device("/cpu:0"):
+ OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA', decay=0.95)
+ OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC', decay=0.95)
+ GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
+ workers = []
+ # Create worker
+ for i in range(N_WORKERS):
+ i_name = 'W_%i' % i # worker name
+ workers.append(Worker(i_name, GLOBAL_AC))
+
+ COORD = tf.train.Coordinator()
+ SESS.run(tf.global_variables_initializer())
+
+ if OUTPUT_GRAPH:
+ if os.path.exists(LOG_DIR):
+ shutil.rmtree(LOG_DIR)
+ tf.summary.FileWriter(LOG_DIR, SESS.graph)
+
+ worker_threads = []
+ for worker in workers:
+ t = threading.Thread(target=worker.work)
+ t.start()
+ worker_threads.append(t)
+ COORD.join(worker_threads)
diff --git a/experiments/Solve_BipedalWalker/DDPG.py b/experiments/Solve_BipedalWalker/DDPG.py
new file mode 100644
index 0000000..9f0a824
--- /dev/null
+++ b/experiments/Solve_BipedalWalker/DDPG.py
@@ -0,0 +1,390 @@
+import tensorflow as tf
+import numpy as np
+import gym
+import os
+import shutil
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+MAX_EPISODES = 2000
+LR_A = 0.0005 # learning rate for actor
+LR_C = 0.0005 # learning rate for critic
+GAMMA = 0.999 # reward discount
+REPLACE_ITER_A = 1700
+REPLACE_ITER_C = 1500
+MEMORY_CAPACITY = 200000
+BATCH_SIZE = 32
+DISPLAY_THRESHOLD = 100  # start rendering once the running reward exceeds 100
+DATA_PATH = './data'
+LOAD_MODEL = False
+SAVE_MODEL_ITER = 100000
+RENDER = False
+OUTPUT_GRAPH = False
+ENV_NAME = 'BipedalWalker-v2'
+
+GLOBAL_STEP = tf.Variable(0, trainable=False)
+INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
+LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True)
+LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True)
+END_POINT = (200 - 10) * (14/30) # from game
+
+env = gym.make(ENV_NAME)
+env.seed(1)
+
+STATE_DIM = env.observation_space.shape[0] # 24
+ACTION_DIM = env.action_space.shape[0] # 4
+ACTION_BOUND = env.action_space.high # [1, 1, 1, 1]
+
+# all placeholder for tf
+with tf.name_scope('S'):
+ S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
+with tf.name_scope('A'):
+ A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
+with tf.name_scope('R'):
+ R = tf.placeholder(tf.float32, [None, 1], name='r')
+with tf.name_scope('S_'):
+ S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
+
+############################### Actor ####################################
+
+class Actor(object):
+ def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
+ self.sess = sess
+ self.a_dim = action_dim
+ self.action_bound = action_bound
+ self.lr = learning_rate
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Actor'):
+ # input s, output a
+ self.a = self._build_net(S, scope='eval_net', trainable=True)
+
+ # input s_, output a, get a_ for critic
+ self.a_ = self._build_net(S_, scope='target_net', trainable=False)
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
+
+ def _build_net(self, s, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.random_normal_initializer(0., 0.01)
+ init_b = tf.constant_initializer(0.01)
+ net = tf.layers.dense(s, 500, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable)
+ net = tf.layers.dense(net, 200, activation=tf.nn.relu,
+ kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable)
+
+ with tf.variable_scope('a'):
+ actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
+ bias_initializer=init_b, name='a', trainable=trainable)
+                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # scale output to [-action_bound, action_bound]
+ return scaled_a
+
+ def learn(self, s, a): # batch update
+ self.sess.run(self.train_op, feed_dict={S: s, A: a})
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+
+ def choose_action(self, s):
+ s = s[np.newaxis, :] # single state
+ return self.sess.run(self.a, feed_dict={S: s})[0] # single action
+
+ def add_grad_to_graph(self, a_grads):
+ with tf.variable_scope('policy_grads'):
+ # ys = policy;
+ # xs = policy's parameters;
+ # self.a_grads = the gradients of the policy to get more Q
+ # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
+ self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
+
+ with tf.variable_scope('A_train'):
+ opt = tf.train.AdamOptimizer(-self.lr/BATCH_SIZE) # (- learning rate) for ascent policy
+ self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params), global_step=GLOBAL_STEP)
+
+
+############################### Critic ####################################
+
+class Critic(object):
+ def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_):
+ self.sess = sess
+ self.s_dim = state_dim
+ self.a_dim = action_dim
+ self.lr = learning_rate
+ self.gamma = gamma
+ self.t_replace_iter = t_replace_iter
+ self.t_replace_counter = 0
+
+ with tf.variable_scope('Critic'):
+ # Input (s, a), output q
+ self.q = self._build_net(S, A, 'eval_net', trainable=True)
+
+ # Input (s_, a_), output q_ for q_target
+ self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
+
+ self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
+ self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
+
+ with tf.variable_scope('target_q'):
+ self.target_q = R + self.gamma * self.q_
+
+ with tf.variable_scope('abs_TD'):
+ self.abs_td = tf.abs(self.target_q - self.q)
+ self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
+ with tf.variable_scope('TD_error'):
+ self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.target_q, self.q))
+
+ with tf.variable_scope('C_train'):
+ self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP)
+
+ with tf.variable_scope('a_grad'):
+ self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim)
+
+ def _build_net(self, s, a, scope, trainable):
+ with tf.variable_scope(scope):
+ init_w = tf.random_normal_initializer(0., 0.01)
+ init_b = tf.constant_initializer(0.01)
+
+ with tf.variable_scope('l1'):
+ n_l1 = 700
+ # combine the action and states together in this way
+ w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
+ w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
+ b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
+ net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
+ with tf.variable_scope('l2'):
+ net = tf.layers.dense(net, 20, activation=tf.nn.relu, kernel_initializer=init_w,
+ bias_initializer=init_b, name='l2', trainable=trainable)
+ with tf.variable_scope('q'):
+ q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
+ return q
+
+ def learn(self, s, a, r, s_, ISW):
+ _, abs_td = self.sess.run([self.train_op, self.abs_td], feed_dict={S: s, A: a, R: r, S_: s_, self.ISWeights: ISW})
+ if self.t_replace_counter % self.t_replace_iter == 0:
+ self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
+ self.t_replace_counter += 1
+ return abs_td
+
+
+class SumTree(object):
+ """
+    This SumTree code is a modified version of:
+    https://github.com/jaara/AI-blog/blob/master/SumTree.py
+
+    It stores the data together with its priority in the tree and data arrays.
+ """
+ data_pointer = 0
+
+ def __init__(self, capacity):
+ self.capacity = capacity # for all priority values
+ self.tree = np.zeros(2 * capacity - 1)+1e-5
+        # [--------------Parent nodes-------------][-------leaves to record priority-------]
+ # size: capacity - 1 size: capacity
+ self.data = np.zeros(capacity, dtype=object) # for all transitions
+ # [--------------data frame-------------]
+ # size: capacity
+
+ def add_new_priority(self, p, data):
+ leaf_idx = self.data_pointer + self.capacity - 1
+
+ self.data[self.data_pointer] = data # update data_frame
+ self.update(leaf_idx, p) # update tree_frame
+ self.data_pointer += 1
+ if self.data_pointer >= self.capacity: # replace when exceed the capacity
+ self.data_pointer = 0
+
+ def update(self, tree_idx, p):
+ change = p - self.tree[tree_idx]
+
+ self.tree[tree_idx] = p
+ self._propagate_change(tree_idx, change)
+
+ def _propagate_change(self, tree_idx, change):
+ """change the sum of priority value in all parent nodes"""
+ parent_idx = (tree_idx - 1) // 2
+ self.tree[parent_idx] += change
+ if parent_idx != 0:
+ self._propagate_change(parent_idx, change)
+
+ def get_leaf(self, lower_bound):
+        leaf_idx = self._retrieve(lower_bound)  # find the leaf whose cumulative priority range contains lower_bound
+ data_idx = leaf_idx - self.capacity + 1
+ return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
+
+ def _retrieve(self, lower_bound, parent_idx=0):
+ """
+ Tree structure and array storage:
+
+ Tree index:
+ 0 -> storing priority sum
+ / \
+ 1 2
+ / \ / \
+ 3 4 5 6 -> storing priority for transitions
+
+ Array type for storing:
+ [0,1,2,3,4,5,6]
+ """
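+        # Worked example (illustrative): with capacity = 4 and leaf priorities [1, 2, 3, 4],
+        # the tree array is [10, 3, 7, 1, 2, 3, 4]. _retrieve(5) compares 5 with the left
+        # child sum 3; since 5 > 3 it recurses to the right with 5 - 3 = 2, where 2 <= 3
+        # selects the leaf holding priority 3 (tree index 5).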
+ left_child_idx = 2 * parent_idx + 1
+ right_child_idx = left_child_idx + 1
+
+ if left_child_idx >= len(self.tree): # end search when no more child
+ return parent_idx
+
+ if self.tree[left_child_idx] == self.tree[right_child_idx]:
+ return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
+ if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
+ return self._retrieve(lower_bound, left_child_idx)
+ else:
+ return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)
+
+ @property
+ def root_priority(self):
+ return self.tree[0] # the root
+
+
+class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
+ """
+    This Memory code is a modified version of:
+ https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
+ """
+ epsilon = 0.001 # small amount to avoid zero priority
+ alpha = 0.6 # [0~1] convert the importance of TD error to priority
+ beta = 0.4 # importance-sampling, from initial value increasing to 1
+ beta_increment_per_sampling = 1e-5 # annealing the bias
+ abs_err_upper = 1 # for stability refer to paper
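+
+    # How these constants are used below (explanatory note):
+    #   priority              p_i  = (|TD error| + epsilon) ** alpha, clipped at abs_err_upper
+    #   sampling probability  P(i) = p_i / sum_k p_k
+    #   IS weight             w_i  = (capacity * P(i)) ** (-beta), normalized by the largest weight
+    # beta is annealed toward 1 so the importance-sampling correction strengthens over training.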
+
+ def __init__(self, capacity):
+ self.tree = SumTree(capacity)
+
+ def store(self, error, transition):
+ p = self._get_priority(error)
+ self.tree.add_new_priority(p, transition)
+
+ def prio_sample(self, n):
+ batch_idx, batch_memory, ISWeights = [], [], []
+ segment = self.tree.root_priority / n
+ self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
+
+ min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
+ maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
+ for i in range(n):
+ a = segment * i
+ b = segment * (i + 1)
+ lower_bound = np.random.uniform(a, b)
+ while True:
+ idx, p, data = self.tree.get_leaf(lower_bound)
+ if type(data) is int:
+ i -= 1
+ lower_bound = np.random.uniform(segment * i, segment * (i+1))
+ else:
+ break
+ prob = p / self.tree.root_priority
+ ISWeights.append(self.tree.capacity * prob)
+ batch_idx.append(idx)
+ batch_memory.append(data)
+
+ ISWeights = np.vstack(ISWeights)
+ ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
+ return batch_idx, np.vstack(batch_memory), ISWeights
+
+ def random_sample(self, n):
+ idx = np.random.randint(0, self.tree.capacity, size=n, dtype=np.int)
+ return np.vstack(self.tree.data[idx])
+
+ def update(self, idx, error):
+ p = self._get_priority(error)
+ self.tree.update(idx, p)
+
+ def _get_priority(self, error):
+ error += self.epsilon # avoid 0
+ clipped_error = np.clip(error, 0, self.abs_err_upper)
+ return np.power(clipped_error, self.alpha)
+
+
+sess = tf.Session()
+
+# Create actor and critic.
+actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A)
+critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_)
+actor.add_grad_to_graph(critic.a_grads)
+
+M = Memory(MEMORY_CAPACITY)
+
+saver = tf.train.Saver(max_to_keep=100)
+
+if LOAD_MODEL:
+ all_ckpt = tf.train.get_checkpoint_state('./data', 'checkpoint').all_model_checkpoint_paths
+ saver.restore(sess, all_ckpt[-1])
+else:
+ if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH)
+ os.mkdir(DATA_PATH)
+ sess.run(tf.global_variables_initializer())
+
+if OUTPUT_GRAPH:
+ tf.summary.FileWriter('logs', graph=sess.graph)
+
+var = 3 # control exploration
+var_min = 0.01
+
+for i_episode in range(MAX_EPISODES):
+ # s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.)
+ s = env.reset()
+ ep_r = 0
+ while True:
+ if RENDER:
+ env.render()
+ a = actor.choose_action(s)
+ a = np.clip(np.random.normal(a, var), -1, 1) # add randomness to action selection for exploration
+ s_, r, done, _ = env.step(a) # r = total 300+ points up to the far end. If the robot falls, it gets -100.
+
+ if r == -100: r = -2
+ ep_r += r
+
+ transition = np.hstack((s, a, [r], s_))
+ max_p = np.max(M.tree.tree[-M.tree.capacity:])
+ M.store(max_p, transition)
+
+ if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY/20:
+ var = max([var*0.9999, var_min]) # decay the action randomness
+ tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE) # for critic update
+ b_s = b_M[:, :STATE_DIM]
+ b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
+ b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
+ b_s_ = b_M[:, -STATE_DIM:]
+
+ abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
+ actor.learn(b_s, b_a)
+ for i in range(len(tree_idx)): # update priority
+ idx = tree_idx[i]
+ M.update(idx, abs_td[i])
+ if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0:
+ ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt')
+ save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False)
+ print("\nSave Model %s\n" % save_path)
+
+ if done:
+ if "running_r" not in globals():
+ running_r = ep_r
+ else:
+ running_r = 0.95*running_r + 0.05*ep_r
+ if running_r > DISPLAY_THRESHOLD: RENDER = True
+ else: RENDER = False
+
+ done = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----'
+ print('Episode:', i_episode,
+ done,
+ '| Running_r: %i' % int(running_r),
+ '| Epi_r: %.2f' % ep_r,
+ '| Exploration: %.3f' % var,
+ '| Pos: %.i' % int(env.unwrapped.hull.position[0]),
+ '| LR_A: %.6f' % sess.run(LR_A),
+ '| LR_C: %.6f' % sess.run(LR_C),
+ )
+ break
+
+ s = s_
+ sess.run(INCREASE_GS)
\ No newline at end of file
diff --git a/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan b/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan
new file mode 100644
index 0000000..7746ab0
Binary files /dev/null and b/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan differ
diff --git a/experiments/Solve_LunarLander/A3C.py b/experiments/Solve_LunarLander/A3C.py
new file mode 100644
index 0000000..a57a547
--- /dev/null
+++ b/experiments/Solve_LunarLander/A3C.py
@@ -0,0 +1,224 @@
+"""
+Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning.
+
+The LunarLander example. Convergence is possible, but this is a difficult environment and the code hardly converges.
+
+View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
+
+Using:
+tensorflow 1.0
+gym 0.8.0
+"""
+
+import multiprocessing
+import threading
+import tensorflow as tf
+import numpy as np
+import gym
+import os
+import shutil
+import matplotlib.pyplot as plt
+
+
+GAME = 'LunarLander-v2'
+OUTPUT_GRAPH = False
+LOG_DIR = './log'
+N_WORKERS = multiprocessing.cpu_count()
+MAX_GLOBAL_EP = 5000
+GLOBAL_NET_SCOPE = 'Global_Net'
+UPDATE_GLOBAL_ITER = 5
+GAMMA = 0.99
+ENTROPY_BETA = 0.001 # not useful in this case
+LR_A = 0.0005 # learning rate for actor
+LR_C = 0.001 # learning rate for critic
+GLOBAL_RUNNING_R = []
+GLOBAL_EP = 0
+
+env = gym.make(GAME)
+
+N_S = env.observation_space.shape[0]
+N_A = env.action_space.n
+del env
+
+
+class ACNet(object):
+ def __init__(self, scope, globalAC=None):
+ if scope == GLOBAL_NET_SCOPE: # get global network
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self._build_net(N_A)
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ else: # local net, calculate losses
+ with tf.variable_scope(scope):
+ self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
+ self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
+ self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
+
+ self.a_prob, self.v = self._build_net(N_A)
+
+ td = tf.subtract(self.v_target, self.v, name='TD_error')
+ with tf.name_scope('c_loss'):
+ self.c_loss = tf.reduce_mean(tf.square(td))
+
+ with tf.name_scope('a_loss'):
+ log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
+ exp_v = log_prob * td
+ entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob), axis=1, keep_dims=True) # encourage exploration
+ self.exp_v = ENTROPY_BETA * entropy + exp_v
+ self.a_loss = tf.reduce_mean(-self.exp_v)
+
+ with tf.name_scope('local_grad'):
+ self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
+ self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
+ self.a_grads = tf.gradients(self.a_loss, self.a_params)
+ self.c_grads = tf.gradients(self.c_loss, self.c_params)
+
+ with tf.name_scope('sync'):
+ with tf.name_scope('pull'):
+ self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
+ self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
+ with tf.name_scope('push'):
+ self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
+ self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
+
+ def _build_net(self, n_a):
+ w_init = tf.random_normal_initializer(0., .01)
+ with tf.variable_scope('critic'):
+ cell_size = 64
+ s = tf.expand_dims(self.s, axis=1,
+ name='timely_input') # [time_step, feature] => [time_step, batch, feature]
+ rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
+ self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32)
+ outputs, self.final_state = tf.nn.dynamic_rnn(
+ cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
+ cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation
+ l_c = tf.layers.dense(cell_out, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc')
+ v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
+ with tf.variable_scope('actor'):
+ cell_out = tf.stop_gradient(cell_out, name='c_cell_out')
+ l_a = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='la')
+ a_prob = tf.layers.dense(l_a, n_a, tf.nn.softmax, kernel_initializer=w_init, name='ap')
+
+ return a_prob, v
+
+ def update_global(self, feed_dict): # run by a local
+        SESS.run([self.update_a_op, self.update_c_op], feed_dict)  # local grads applied to the global net
+
+ def pull_global(self): # run by a local
+ SESS.run([self.pull_a_params_op, self.pull_c_params_op])
+
+ def choose_action(self, s, cell_state): # run by a local
+ prob_weights, cell_state = SESS.run([self.a_prob, self.final_state], feed_dict={self.s: s[np.newaxis, :],
+ self.init_state: cell_state})
+ action = np.random.choice(range(prob_weights.shape[1]),
+ p=prob_weights.ravel()) # select action w.r.t the actions prob
+ return action, cell_state
+
+
+class Worker(object):
+ def __init__(self, name, globalAC):
+ self.env = gym.make(GAME)
+ self.name = name
+ self.AC = ACNet(name, globalAC)
+
+ def work(self):
+ global GLOBAL_RUNNING_R, GLOBAL_EP
+ total_step = 1
+ r_scale = 100
+ buffer_s, buffer_a, buffer_r = [], [], []
+ while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
+ s = self.env.reset()
+ ep_r = 0
+ ep_t = 0
+ rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning
+ keep_state = rnn_state.copy() # keep rnn state for updating global net
+ while True:
+ # if self.name == 'W_0' and total_step % 10 == 0:
+ # self.env.render()
+ a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state
+ s_, r, done, info = self.env.step(a)
+ if r == -100: r = -10
+ ep_r += r
+ buffer_s.append(s)
+ buffer_a.append(a)
+ buffer_r.append(r/r_scale)
+
+ if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
+ if done:
+ v_s_ = 0 # terminal
+ else:
+ v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0,0]
+ buffer_v_target = []
+ for r in buffer_r[::-1]: # reverse buffer r
+ v_s_ = r + GAMMA * v_s_
+ buffer_v_target.append(v_s_)
+ buffer_v_target.reverse()
+
+ buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
+ feed_dict = {
+ self.AC.s: buffer_s,
+ self.AC.a_his: buffer_a,
+ self.AC.v_target: buffer_v_target,
+ self.AC.init_state: keep_state,
+ }
+
+ self.AC.update_global(feed_dict)
+
+ buffer_s, buffer_a, buffer_r = [], [], []
+ self.AC.pull_global()
+ keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_
+
+ s = s_
+ total_step += 1
+ rnn_state = rnn_state_ # renew rnn state
+ ep_t += 1
+ if done:
+ if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
+ GLOBAL_RUNNING_R.append(ep_r)
+ else:
+ GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
+ if not self.env.unwrapped.lander.awake: solve = '| Landed'
+ else: solve = '| ------'
+ print(
+ self.name,
+ "Ep:", GLOBAL_EP,
+ solve,
+ "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
+ )
+ GLOBAL_EP += 1
+ break
+
+if __name__ == "__main__":
+ SESS = tf.Session()
+
+ with tf.device("/cpu:0"):
+ OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
+ OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
+ GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
+ workers = []
+ # Create worker
+ for i in range(N_WORKERS):
+ i_name = 'W_%i' % i # worker name
+ workers.append(Worker(i_name, GLOBAL_AC))
+
+ COORD = tf.train.Coordinator()
+ SESS.run(tf.global_variables_initializer())
+
+ if OUTPUT_GRAPH:
+ if os.path.exists(LOG_DIR):
+ shutil.rmtree(LOG_DIR)
+ tf.summary.FileWriter(LOG_DIR, SESS.graph)
+
+ worker_threads = []
+ for worker in workers:
+ job = lambda: worker.work()
+ t = threading.Thread(target=job)
+ t.start()
+ worker_threads.append(t)
+ COORD.join(worker_threads)
+
+ plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
+ plt.xlabel('step')
+ plt.ylabel('Total moving reward')
+ plt.show()
diff --git a/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py b/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py
new file mode 100644
index 0000000..3d6ed1b
--- /dev/null
+++ b/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py
@@ -0,0 +1,307 @@
+"""
+The DQN improvements: Dueling DQN combined with Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
+
+View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+"""
+
+import numpy as np
+import tensorflow as tf
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+
+class SumTree(object):
+ """
+    This SumTree code is a modified version of:
+    https://github.com/jaara/AI-blog/blob/master/SumTree.py
+
+    It stores the data together with its priority in the tree and data arrays.
+ """
+ data_pointer = 0
+
+ def __init__(self, capacity):
+ self.capacity = capacity # for all priority values
+ self.tree = np.zeros(2 * capacity - 1)
+        # [--------------Parent nodes-------------][-------leaves to record priority-------]
+ # size: capacity - 1 size: capacity
+ self.data = np.zeros(capacity, dtype=object) # for all transitions
+ # [--------------data frame-------------]
+ # size: capacity
+
+ def add_new_priority(self, p, data):
+ leaf_idx = self.data_pointer + self.capacity - 1
+
+ self.data[self.data_pointer] = data # update data_frame
+ self.update(leaf_idx, p) # update tree_frame
+ self.data_pointer += 1
+ if self.data_pointer >= self.capacity: # replace when exceed the capacity
+ self.data_pointer = 0
+
+ def update(self, tree_idx, p):
+ change = p - self.tree[tree_idx]
+
+ self.tree[tree_idx] = p
+ self._propagate_change(tree_idx, change)
+
+ def _propagate_change(self, tree_idx, change):
+ """change the sum of priority value in all parent nodes"""
+ parent_idx = (tree_idx - 1) // 2
+ self.tree[parent_idx] += change
+ if parent_idx != 0:
+ self._propagate_change(parent_idx, change)
+
+ def get_leaf(self, lower_bound):
+        leaf_idx = self._retrieve(lower_bound)  # find the leaf whose cumulative priority range contains lower_bound
+ data_idx = leaf_idx - self.capacity + 1
+ return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
+
+ def _retrieve(self, lower_bound, parent_idx=0):
+ """
+ Tree structure and array storage:
+
+ Tree index:
+ 0 -> storing priority sum
+ / \
+ 1 2
+ / \ / \
+ 3 4 5 6 -> storing priority for transitions
+
+ Array type for storing:
+ [0,1,2,3,4,5,6]
+ """
+ left_child_idx = 2 * parent_idx + 1
+ right_child_idx = left_child_idx + 1
+
+ if left_child_idx >= len(self.tree): # end search when no more child
+ return parent_idx
+
+ if self.tree[left_child_idx] == self.tree[right_child_idx]:
+ return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
+ if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
+ return self._retrieve(lower_bound, left_child_idx)
+ else:
+ return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)
+
+ @property
+ def root_priority(self):
+ return self.tree[0] # the root
+
+
+class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
+ """
+    This Memory code is a modified version of:
+ https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
+ """
+ epsilon = 0.001 # small amount to avoid zero priority
+ alpha = 0.6 # [0~1] convert the importance of TD error to priority
+ beta = 0.4 # importance-sampling, from initial value increasing to 1
+ beta_increment_per_sampling = 1e-4 # annealing the bias
+ abs_err_upper = 1 # for stability refer to paper
+
+ def __init__(self, capacity):
+ self.tree = SumTree(capacity)
+
+ def store(self, error, transition):
+ p = self._get_priority(error)
+ self.tree.add_new_priority(p, transition)
+
+ def sample(self, n):
+ batch_idx, batch_memory, ISWeights = [], [], []
+ segment = self.tree.root_priority / n
+ self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
+
+ min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
+ maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
+ for i in range(n):
+ a = segment * i
+ b = segment * (i + 1)
+ lower_bound = np.random.uniform(a, b)
+ idx, p, data = self.tree.get_leaf(lower_bound)
+ prob = p / self.tree.root_priority
+ ISWeights.append(self.tree.capacity * prob)
+ batch_idx.append(idx)
+ batch_memory.append(data)
+
+ ISWeights = np.vstack(ISWeights)
+ ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
+ return batch_idx, np.vstack(batch_memory), ISWeights
+
+ def update(self, idx, error):
+ p = self._get_priority(error)
+ self.tree.update(idx, p)
+
+ def _get_priority(self, error):
+ error += self.epsilon # avoid 0
+ clipped_error = np.clip(error, 0, self.abs_err_upper)
+ return np.power(clipped_error, self.alpha)
+
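+# Illustrative usage sketch (added for clarity, not part of the original tutorial;
+# the 4-element transitions below are placeholders for the np.hstack((s, [a, r], s_))
+# vectors built in DuelingDQNPrioritizedReplay.store_transition):
+#
+#   memory = Memory(capacity=8)
+#   for step in range(8):
+#       memory.store(error=1.0, transition=np.array([step, 0, 0.0, step + 1]))
+#   tree_idx, batch, ISWeights = memory.sample(4)   # stratified sample, batch shape (4, 4)
+#   for idx, new_abs_td_error in zip(tree_idx, [0.5, 0.1, 0.9, 0.3]):
+#       memory.update(idx, new_abs_td_error)        # re-prioritize after a learning step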
+
+class DuelingDQNPrioritizedReplay:
+ def __init__(
+ self,
+ n_actions,
+ n_features,
+ learning_rate=0.005,
+ reward_decay=0.9,
+ e_greedy=0.9,
+ replace_target_iter=500,
+ memory_size=10000,
+ batch_size=32,
+ e_greedy_increment=None,
+ hidden=[100, 50],
+ output_graph=False,
+ sess=None,
+ ):
+ self.n_actions = n_actions
+ self.n_features = n_features
+ self.lr = learning_rate
+ self.gamma = reward_decay
+ self.epsilon_max = e_greedy
+ self.replace_target_iter = replace_target_iter
+ self.memory_size = memory_size
+ self.batch_size = batch_size
+ self.hidden = hidden
+ self.epsilon_increment = e_greedy_increment
+ self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max
+
+ self.learn_step_counter = 0
+ self._build_net()
+ self.memory = Memory(capacity=memory_size)
+
+ if sess is None:
+ self.sess = tf.Session()
+ self.sess.run(tf.global_variables_initializer())
+ else:
+ self.sess = sess
+
+ if output_graph:
+ tf.summary.FileWriter("logs/", self.sess.graph)
+
+ self.cost_his = []
+
+ def _build_net(self):
+ def build_layers(s, c_names, w_initializer, b_initializer):
+ for i, h in enumerate(self.hidden):
+ if i == 0:
+ in_units, out_units, inputs = self.n_features, self.hidden[i], s
+ else:
+ in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l
+ with tf.variable_scope('l%i' % i):
+ w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names)
+ b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names)
+ l = tf.nn.relu(tf.matmul(inputs, w) + b)
+
+ with tf.variable_scope('Value'):
+ w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names)
+ b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names)
+ self.V = tf.matmul(l, w) + b
+
+ with tf.variable_scope('Advantage'):
+ w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
+ b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ self.A = tf.matmul(l, w) + b
+
+ with tf.variable_scope('Q'):
+ out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a)
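+ # Added note (not in the original code): subtracting the per-state mean of A
+ # makes V and A identifiable (adding a constant to V and subtracting it from A
+ # would otherwise give the same Q). E.g. with V = 2 and A = [1, -1, 0]:
+ # mean(A) = 0, so Q = 2 + [1, -1, 0] = [3, 1, 2].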
+
+ # with tf.variable_scope('out'):
+ # w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
+ # b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+ # out = tf.matmul(l, w) + b
+ return out
+
+ # ------------------ build evaluate_net ------------------
+ self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
+ self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
+ self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
+ with tf.variable_scope('eval_net'):
+ c_names, w_initializer, b_initializer = \
+ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], \
+ tf.random_normal_initializer(0., 0.01), tf.constant_initializer(0.01) # config of layers
+
+ self.q_eval = build_layers(self.s, c_names, w_initializer, b_initializer)
+
+ with tf.variable_scope('loss'):
+ self.abs_errors = tf.abs(tf.reduce_sum(self.q_target - self.q_eval, axis=1)) # new SumTree priorities; only the chosen action's entry differs, so the sum equals that action's TD error
+ self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
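+ # Added note (not in the original code): multiplying by ISWeights corrects the
+ # sampling bias introduced by prioritized (non-uniform) replay, while abs_errors
+ # above is fed back into the SumTree as each transition's new priority.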
+
+ with tf.variable_scope('train'):
+ self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
+
+ # ------------------ build target_net ------------------
+ self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
+ with tf.variable_scope('target_net'):
+ c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+ self.q_next = build_layers(self.s_, c_names, w_initializer, b_initializer)
+
+ def store_transition(self, s, a, r, s_):
+ transition = np.hstack((s, [a, r], s_))
+ max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])
+ self.memory.store(max_p, transition)
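+ # Added note (not in the original code): new transitions are stored with the
+ # current maximum leaf priority so they are replayed at least once before their
+ # real TD error is known; Memory.store adds epsilon, so even the very first
+ # transition (max_p == 0) receives a small non-zero priority.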
+
+ def choose_action(self, observation):
+ observation = observation[np.newaxis, :]
+ if np.random.uniform() < self.epsilon:
+ actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+ action = np.argmax(actions_value)
+ else:
+ action = np.random.randint(0, self.n_actions)
+ return action
+
+ def _replace_target_params(self):
+ t_params = tf.get_collection('target_net_params')
+ e_params = tf.get_collection('eval_net_params')
+ self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+ def learn(self):
+ if self.learn_step_counter % self.replace_target_iter == 0:
+ self._replace_target_params()
+
+ tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
+
+ # double DQN
+ q_next, q_eval4next = self.sess.run(
+ [self.q_next, self.q_eval],
+ feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation
+ self.s: batch_memory[:, -self.n_features:]}) # next observation
+ q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
+
+ q_target = q_eval.copy()
+
+ batch_index = np.arange(self.batch_size, dtype=np.int32)
+ eval_act_index = batch_memory[:, self.n_features].astype(int)
+ reward = batch_memory[:, self.n_features + 1]
+ max_act4next = np.argmax(q_eval4next, axis=1) # actions with the highest value according to the eval net (Double DQN action selection)
+ selected_q_next = q_next[batch_index, max_act4next] # Double DQN, select q_next depending on above actions
+
+ q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
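+ # Worked example of the Double DQN target (added note, illustrative numbers):
+ # for one transition with reward = 1, gamma = 0.9, q_eval4next = [0.2, 0.7]
+ # (eval net picks action 1) and q_next = [0.5, 0.3] (target net values),
+ # selected_q_next = q_next[1] = 0.3 and the target is 1 + 0.9 * 0.3 = 1.27:
+ # the eval net chooses the action, the target net evaluates it.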
+
+ # q_next, q_eval = self.sess.run(
+ # [self.q_next, self.q_eval],
+ # feed_dict={self.s_: batch_memory[:, -self.n_features:],
+ # self.s: batch_memory[:, :self.n_features]})
+ #
+ # q_target = q_eval.copy()
+ # batch_index = np.arange(self.batch_size, dtype=np.int32)
+ # eval_act_index = batch_memory[:, self.n_features].astype(int)
+ # reward = batch_memory[:, self.n_features + 1]
+ #
+ # q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
+ _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
+ feed_dict={self.s: batch_memory[:, :self.n_features],
+ self.q_target: q_target,
+ self.ISWeights: ISWeights})
+ for i in range(len(tree_idx)): # update priority
+ idx = tree_idx[i]
+ self.memory.update(idx, abs_errors[i])
+
+ self.cost_his.append(self.cost)
+
+ self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+ self.learn_step_counter += 1
diff --git a/experiments/Solve_LunarLander/run_LunarLander.py b/experiments/Solve_LunarLander/run_LunarLander.py
new file mode 100644
index 0000000..b286109
--- /dev/null
+++ b/experiments/Solve_LunarLander/run_LunarLander.py
@@ -0,0 +1,68 @@
+"""
+Dueling DQN with Double DQN targets and Prioritized Experience Replay,
+
+LunarLander-v2 example
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+
+import gym
+from gym import wrappers
+from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay
+
+env = gym.make('LunarLander-v2')
+# env = env.unwrapped
+env.seed(1)
+
+N_A = env.action_space.n
+N_S = env.observation_space.shape[0]
+MEMORY_CAPACITY = 50000
+TARGET_REP_ITER = 2000
+MAX_EPISODES = 900
+E_GREEDY = 0.95
+E_INCREMENT = 0.00001
+GAMMA = 0.99
+LR = 0.0001
+BATCH_SIZE = 32
+HIDDEN = [400, 400]
+RENDER = True
+
+RL = DuelingDQNPrioritizedReplay(
+ n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA,
+ hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER,
+ memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,)
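+# Note (added): this agent combines a dueling Q head, Double DQN targets and
+# prioritized experience replay; learning (and rendering) only starts once the
+# replay memory has been filled, i.e. after MEMORY_CAPACITY environment steps.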
+
+
+total_steps = 0
+running_r = 0
+r_scale = 100
+for i_episode in range(MAX_EPISODES):
+ s = env.reset() # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
+ ep_r = 0
+ while True:
+ if RENDER and total_steps > MEMORY_CAPACITY: env.render() # only render once learning has started
+ a = RL.choose_action(s)
+ s_, r, done, _ = env.step(a)
+ if r == -100: r = -30 # soften the crash penalty so a single failure does not dominate the return
+ r /= r_scale # scale rewards down (by 100) for more stable learning
+
+ ep_r += r
+ RL.store_transition(s, a, r, s_)
+ if total_steps > MEMORY_CAPACITY:
+ RL.learn()
+ if done:
+ land = '| Landed' if r == 100/r_scale else '| ------'
+ running_r = 0.99 * running_r + 0.01 * ep_r
+ print('Epi: ', i_episode,
+ land,
+ '| Epi_R: ', round(ep_r, 2),
+ '| Running_R: ', round(running_r, 2),
+ '| Epsilon: ', round(RL.epsilon, 3))
+ break
+
+ s = s_
+ total_steps += 1
+