diff --git a/README.md b/README.md new file mode 100644 index 0000000..5f17bfe --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +

+ + + +

+ +--- + +
+ +# Reinforcement Learning Methods and Tutorials + +These reinforcement learning tutorials cover everything from the basic RL algorithms to advanced algorithms developed in recent years. + +**For Chinese speakers, visit [莫烦 Python](https://morvanzhou.github.io/tutorials/) or my [YouTube channel](https://www.youtube.com/channel/UCdyjiB5H8Pu7aDTNVXTTpcg) for more.** + +**By popular request, English versions of these tutorials are available in this playlist:** ([https://www.youtube.com/playlist?list=PLXO45tsB95cIplu-fLMpUEEZTwrDNh6Ba](https://www.youtube.com/playlist?list=PLXO45tsB95cIplu-fLMpUEEZTwrDNh6Ba)) + + +* [Simple entry example](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/1_command_line_reinforcement_learning) +* Tabular Methods + * [Q-learning](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/2_Q_Learning_maze) + * [Sarsa](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/3_Sarsa_maze) + * [Sarsa(lambda)](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/4_Sarsa_lambda_maze) +* Function Approximation (DQN) + * [Deep Q Network](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5_Deep_Q_Network) +* [Using OpenAI Gym](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/6_OpenAI_gym) +* DQN-based methods + * [Double DQN](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.1_Double_DQN) + * [DQN with Prioritized Experience Replay](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.2_Prioritized_Replay_DQN) + * [Dueling DQN](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.3_Dueling_DQN) +* [Policy Gradients](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/7_Policy_gradient_softmax) +* [Actor Critic](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/8_Actor_Critic_Advantage) + * [Deep Deterministic Policy Gradient](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/9_Deep_Deterministic_Policy_Gradient_DDPG) + * [A3C](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/10_A3C) +* Model-based RL (WIP) + * [Dyna-Q](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/11_Dyna_Q) + + +# Donation + +*If these tutorials help you, please consider donating to support better tutorials. Any contribution is greatly appreciated!* + +
+ + Paypal +
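The tabular tutorials listed in the README above (Q-learning, Sarsa, Sarsa(lambda), Dyna-Q) all revolve around the same one-line value update. The sketch below is not part of this commit; it is a minimal, illustrative version of the Q-learning and Sarsa updates, using a plain dict in place of the pandas `DataFrame` the tutorial code uses, to show the off-policy vs. on-policy difference in the bootstrap target.

```python
# Minimal tabular update sketch (illustrative only, not repo code).
import random
from collections import defaultdict

ACTIONS = ['left', 'right']
ALPHA, GAMMA, EPSILON = 0.1, 0.9, 0.9   # learning rate, discount factor, greediness

# Q(s, a) estimates, initialized lazily to 0 for unseen states
q_table = defaultdict(lambda: {a: 0.0 for a in ACTIONS})

def choose_action(state):
    # epsilon-greedy: exploit the current estimate most of the time, explore otherwise
    if random.random() < EPSILON:
        return max(q_table[state], key=q_table[state].get)
    return random.choice(ACTIONS)

def q_learning_update(s, a, r, s_, terminal):
    # off-policy target: bootstrap from the best action in the next state
    target = r if terminal else r + GAMMA * max(q_table[s_].values())
    q_table[s][a] += ALPHA * (target - q_table[s][a])

def sarsa_update(s, a, r, s_, a_, terminal):
    # on-policy target: bootstrap from the action actually taken next
    target = r if terminal else r + GAMMA * q_table[s_][a_]
    q_table[s][a] += ALPHA * (target - q_table[s][a])
```

The only difference between the two methods is the target term: Q-learning maximizes over next actions, while Sarsa follows the action its own policy actually picked, which is why the Sarsa agent in the maze tutorials behaves more cautiously near punishment.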
diff --git a/RL_cover.jpg b/RL_cover.jpg new file mode 100644 index 0000000..8a47adc Binary files /dev/null and b/RL_cover.jpg differ diff --git a/contents/10_A3C/A3C_RNN.py b/contents/10_A3C/A3C_RNN.py new file mode 100644 index 0000000..82ea6bb --- /dev/null +++ b/contents/10_A3C/A3C_RNN.py @@ -0,0 +1,230 @@ +""" +Asynchronous Advantage Actor Critic (A3C) + RNN with continuous action space, Reinforcement Learning. + +The Pendulum example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + +GAME = 'Pendulum-v0' +OUTPUT_GRAPH = True +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_EP_STEP = 400 +MAX_GLOBAL_EP = 800 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 5 +GAMMA = 0.9 +ENTROPY_BETA = 0.01 +LR_A = 0.0001 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + mu, sigma = mu * A_BOUND[1], sigma + 1e-4 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1]) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.random_normal_initializer(0., .1) + with tf.variable_scope('critic'): # only 
critic controls the rnn update + cell_size = 32 + s = tf.expand_dims(self.s, axis=1, + name='timely_input') # [time_step, feature] => [time_step, batch, feature] + rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size) + self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32) + outputs, self.final_state = tf.nn.dynamic_rnn( + cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True) + cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation + l_c = tf.layers.dense(cell_out, 50, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + + with tf.variable_scope('actor'): # state representation is based on critic + cell_out = tf.stop_gradient(cell_out, name='c_cell_out') # from what critic think it is + l_a = tf.layers.dense(cell_out, 80, tf.nn.relu6, kernel_initializer=w_init, name='la') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s, cell_state): # run by a local + s = s[np.newaxis, :] + a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state}) + return a[0], cell_state + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME).unwrapped + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning + keep_state = rnn_state.copy() # keep rnn state for updating global net + for ep_t in range(MAX_EP_STEP): + if self.name == 'W_0': + self.env.render() + + a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state + s_, r, done, info = self.env.step(a) + done = True if ep_t == MAX_EP_STEP - 1 else False + r /= 10 # normalize reward + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + self.AC.init_state: keep_state, + } + + self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_ + + s = s_ + rnn_state = rnn_state_ # renew rnn state + total_step += 1 + + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.9 * 
GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() + diff --git a/contents/10_A3C/A3C_continuous_action.py b/contents/10_A3C/A3C_continuous_action.py new file mode 100644 index 0000000..4cd534a --- /dev/null +++ b/contents/10_A3C/A3C_continuous_action.py @@ -0,0 +1,210 @@ +""" +Asynchronous Advantage Actor Critic (A3C) with continuous action space, Reinforcement Learning. + +The Pendulum example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + +GAME = 'Pendulum-v0' +OUTPUT_GRAPH = True +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_EP_STEP = 400 +MAX_GLOBAL_EP = 800 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 5 +GAMMA = 0.9 +ENTROPY_BETA = 0.01 +LR_A = 0.0001 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + mu, sigma = mu * A_BOUND[1], sigma + 1e-4 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = 
tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1]) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self ): + w_init = tf.random_normal_initializer(0., .1) + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + s = s[np.newaxis, :] + return SESS.run(self.A, {self.s: s})[0] + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME).unwrapped + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + for ep_t in range(MAX_EP_STEP): + if self.name == 'W_0': + self.env.render() + a = self.AC.choose_action(s) + s_, r, done, info = self.env.step(a) + done = True if ep_t == MAX_EP_STEP - 1 else False + r /= 10 # normalize reward + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + } + self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + 
SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() + diff --git a/contents/10_A3C/A3C_discrete_action.py b/contents/10_A3C/A3C_discrete_action.py new file mode 100644 index 0000000..f17352a --- /dev/null +++ b/contents/10_A3C/A3C_discrete_action.py @@ -0,0 +1,201 @@ +""" +Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning. + +The Cartpole example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + + +GAME = 'CartPole-v0' +OUTPUT_GRAPH = True +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 1000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 20 +GAMMA = 0.9 +ENTROPY_BETA = 0.001 +LR_A = 0.001 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.n + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.int32, [None, ], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + self.a_prob, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('a_loss'): + log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True) + exp_v = log_prob * td + entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob), axis=1, keep_dims=True) # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with 
tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.random_normal_initializer(0., .1) + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la') + a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return a_prob, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + prob_weights = SESS.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]}) + action = np.random.choice(range(prob_weights.shape[1]), + p=prob_weights.ravel()) # select action w.r.t the actions prob + return action + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME).unwrapped + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + while True: + if self.name == 'W_0': + self.env.render() + a = self.AC.choose_action(s) + s_, r, done, info = self.env.step(a) + if done: r = -5 + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + } + self.AC.update_global(feed_dict) + + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if 
os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() diff --git a/contents/11_Dyna_Q/RL_brain.py b/contents/11_Dyna_Q/RL_brain.py new file mode 100644 index 0000000..f4be936 --- /dev/null +++ b/contents/11_Dyna_Q/RL_brain.py @@ -0,0 +1,79 @@ +""" +This part of code is the Dyna-Q learning brain, which is a brain of the agent. +All decisions and learning processes are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class QLearningTable: + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = actions # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + self.q_table = pd.DataFrame(columns=self.actions) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.uniform() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, s, a, r, s_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) + + +class EnvModel: + """Similar to the memory buffer in DQN, you can store past experiences in here. + Alternatively, the model can generate next state and reward signal accurately.""" + def __init__(self, actions): + # the simplest case is to think about the model is a memory which has all past transition information + self.actions = actions + self.database = pd.DataFrame(columns=actions, dtype=np.object) + + def store_transition(self, s, a, r, s_): + if s not in self.database.index: + self.database = self.database.append( + pd.Series( + [None] * len(self.actions), + index=self.database.columns, + name=s, + )) + self.database.set_value(s, a, (r, s_)) + + def sample_s_a(self): + s = np.random.choice(self.database.index) + a = np.random.choice(self.database.ix[s].dropna().index) # filter out the None value + return s, a + + def get_r_s_(self, s, a): + r, s_ = self.database.ix[s, a] + return r, s_ diff --git a/contents/11_Dyna_Q/maze_env.py b/contents/11_Dyna_Q/maze_env.py new file mode 100644 index 0000000..5ec5370 --- /dev/null +++ b/contents/11_Dyna_Q/maze_env.py @@ -0,0 +1,129 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk, object): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + # time.sleep(0.1) + self.update() + + diff --git a/contents/11_Dyna_Q/run_this.py b/contents/11_Dyna_Q/run_this.py new file mode 100644 index 0000000..d784bc4 --- /dev/null +++ b/contents/11_Dyna_Q/run_this.py @@ -0,0 +1,51 @@ +""" +Simplest model-based RL, Dyna-Q. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the main part which controls the update method of this example. +The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +from maze_env import Maze +from RL_brain import QLearningTable, EnvModel + + +def update(): + for episode in range(40): + s = env.reset() + while True: + env.render() + a = RL.choose_action(str(s)) + s_, r, done = env.step(a) + RL.learn(str(s), a, r, str(s_)) + + # use a model to output (r, s_) by inputting (s, a) + # the model in dyna Q version is just like a memory replay buffer + env_model.store_transition(str(s), a, r, s_) + for n in range(10): # learn 10 more times using the env_model + ms, ma = env_model.sample_s_a() # ms in here is a str + mr, ms_ = env_model.get_r_s_(ms, ma) + RL.learn(ms, ma, mr, str(ms_)) + + s = s_ + if done: + break + + # end of game + print('game over') + env.destroy() + + +if __name__ == "__main__": + env = Maze() + RL = QLearningTable(actions=list(range(env.n_actions))) + env_model = EnvModel(actions=list(range(env.n_actions))) + + env.after(0, update) + env.mainloop() \ No newline at end of file diff --git a/contents/1_command_line_reinforcement_learning/treasure_on_right.py b/contents/1_command_line_reinforcement_learning/treasure_on_right.py new file mode 100644 index 0000000..5970860 --- /dev/null +++ b/contents/1_command_line_reinforcement_learning/treasure_on_right.py @@ -0,0 +1,107 @@ +""" +A simple example for Reinforcement Learning using table lookup Q-learning method. +An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location. +Run this program and to see how the agent will improve its strategy of finding the treasure. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd +import time + +np.random.seed(2) # reproducible + + +N_STATES = 6 # the length of the 1 dimensional world +ACTIONS = ['left', 'right'] # available actions +EPSILON = 0.9 # greedy police +ALPHA = 0.1 # learning rate +GAMMA = 0.9 # discount factor +MAX_EPISODES = 13 # maximum episodes +FRESH_TIME = 0.3 # fresh time for one move + + +def build_q_table(n_states, actions): + table = pd.DataFrame( + np.zeros((n_states, len(actions))), # q_table initial values + columns=actions, # actions's name + ) + # print(table) # show table + return table + + +def choose_action(state, q_table): + # This is how to choose an action + state_actions = q_table.iloc[state, :] + if (np.random.uniform() > EPSILON) or (state_actions.all() == 0): # act non-greedy or state-action have no value + action_name = np.random.choice(ACTIONS) + else: # act greedy + action_name = state_actions.argmax() + return action_name + + +def get_env_feedback(S, A): + # This is how agent will interact with the environment + if A == 'right': # move right + if S == N_STATES - 2: # terminate + S_ = 'terminal' + R = 1 + else: + S_ = S + 1 + R = 0 + else: # move left + R = 0 + if S == 0: + S_ = S # reach the wall + else: + S_ = S - 1 + return S_, R + + +def update_env(S, episode, step_counter): + # This is how environment be updated + env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment + if S == 'terminal': + interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter) + print('\r{}'.format(interaction), end='') + time.sleep(2) + print('\r ', end='') + else: + env_list[S] = 'o' + interaction = ''.join(env_list) + print('\r{}'.format(interaction), end='') + time.sleep(FRESH_TIME) + + +def rl(): + # main part of RL loop + q_table = build_q_table(N_STATES, ACTIONS) + for episode in range(MAX_EPISODES): + step_counter = 0 + S = 0 + 
is_terminated = False + update_env(S, episode, step_counter) + while not is_terminated: + + A = choose_action(S, q_table) + S_, R = get_env_feedback(S, A) # take action & get next state and reward + q_predict = q_table.ix[S, A] + if S_ != 'terminal': + q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal + else: + q_target = R # next state is terminal + is_terminated = True # terminate this episode + + q_table.ix[S, A] += ALPHA * (q_target - q_predict) # update + S = S_ # move to next state + + update_env(S, episode, step_counter+1) + step_counter += 1 + return q_table + + +if __name__ == "__main__": + q_table = rl() + print('\r\nQ-table:\n') + print(q_table) diff --git a/contents/2_Q_Learning_maze/RL_brain.py b/contents/2_Q_Learning_maze/RL_brain.py new file mode 100644 index 0000000..844c475 --- /dev/null +++ b/contents/2_Q_Learning_maze/RL_brain.py @@ -0,0 +1,51 @@ +""" +This part of code is the Q learning brain, which is a brain of the agent. +All decisions are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class QLearningTable: + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = actions # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + self.q_table = pd.DataFrame(columns=self.actions) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.uniform() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, s, a, r, s_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) \ No newline at end of file diff --git a/contents/2_Q_Learning_maze/maze_env.py b/contents/2_Q_Learning_maze/maze_env.py new file mode 100644 index 0000000..d7b8b0a --- /dev/null +++ b/contents/2_Q_Learning_maze/maze_env.py @@ -0,0 +1,129 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk, object): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + time.sleep(0.1) + self.update() + + diff --git a/contents/2_Q_Learning_maze/run_this.py b/contents/2_Q_Learning_maze/run_this.py new file mode 100644 index 0000000..f817d1e --- /dev/null +++ b/contents/2_Q_Learning_maze/run_this.py @@ -0,0 +1,53 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the main part which controls the update method of this example. +The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +from maze_env import Maze +from RL_brain import QLearningTable + + +def update(): + for episode in range(100): + # initial observation + observation = env.reset() + + while True: + # fresh env + env.render() + + # RL choose action based on observation + action = RL.choose_action(str(observation)) + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + # RL learn from this transition + RL.learn(str(observation), action, reward, str(observation_)) + + # swap observation + observation = observation_ + + # break while loop when end of this episode + if done: + break + + # end of game + print('game over') + env.destroy() + +if __name__ == "__main__": + env = Maze() + RL = QLearningTable(actions=list(range(env.n_actions))) + + env.after(100, update) + env.mainloop() \ No newline at end of file diff --git a/contents/3_Sarsa_maze/RL_brain.py b/contents/3_Sarsa_maze/RL_brain.py new file mode 100644 index 0000000..3b8b5da --- /dev/null +++ b/contents/3_Sarsa_maze/RL_brain.py @@ -0,0 +1,77 @@ +""" +This part of code is the Q learning brain, which is a brain of the agent. +All decisions are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class RL(object): + def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = action_space # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + + self.q_table = pd.DataFrame(columns=self.actions) + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.rand() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, *args): + pass + + +# off-policy +class QLearningTable(RL): + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + super(QLearningTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) + + def learn(self, s, a, r, s_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update + + +# on-policy +class SarsaTable(RL): + + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) + + def learn(self, s, a, r, s_, a_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, a_] # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update diff --git a/contents/3_Sarsa_maze/maze_env.py b/contents/3_Sarsa_maze/maze_env.py new file mode 100644 index 
0000000..fc31521 --- /dev/null +++ b/contents/3_Sarsa_maze/maze_env.py @@ -0,0 +1,130 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. +The RL is in RL_brain.py. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + time.sleep(0.1) + self.update() + + diff --git a/contents/3_Sarsa_maze/run_this.py b/contents/3_Sarsa_maze/run_this.py new file mode 100644 index 0000000..fc2bd1a --- /dev/null +++ b/contents/3_Sarsa_maze/run_this.py @@ -0,0 +1,52 @@ +""" +Sarsa is a online updating method for Reinforcement learning. 
+ +Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory. + +You will see the sarsa is more coward when punishment is close because it cares about all behaviours, +while q learning is more brave because it only cares about maximum behaviour. +""" + +from maze_env import Maze +from RL_brain import SarsaTable + + +def update(): + for episode in range(100): + # initial observation + observation = env.reset() + + # RL choose action based on observation + action = RL.choose_action(str(observation)) + + while True: + # fresh env + env.render() + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + # RL choose action based on next observation + action_ = RL.choose_action(str(observation_)) + + # RL learn from this transition (s, a, r, s, a) ==> Sarsa + RL.learn(str(observation), action, reward, str(observation_), action_) + + # swap observation and action + observation = observation_ + action = action_ + + # break while loop when end of this episode + if done: + break + + # end of game + print('game over') + env.destroy() + +if __name__ == "__main__": + env = Maze() + RL = SarsaTable(actions=list(range(env.n_actions))) + + env.after(100, update) + env.mainloop() \ No newline at end of file diff --git a/contents/4_Sarsa_lambda_maze/RL_brain.py b/contents/4_Sarsa_lambda_maze/RL_brain.py new file mode 100644 index 0000000..6ad65a9 --- /dev/null +++ b/contents/4_Sarsa_lambda_maze/RL_brain.py @@ -0,0 +1,93 @@ +""" +This part of code is the Q learning brain, which is a brain of the agent. +All decisions are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class RL(object): + def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = action_space # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + + self.q_table = pd.DataFrame(columns=self.actions) + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.rand() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, *args): + pass + + +# backward eligibility traces +class SarsaLambdaTable(RL): + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9): + super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) + + # backward view, eligibility trace. 
+ self.lambda_ = trace_decay + self.eligibility_trace = self.q_table.copy() + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + to_be_append = pd.Series( + [0] * len(self.actions), + index=self.q_table.columns, + name=state, + ) + self.q_table = self.q_table.append(to_be_append) + + # also update eligibility trace + self.eligibility_trace = self.eligibility_trace.append(to_be_append) + + def learn(self, s, a, r, s_, a_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, a_] # next state is not terminal + else: + q_target = r # next state is terminal + error = q_target - q_predict + + # increase trace amount for visited state-action pair + + # Method 1: + # self.eligibility_trace.ix[s, a] += 1 + + # Method 2: + self.eligibility_trace.ix[s, :] *= 0 + self.eligibility_trace.ix[s, a] = 1 + + # Q update + self.q_table += self.lr * error * self.eligibility_trace + + # decay eligibility trace after update + self.eligibility_trace *= self.gamma*self.lambda_ diff --git a/contents/4_Sarsa_lambda_maze/maze_env.py b/contents/4_Sarsa_lambda_maze/maze_env.py new file mode 100644 index 0000000..9fe6acb --- /dev/null +++ b/contents/4_Sarsa_lambda_maze/maze_env.py @@ -0,0 +1,130 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. +The RL is in RL_brain.py. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + 
origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + time.sleep(0.05) + self.update() + + diff --git a/contents/4_Sarsa_lambda_maze/run_this.py b/contents/4_Sarsa_lambda_maze/run_this.py new file mode 100644 index 0000000..a0c5afc --- /dev/null +++ b/contents/4_Sarsa_lambda_maze/run_this.py @@ -0,0 +1,52 @@ +""" +Sarsa is a online updating method for Reinforcement learning. + +Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory. + +You will see the sarsa is more coward when punishment is close because it cares about all behaviours, +while q learning is more brave because it only cares about maximum behaviour. +""" + +from maze_env import Maze +from RL_brain import SarsaLambdaTable + + +def update(): + for episode in range(100): + # initial observation + observation = env.reset() + + # RL choose action based on observation + action = RL.choose_action(str(observation)) + + while True: + # fresh env + env.render() + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + # RL choose action based on next observation + action_ = RL.choose_action(str(observation_)) + + # RL learn from this transition (s, a, r, s, a) ==> Sarsa + RL.learn(str(observation), action, reward, str(observation_), action_) + + # swap observation and action + observation = observation_ + action = action_ + + # break while loop when end of this episode + if done: + break + + # end of game + print('game over') + env.destroy() + +if __name__ == "__main__": + env = Maze() + RL = SarsaLambdaTable(actions=list(range(env.n_actions))) + + env.after(100, update) + env.mainloop() \ No newline at end of file diff --git a/contents/5.1_Double_DQN/RL_brain.py b/contents/5.1_Double_DQN/RL_brain.py new file mode 100644 index 0000000..15053eb --- /dev/null +++ b/contents/5.1_Double_DQN/RL_brain.py @@ -0,0 +1,163 @@ +""" +The double DQN based on this paper: https://arxiv.org/abs/1509.06461 + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class DoubleDQN: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.005, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=200, + memory_size=3000, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + double_q=True, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = 
memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + self.double_q = double_q # decide to use double q or not + + self.learn_step_counter = 0 + self.memory = np.zeros((self.memory_size, n_features*2+2)) + self._build_net() + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + return out + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + + with tf.variable_scope('eval_net'): + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + observation = observation[np.newaxis, :] + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + + if not hasattr(self, 'q'): # record action value it gets + self.q = [] + self.running_q = 0 + self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value) + self.q.append(self.running_q) + + if np.random.uniform() > self.epsilon: # choosing action + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + if self.memory_counter > self.memory_size: + sample_index = 
np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval4next = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation + self.s: batch_memory[:, -self.n_features:]}) # next observation + q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]}) + + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + if self.double_q: + max_act4next = np.argmax(q_eval4next, axis=1) # the action that brings the highest value is evaluated by q_eval + selected_q_next = q_next[batch_index, max_act4next] # Double DQN, select q_next depending on above actions + else: + selected_q_next = np.max(q_next, axis=1) # the natural DQN + + q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next + + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + + + diff --git a/contents/5.1_Double_DQN/run_Pendulum.py b/contents/5.1_Double_DQN/run_Pendulum.py new file mode 100644 index 0000000..d60a362 --- /dev/null +++ b/contents/5.1_Double_DQN/run_Pendulum.py @@ -0,0 +1,77 @@ +""" +Double DQN & Natural DQN comparison, +The Pendulum example. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DoubleDQN +import numpy as np +import matplotlib.pyplot as plt +import tensorflow as tf + + +env = gym.make('Pendulum-v0') +env = env.unwrapped +env.seed(1) +MEMORY_SIZE = 3000 +ACTION_SPACE = 11 + +sess = tf.Session() +with tf.variable_scope('Natural_DQN'): + natural_DQN = DoubleDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, double_q=False, sess=sess + ) + +with tf.variable_scope('Double_DQN'): + double_DQN = DoubleDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, double_q=True, sess=sess, output_graph=True) + +sess.run(tf.global_variables_initializer()) + + +def train(RL): + total_steps = 0 + observation = env.reset() + while True: + # if total_steps - MEMORY_SIZE > 8000: env.render() + + action = RL.choose_action(observation) + + f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # convert to [-2 ~ 2] float actions + observation_, reward, done, info = env.step(np.array([f_action])) + + reward /= 10 # normalize to a range of (-1, 0). r = 0 when get upright + # the Q target at upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0 + # so when Q at this state is greater than 0, the agent overestimates the Q. Please refer to the final result. 
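The split between action selection and action evaluation that learn() implements above is the core of Double DQN; a small NumPy sketch of the two targets, with illustrative array names:

import numpy as np

def dqn_targets(reward, gamma, q_next, q_eval_next):
    # q_next:      target-net Q values at s_, shape (batch, n_actions)
    # q_eval_next: online-net Q values at s_, same shape
    batch_index = np.arange(q_next.shape[0])
    a_star = np.argmax(q_eval_next, axis=1)                        # online net selects the action
    double_target = reward + gamma * q_next[batch_index, a_star]   # target net evaluates it
    natural_target = reward + gamma * q_next.max(axis=1)           # natural DQN: max over one net
    return double_target, natural_target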
+ + RL.store_transition(observation, action, reward, observation_) + + if total_steps > MEMORY_SIZE: # learning + RL.learn() + + if total_steps - MEMORY_SIZE > 20000: # stop game + break + + observation = observation_ + total_steps += 1 + return RL.q + +q_natural = train(natural_DQN) +q_double = train(double_DQN) + +plt.plot(np.array(q_natural), c='r', label='natural') +plt.plot(np.array(q_double), c='b', label='double') +plt.legend(loc='best') +plt.ylabel('Q eval') +plt.xlabel('training steps') +plt.grid() +plt.show() diff --git a/contents/5.2_Prioritized_Replay_DQN/RL_brain.py b/contents/5.2_Prioritized_Replay_DQN/RL_brain.py new file mode 100644 index 0000000..27d0e50 --- /dev/null +++ b/contents/5.2_Prioritized_Replay_DQN/RL_brain.py @@ -0,0 +1,300 @@ +""" +The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class SumTree(object): + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/SumTree.py + + Story the data with it priority in tree and data frameworks. + """ + data_pointer = 0 + + def __init__(self, capacity): + self.capacity = capacity # for all priority values + self.tree = np.zeros(2*capacity - 1) + # [--------------Parent nodes-------------][-------leaves to recode priority-------] + # size: capacity - 1 size: capacity + self.data = np.zeros(capacity, dtype=object) # for all transitions + # [--------------data frame-------------] + # size: capacity + + def add_new_priority(self, p, data): + leaf_idx = self.data_pointer + self.capacity - 1 + + self.data[self.data_pointer] = data # update data_frame + self.update(leaf_idx, p) # update tree_frame + + self.data_pointer += 1 + if self.data_pointer >= self.capacity: # replace when exceed the capacity + self.data_pointer = 0 + + def update(self, tree_idx, p): + change = p - self.tree[tree_idx] + + self.tree[tree_idx] = p + self._propagate_change(tree_idx, change) + + def _propagate_change(self, tree_idx, change): + """change the sum of priority value in all parent nodes""" + parent_idx = (tree_idx - 1) // 2 + self.tree[parent_idx] += change + if parent_idx != 0: + self._propagate_change(parent_idx, change) + + def get_leaf(self, lower_bound): + leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound + data_idx = leaf_idx - self.capacity + 1 + return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]] + + def _retrieve(self, lower_bound, parent_idx=0): + """ + Tree structure and array storage: + + Tree index: + 0 -> storing priority sum + / \ + 1 2 + / \ / \ + 3 4 5 6 -> storing priority for transitions + + Array type for storing: + [0,1,2,3,4,5,6] + """ + left_child_idx = 2 * parent_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= len(self.tree): # end search when no more child + return parent_idx + + if self.tree[left_child_idx] == self.tree[right_child_idx]: + return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx])) + if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node + return self._retrieve(lower_bound, left_child_idx) + else: + return self._retrieve(lower_bound-self.tree[left_child_idx], right_child_idx) + + @property + def root_priority(self): + return self.tree[0] # 
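The proportional sampling this SumTree supports can be illustrated without the tree at all; a flat NumPy sketch of the same segment-based scheme (the tree only makes each lookup O(log n)), with illustrative names:

import numpy as np

def sample_proportional(priorities, n):
    # Split the total priority mass into n equal segments and draw one uniform
    # point per segment, so transition i is picked roughly in proportion to
    # priorities[i] / priorities.sum().
    cumsum = np.cumsum(priorities)
    segment = cumsum[-1] / n
    points = np.random.uniform(np.arange(n) * segment, (np.arange(n) + 1) * segment)
    return np.searchsorted(cumsum, points)

print(sample_proportional(np.array([1.0, 1.0, 8.0]), 4))   # most draws land on index 2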
the root + + +class Memory(object): # stored as ( s, a, r, s_ ) in SumTree + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py + """ + epsilon = 0.01 # small amount to avoid zero priority + alpha = 0.6 # [0~1] convert the importance of TD error to priority + beta = 0.4 # importance-sampling, from initial value increasing to 1 + beta_increment_per_sampling = 0.001 + abs_err_upper = 1 # clipped abs error + + def __init__(self, capacity): + self.tree = SumTree(capacity) + + def store(self, error, transition): + p = self._get_priority(error) + self.tree.add_new_priority(p, transition) + + def sample(self, n): + batch_idx, batch_memory, ISWeights = [], [], [] + segment = self.tree.root_priority / n + self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1 + + min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority + maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights + for i in range(n): + a = segment * i + b = segment * (i + 1) + lower_bound = np.random.uniform(a, b) + idx, p, data = self.tree.get_leaf(lower_bound) + prob = p / self.tree.root_priority + ISWeights.append(self.tree.capacity * prob) + batch_idx.append(idx) + batch_memory.append(data) + + ISWeights = np.vstack(ISWeights) + ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize + return batch_idx, np.vstack(batch_memory), ISWeights + + def update(self, idx, error): + p = self._get_priority(error) + self.tree.update(idx, p) + + def _get_priority(self, error): + error += self.epsilon # avoid 0 + clipped_error = np.clip(error, 0, self.abs_err_upper) + return np.power(clipped_error, self.alpha) + + +class DQNPrioritizedReplay: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.005, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=500, + memory_size=10000, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + prioritized=True, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + self.prioritized = prioritized # decide to use double q or not + + self.learn_step_counter = 0 + + self._build_net() + + if self.prioritized: + self.memory = Memory(capacity=memory_size) + else: + self.memory = np.zeros((self.memory_size, n_features*2+2)) + + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + return 
out + + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + if self.prioritized: + self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') + with tf.variable_scope('eval_net'): + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('loss'): + if self.prioritized: + self.abs_errors = tf.reduce_sum(tf.abs(self.q_target - self.q_eval), axis=1) # for updating Sumtree + self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval)) + else: + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + def store_transition(self, s, a, r, s_): + if self.prioritized: # prioritized replay + transition = np.hstack((s, [a, r], s_)) + max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:]) # have high priority for newly arrived transition + self.memory.store(max_p, transition) + else: # random replay + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + observation = observation[np.newaxis, :] + if np.random.uniform() < self.epsilon: + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + if self.prioritized: + tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size) + else: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={self.s_: batch_memory[:, -self.n_features:], + self.s: batch_memory[:, :self.n_features]}) + + q_target = q_eval.copy() + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + if self.prioritized: + _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: 
q_target, + self.ISWeights: ISWeights}) + for i in range(len(tree_idx)): # update priority + idx = tree_idx[i] + self.memory.update(idx, abs_errors[i]) + else: + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + + self.cost_his.append(self.cost) + + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 diff --git a/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py b/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py new file mode 100644 index 0000000..08c2562 --- /dev/null +++ b/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py @@ -0,0 +1,80 @@ +""" +The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DQNPrioritizedReplay +import matplotlib.pyplot as plt +import tensorflow as tf +import numpy as np + +env = gym.make('MountainCar-v0') +env = env.unwrapped +env.seed(21) +MEMORY_SIZE = 10000 + +sess = tf.Session() +with tf.variable_scope('natural_DQN'): + RL_natural = DQNPrioritizedReplay( + n_actions=3, n_features=2, memory_size=MEMORY_SIZE, + e_greedy_increment=0.00005, sess=sess, prioritized=False, + ) + +with tf.variable_scope('DQN_with_prioritized_replay'): + RL_prio = DQNPrioritizedReplay( + n_actions=3, n_features=2, memory_size=MEMORY_SIZE, + e_greedy_increment=0.00005, sess=sess, prioritized=True, output_graph=True, + ) +sess.run(tf.global_variables_initializer()) + + +def train(RL): + total_steps = 0 + steps = [] + episodes = [] + for i_episode in range(20): + observation = env.reset() + while True: + # env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + if done: reward = 10 + + RL.store_transition(observation, action, reward, observation_) + + if total_steps > MEMORY_SIZE: + RL.learn() + + if done: + print('episode ', i_episode, ' finished') + steps.append(total_steps) + episodes.append(i_episode) + break + + observation = observation_ + total_steps += 1 + return np.vstack((episodes, steps)) + +his_natural = train(RL_natural) +his_prio = train(RL_prio) + +# compare based on first success +plt.plot(his_natural[0, :], his_natural[1, :] - his_natural[1, 0], c='b', label='natural DQN') +plt.plot(his_prio[0, :], his_prio[1, :] - his_prio[1, 0], c='r', label='DQN with prioritized replay') +plt.legend(loc='best') +plt.ylabel('total training time') +plt.xlabel('episode') +plt.grid() +plt.show() + + diff --git a/contents/5.3_Dueling_DQN/RL_brain.py b/contents/5.3_Dueling_DQN/RL_brain.py new file mode 100644 index 0000000..fec458f --- /dev/null +++ b/contents/5.3_Dueling_DQN/RL_brain.py @@ -0,0 +1,165 @@ +""" +The Dueling DQN based on this paper: https://arxiv.org/abs/1511.06581 + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class DuelingDQN: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.001, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=200, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + dueling=True, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = 
reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + self.dueling = dueling # decide to use dueling DQN or not + + self.learn_step_counter = 0 + self.memory = np.zeros((self.memory_size, n_features*2+2)) + self._build_net() + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + if self.dueling: + # Dueling DQN + with tf.variable_scope('Value'): + w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names) + self.V = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('Advantage'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.A = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('Q'): + out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a) + else: + with tf.variable_scope('Q'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + + return out + + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + with tf.variable_scope('eval_net'): + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + observation = observation[np.newaxis, :] + if np.random.uniform() < self.epsilon: # choosing action + actions_value = 
self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval4next, = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation + self.s: batch_memory[:, -self.n_features:]}) # next observation + q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]}) + + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + + + + diff --git a/contents/5.3_Dueling_DQN/run_Pendulum.py b/contents/5.3_Dueling_DQN/run_Pendulum.py new file mode 100644 index 0000000..d7b2e70 --- /dev/null +++ b/contents/5.3_Dueling_DQN/run_Pendulum.py @@ -0,0 +1,86 @@ +""" +Dueling DQN & Natural DQN comparison + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DuelingDQN +import numpy as np +import matplotlib.pyplot as plt +import tensorflow as tf + + +env = gym.make('Pendulum-v0') +env = env.unwrapped +env.seed(1) +MEMORY_SIZE = 3000 +ACTION_SPACE = 25 + +sess = tf.Session() +with tf.variable_scope('natural'): + natural_DQN = DuelingDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, sess=sess, dueling=False) + +with tf.variable_scope('dueling'): + dueling_DQN = DuelingDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, sess=sess, dueling=True, output_graph=True) + +sess.run(tf.global_variables_initializer()) + + +def train(RL): + acc_r = [0] + total_steps = 0 + observation = env.reset() + while True: + # if total_steps-MEMORY_SIZE > 9000: env.render() + + action = RL.choose_action(observation) + + f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # [-2 ~ 2] float actions + observation_, reward, done, info = env.step(np.array([f_action])) + + reward /= 10 # normalize to a range of (-1, 0) + acc_r.append(reward + acc_r[-1]) # accumulated reward + + RL.store_transition(observation, action, reward, observation_) + + if total_steps > MEMORY_SIZE: + RL.learn() + + if total_steps-MEMORY_SIZE > 15000: + break + + observation = observation_ + total_steps += 1 + return RL.cost_his, acc_r + +c_natural, r_natural = train(natural_DQN) +c_dueling, r_dueling = train(dueling_DQN) + +plt.figure(1) +plt.plot(np.array(c_natural), c='r', label='natural') +plt.plot(np.array(c_dueling), c='b', label='dueling') 
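The dueling head assembled in _build_net above recombines a scalar state value with mean-centred advantages into Q values; a minimal NumPy sketch of that aggregation (shapes and values are illustrative):

import numpy as np

def dueling_q(V, A):
    # V: (batch, 1) state value, A: (batch, n_actions) advantages
    return V + (A - A.mean(axis=1, keepdims=True))   # Q = V(s) + (A(s,a) - mean_a A(s,a))

print(dueling_q(np.array([[1.0]]), np.array([[0.5, -0.5, 1.0]])))   # ~[[1.17, 0.17, 1.67]]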
+plt.legend(loc='best') +plt.ylabel('cost') +plt.xlabel('training steps') +plt.grid() + +plt.figure(2) +plt.plot(np.array(r_natural), c='r', label='natural') +plt.plot(np.array(r_dueling), c='b', label='dueling') +plt.legend(loc='best') +plt.ylabel('accumulated reward') +plt.xlabel('training steps') +plt.grid() + +plt.show() + diff --git a/contents/5_Deep_Q_Network/DQN_modified.py b/contents/5_Deep_Q_Network/DQN_modified.py new file mode 100644 index 0000000..16712ec --- /dev/null +++ b/contents/5_Deep_Q_Network/DQN_modified.py @@ -0,0 +1,172 @@ +""" +This part of code is the DQN brain. + +view the tensorboard picture about this DQN structure on: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.7.3 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +# Deep Q Network off-policy +class DeepQNetwork: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=300, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + # total learning step + self.learn_step_counter = 0 + + # initialize zero memory [s, a, r, s_] + self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) + + # consist of [target_net, evaluate_net] + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + return out + + # ------------------ all inputs ------------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input State + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input Next State + self.r = tf.placeholder(tf.float32, [None, ], name='r') # input Reward + self.a = tf.placeholder(tf.int32, [None, ], name='a') # input Action + + # ------------------ build evaluate_net ------------------ + with tf.variable_scope('eval_net'): + # c_names(collections_names) are the collections to store variables + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + self.q_eval = build_layers(self.s, c_names, n_l1, 
w_initializer, b_initializer) + + # ------------------ build target_net ------------------ + with tf.variable_scope('target_net'): + # c_names(collections_names) are the collections to store variables + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('q_target'): + self.q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_') # shape=(None, ) + + with tf.variable_scope('q_eval'): + a_one_hot = tf.one_hot(self.a, depth=self.n_actions, dtype=tf.float32) + self.q_eval_wrt_a = tf.reduce_sum(self.q_eval * a_one_hot, axis=1) # shape=(None, ) + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error')) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + # replace the old memory with new memory + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + # to have batch dimension when feed into tf placeholder + observation = observation[np.newaxis, :] + + if np.random.uniform() < self.epsilon: + # forward feed the observation and get q value for every actions + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + # check to replace target parameters + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + # sample batch memory from all memory + if self.memory_counter > self.memory_size: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + _, cost = self.sess.run( + [self._train_op, self.loss], + feed_dict={ + self.s: batch_memory[:, :self.n_features], + self.a: batch_memory[:, self.n_features], + self.r: batch_memory[:, self.n_features + 1], + self.s_: batch_memory[:, -self.n_features:], + }) + + self.cost_his.append(cost) + + # increasing epsilon + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + def plot_cost(self): + import matplotlib.pyplot as plt + plt.plot(np.arange(len(self.cost_his)), self.cost_his) + plt.ylabel('Cost') + plt.xlabel('training steps') + plt.show() + +if __name__ == '__main__': + DQN = DeepQNetwork(3,4, output_graph=True) \ No newline at end of file diff --git a/contents/5_Deep_Q_Network/RL_brain.py b/contents/5_Deep_Q_Network/RL_brain.py new file mode 100644 index 0000000..f2d0ef7 --- /dev/null +++ b/contents/5_Deep_Q_Network/RL_brain.py @@ -0,0 +1,213 @@ +""" +This part of code is the DQN brain, which is a brain of the agent. +All decisions are made in here. +Using Tensorflow to build the neural network. 
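The one_hot/reduce_sum pair in DQN_modified.py above is simply a per-row gather of Q(s, a) for the action actually taken; the same operation in NumPy, with illustrative values:

import numpy as np

q_eval = np.array([[1., 2., 3.],
                   [4., 5., 6.]])              # (batch, n_actions)
actions = np.array([0, 2])
mask = np.eye(q_eval.shape[1])[actions]        # plays the role of tf.one_hot
print((q_eval * mask).sum(axis=1))             # [1. 6.], i.e. Q(s0, a0) and Q(s1, a2)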
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.7.3 +""" + +import numpy as np +import pandas as pd +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +# Deep Q Network off-policy +class DeepQNetwork: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=300, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + # total learning step + self.learn_step_counter = 0 + + # initialize zero memory [s, a, r, s_] + self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) + + # consist of [target_net, evaluate_net] + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + self.cost_his = [] + + def _build_net(self): + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + with tf.variable_scope('eval_net'): + # c_names(collections_names) are the collections to store variables + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1) + + # second layer. collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_eval = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + # c_names(collections_names) are the collections to store variables + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1) + + # second layer. 
collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_next = tf.matmul(l1, w2) + b2 + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + + transition = np.hstack((s, [a, r], s_)) + + # replace the old memory with new memory + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + + self.memory_counter += 1 + + def choose_action(self, observation): + # to have batch dimension when feed into tf placeholder + observation = observation[np.newaxis, :] + + if np.random.uniform() < self.epsilon: + # forward feed the observation and get q value for every actions + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + # check to replace target parameters + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + # sample batch memory from all memory + if self.memory_counter > self.memory_size: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={ + self.s_: batch_memory[:, -self.n_features:], # fixed params + self.s: batch_memory[:, :self.n_features], # newest params + }) + + # change q_target w.r.t q_eval's action + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + """ + For example in this batch I have 2 samples and 3 actions: + q_eval = + [[1, 2, 3], + [4, 5, 6]] + + q_target = q_eval = + [[1, 2, 3], + [4, 5, 6]] + + Then change q_target with the real q_target value w.r.t the q_eval's action. + For example in: + sample 0, I took action 0, and the max q_target value is -1; + sample 1, I took action 2, and the max q_target value is -2: + q_target = + [[-1, 2, 3], + [4, 5, -2]] + + So the (q_target - q_eval) becomes: + [[(-1)-(1), 0, 0], + [0, 0, (-2)-(6)]] + + We then backpropagate this error w.r.t the corresponding action to network, + leave other action as error=0 cause we didn't choose it. 
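The worked example in the comment above can be reproduced directly; a short NumPy check using the same numbers:

import numpy as np

q_eval = np.array([[1., 2., 3.],
                   [4., 5., 6.]])
q_target = q_eval.copy()
actions = np.array([0, 2])             # actions taken in sample 0 and sample 1
targets = np.array([-1., -2.])         # r + gamma * max(q_next) for each sample
q_target[np.arange(2), actions] = targets
print(q_target - q_eval)               # [[-2.  0.  0.] [ 0.  0. -8.]]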
+ """ + + # train eval network + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + # increasing epsilon + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + def plot_cost(self): + import matplotlib.pyplot as plt + plt.plot(np.arange(len(self.cost_his)), self.cost_his) + plt.ylabel('Cost') + plt.xlabel('training steps') + plt.show() + + + diff --git a/contents/5_Deep_Q_Network/maze_env.py b/contents/5_Deep_Q_Network/maze_env.py new file mode 100644 index 0000000..5134df0 --- /dev/null +++ b/contents/5_Deep_Q_Network/maze_env.py @@ -0,0 +1,130 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. +The RL is in RL_brain.py. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.n_features = 2 + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + # hell2_center = origin + np.array([UNIT, UNIT * 2]) + # self.hell2 = self.canvas.create_rectangle( + # hell2_center[0] - 15, hell2_center[1] - 15, + # hell2_center[0] + 15, hell2_center[1] + 15, + # fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.1) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + 
if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + next_coords = self.canvas.coords(self.rect) # next state + + # reward function + if next_coords == self.canvas.coords(self.oval): + reward = 1 + done = True + elif next_coords in [self.canvas.coords(self.hell1)]: + reward = -1 + done = True + else: + reward = 0 + done = False + s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) + return s_, reward, done + + def render(self): + # time.sleep(0.01) + self.update() + + diff --git a/contents/5_Deep_Q_Network/run_this.py b/contents/5_Deep_Q_Network/run_this.py new file mode 100644 index 0000000..cec116b --- /dev/null +++ b/contents/5_Deep_Q_Network/run_this.py @@ -0,0 +1,61 @@ +""" +Sarsa is a online updating method for Reinforcement learning. + +Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory. + +You will see the sarsa is more coward when punishment is close because it cares about all behaviours, +while q learning is more brave because it only cares about maximum behaviour. +""" + +from maze_env import Maze +from RL_brain import DeepQNetwork + + +def run_maze(): + step = 0 + for episode in range(300): + # initial observation + observation = env.reset() + + while True: + # fresh env + env.render() + + # RL choose action based on observation + action = RL.choose_action(observation) + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + RL.store_transition(observation, action, reward, observation_) + + if (step > 200) and (step % 5 == 0): + RL.learn() + + # swap observation + observation = observation_ + + # break while loop when end of this episode + if done: + break + step += 1 + + # end of game + print('game over') + env.destroy() + + +if __name__ == "__main__": + # maze game + env = Maze() + RL = DeepQNetwork(env.n_actions, env.n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=200, + memory_size=2000, + # output_graph=True + ) + env.after(100, run_maze) + env.mainloop() + RL.plot_cost() \ No newline at end of file diff --git a/contents/6_OpenAI_gym/RL_brain.py b/contents/6_OpenAI_gym/RL_brain.py new file mode 100644 index 0000000..bc3796c --- /dev/null +++ b/contents/6_OpenAI_gym/RL_brain.py @@ -0,0 +1,213 @@ +""" +This part of code is the DQN brain, which is a brain of the agent. +All decisions are made in here. +Using Tensorflow to build the neural network. 
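One practical note on the e_greedy_increment argument used by these DQN brains: when it is set, epsilon starts at 0 and grows by that amount on every learn() call until it reaches e_greedy. A quick sketch of the timescale this implies, with illustrative values:

eps_max, inc = 0.9, 0.001              # illustrative: e_greedy=0.9, e_greedy_increment=0.001
calls_until_greedy = round(eps_max / inc)
print(calls_until_greedy)              # ~900 learn() calls before the policy becomes 90% greedy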
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import pandas as pd +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +# Deep Q Network off-policy +class DeepQNetwork: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=300, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + # total learning step + self.learn_step_counter = 0 + + # initialize zero memory [s, a, r, s_] + self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) + + # consist of [target_net, evaluate_net] + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + self.cost_his = [] + + def _build_net(self): + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + with tf.variable_scope('eval_net'): + # c_names(collections_names) are the collections to store variables + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1) + + # second layer. collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_eval = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + # c_names(collections_names) are the collections to store variables + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1) + + # second layer. 
collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_next = tf.matmul(l1, w2) + b2 + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + + transition = np.hstack((s, [a, r], s_)) + + # replace the old memory with new memory + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + + self.memory_counter += 1 + + def choose_action(self, observation): + # to have batch dimension when feed into tf placeholder + observation = observation[np.newaxis, :] + + if np.random.uniform() < self.epsilon: + # forward feed the observation and get q value for every actions + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + # check to replace target parameters + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + # sample batch memory from all memory + if self.memory_counter > self.memory_size: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={ + self.s_: batch_memory[:, -self.n_features:], # fixed params + self.s: batch_memory[:, :self.n_features], # newest params + }) + + # change q_target w.r.t q_eval's action + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + """ + For example in this batch I have 2 samples and 3 actions: + q_eval = + [[1, 2, 3], + [4, 5, 6]] + + q_target = q_eval = + [[1, 2, 3], + [4, 5, 6]] + + Then change q_target with the real q_target value w.r.t the q_eval's action. + For example in: + sample 0, I took action 0, and the max q_target value is -1; + sample 1, I took action 2, and the max q_target value is -2: + q_target = + [[-1, 2, 3], + [4, 5, -2]] + + So the (q_target - q_eval) becomes: + [[(-1)-(1), 0, 0], + [0, 0, (-2)-(6)]] + + We then backpropagate this error w.r.t the corresponding action to network, + leave other action as error=0 cause we didn't choose it. 
+ """ + + # train eval network + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + # increasing epsilon + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + def plot_cost(self): + import matplotlib.pyplot as plt + plt.plot(np.arange(len(self.cost_his)), self.cost_his) + plt.ylabel('Cost') + plt.xlabel('training steps') + plt.show() + + + diff --git a/contents/6_OpenAI_gym/run_CartPole.py b/contents/6_OpenAI_gym/run_CartPole.py new file mode 100644 index 0000000..104bde4 --- /dev/null +++ b/contents/6_OpenAI_gym/run_CartPole.py @@ -0,0 +1,62 @@ +""" +Deep Q network, + +Using: +Tensorflow: 1.0 +gym: 0.7.3 +""" + + +import gym +from RL_brain import DeepQNetwork + +env = gym.make('CartPole-v0') +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = DeepQNetwork(n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.01, e_greedy=0.9, + replace_target_iter=100, memory_size=2000, + e_greedy_increment=0.001,) + +total_steps = 0 + + +for i_episode in range(100): + + observation = env.reset() + ep_r = 0 + while True: + env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + # the smaller theta and closer to center the better + x, x_dot, theta, theta_dot = observation_ + r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8 + r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5 + reward = r1 + r2 + + RL.store_transition(observation, action, reward, observation_) + + ep_r += reward + if total_steps > 1000: + RL.learn() + + if done: + print('episode: ', i_episode, + 'ep_r: ', round(ep_r, 2), + ' epsilon: ', round(RL.epsilon, 2)) + break + + observation = observation_ + total_steps += 1 + +RL.plot_cost() diff --git a/contents/6_OpenAI_gym/run_MountainCar.py b/contents/6_OpenAI_gym/run_MountainCar.py new file mode 100644 index 0000000..cdda953 --- /dev/null +++ b/contents/6_OpenAI_gym/run_MountainCar.py @@ -0,0 +1,61 @@ +""" +Deep Q network, + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DeepQNetwork + +env = gym.make('MountainCar-v0') +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, e_greedy=0.9, + replace_target_iter=300, memory_size=3000, + e_greedy_increment=0.0002,) + +total_steps = 0 + + +for i_episode in range(10): + + observation = env.reset() + ep_r = 0 + while True: + env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + position, velocity = observation_ + + # the higher the better + reward = abs(position - (-0.5)) # r in [0, 1] + + RL.store_transition(observation, action, reward, observation_) + + if total_steps > 1000: + RL.learn() + + ep_r += reward + if done: + get = '| Get' if observation_[0] >= env.unwrapped.goal_position else '| ----' + print('Epi: ', i_episode, + get, + '| Ep_r: ', round(ep_r, 4), + '| Epsilon: ', round(RL.epsilon, 2)) + break + + observation = observation_ + total_steps += 1 + +RL.plot_cost() diff --git a/contents/7_Policy_gradient_softmax/RL_brain.py 
b/contents/7_Policy_gradient_softmax/RL_brain.py new file mode 100644 index 0000000..e76b234 --- /dev/null +++ b/contents/7_Policy_gradient_softmax/RL_brain.py @@ -0,0 +1,124 @@ +""" +This part of code is the reinforcement learning brain, which is a brain of the agent. +All decisions are made in here. + +Policy Gradient, Reinforcement Learning. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +# reproducible +np.random.seed(1) +tf.set_random_seed(1) + + +class PolicyGradient: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.95, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + + self.ep_obs, self.ep_as, self.ep_rs = [], [], [] + + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # http://0.0.0.0:6006/ + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + + def _build_net(self): + with tf.name_scope('inputs'): + self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations") + self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num") + self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value") + # fc1 + layer = tf.layers.dense( + inputs=self.tf_obs, + units=10, + activation=tf.nn.tanh, # tanh activation + kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3), + bias_initializer=tf.constant_initializer(0.1), + name='fc1' + ) + # fc2 + all_act = tf.layers.dense( + inputs=layer, + units=self.n_actions, + activation=None, + kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3), + bias_initializer=tf.constant_initializer(0.1), + name='fc2' + ) + + self.all_act_prob = tf.nn.softmax(all_act, name='act_prob') # use softmax to convert to probability + + with tf.name_scope('loss'): + # to maximize total reward (log_p * R) is to minimize -(log_p * R), and the tf only have minimize(loss) + neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts) # this is negative log of chosen action + # or in this way: + # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1) + loss = tf.reduce_mean(neg_log_prob * self.tf_vt) # reward guided loss + + with tf.name_scope('train'): + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) + + def choose_action(self, observation): + prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]}) + action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel()) # select action w.r.t the actions prob + return action + + def store_transition(self, s, a, r): + self.ep_obs.append(s) + self.ep_as.append(a) + self.ep_rs.append(r) + + def learn(self): + # discount and normalize episode reward + discounted_ep_rs_norm = self._discount_and_norm_rewards() + + # train on episode + self.sess.run(self.train_op, feed_dict={ + self.tf_obs: np.vstack(self.ep_obs), # shape=[None, n_obs] + self.tf_acts: np.array(self.ep_as), # shape=[None, ] + self.tf_vt: discounted_ep_rs_norm, # shape=[None, ] + }) + + self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data + return discounted_ep_rs_norm + + def _discount_and_norm_rewards(self): + # discount episode 
rewards + discounted_ep_rs = np.zeros_like(self.ep_rs) + running_add = 0 + for t in reversed(range(0, len(self.ep_rs))): + running_add = running_add * self.gamma + self.ep_rs[t] + discounted_ep_rs[t] = running_add + + # normalize episode rewards + discounted_ep_rs -= np.mean(discounted_ep_rs) + discounted_ep_rs /= np.std(discounted_ep_rs) + return discounted_ep_rs + + + diff --git a/contents/7_Policy_gradient_softmax/run_CartPole.py b/contents/7_Policy_gradient_softmax/run_CartPole.py new file mode 100644 index 0000000..7d46aee --- /dev/null +++ b/contents/7_Policy_gradient_softmax/run_CartPole.py @@ -0,0 +1,69 @@ +""" +Policy Gradient, Reinforcement Learning. + +The cart pole example + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import gym +from RL_brain import PolicyGradient +import matplotlib.pyplot as plt + +DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold +RENDER = False # rendering wastes time + +env = gym.make('CartPole-v0') +env.seed(1) # reproducible, general Policy gradient has high variance +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = PolicyGradient( + n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.02, + reward_decay=0.99, + # output_graph=True, +) + +for i_episode in range(3000): + + observation = env.reset() + + while True: + if RENDER: env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + RL.store_transition(observation, action, reward) + + if done: + ep_rs_sum = sum(RL.ep_rs) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + print("episode:", i_episode, " reward:", int(running_reward)) + + vt = RL.learn() + + if i_episode == 0: + plt.plot(vt) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.show() + break + + observation = observation_ diff --git a/contents/7_Policy_gradient_softmax/run_MountainCar.py b/contents/7_Policy_gradient_softmax/run_MountainCar.py new file mode 100644 index 0000000..926269d --- /dev/null +++ b/contents/7_Policy_gradient_softmax/run_MountainCar.py @@ -0,0 +1,76 @@ +""" +Policy Gradient, Reinforcement Learning. 
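This script relies on the PolicyGradient brain defined above; at the end of each episode it trains on the
discounted, normalized episode return. A rough standalone sketch of that computation (same logic as
_discount_and_norm_rewards above, function name here is illustrative):

import numpy as np

def discounted_normalized_returns(rewards, gamma=0.995):
    # accumulate reward-to-go from the end of the episode backwards
    returns = np.zeros(len(rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        returns[t] = running_add
    # normalizing reduces the variance of the policy-gradient estimate
    returns -= returns.mean()
    returns /= returns.std()
    return returns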
+ +The cart pole example + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import gym +from RL_brain import PolicyGradient +import matplotlib.pyplot as plt + +DISPLAY_REWARD_THRESHOLD = -2000 # renders environment if total episode reward is greater then this threshold +# episode: 154 reward: -10667 +# episode: 387 reward: -2009 +# episode: 489 reward: -1006 +# episode: 628 reward: -502 + +RENDER = False # rendering wastes time + +env = gym.make('MountainCar-v0') +env.seed(1) # reproducible, general Policy gradient has high variance +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = PolicyGradient( + n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.02, + reward_decay=0.995, + # output_graph=True, +) + +for i_episode in range(1000): + + observation = env.reset() + + while True: + if RENDER: env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) # reward = -1 in all cases + + RL.store_transition(observation, action, reward) + + if done: + # calculate running reward + ep_rs_sum = sum(RL.ep_rs) + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + + print("episode:", i_episode, " reward:", int(running_reward)) + + vt = RL.learn() # train + + if i_episode == 30: + plt.plot(vt) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.show() + + break + + observation = observation_ diff --git a/contents/8_Actor_Critic_Advantage/AC_CartPole.py b/contents/8_Actor_Critic_Advantage/AC_CartPole.py new file mode 100644 index 0000000..b01a4b9 --- /dev/null +++ b/contents/8_Actor_Critic_Advantage/AC_CartPole.py @@ -0,0 +1,169 @@ +""" +Actor-Critic using TD-error as the Advantage, Reinforcement Learning. + +The cart pole example. Policy is oscillated. 
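In compressed form, the update implemented by the Actor and Critic classes below is (a scalar sketch,
illustration only, not the actual TensorFlow graph):

GAMMA = 0.9   # same reward discount as in the script below

def actor_critic_step(r, v_s, v_s_next, log_prob_a):
    # the one-step TD error doubles as the advantage estimate
    td_error = r + GAMMA * v_s_next - v_s
    critic_loss = td_error ** 2                # Critic minimizes the squared TD error
    actor_objective = log_prob_a * td_error    # Actor maximizes log pi(a|s) * TD error
    return td_error, critic_loss, actor_objective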
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import numpy as np +import tensorflow as tf +import gym + +np.random.seed(2) +tf.set_random_seed(2) # reproducible + +# Superparameters +OUTPUT_GRAPH = False +MAX_EPISODE = 3000 +DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this threshold +MAX_EP_STEPS = 1000 # maximum time step in one episode +RENDER = False # rendering wastes time +GAMMA = 0.9 # reward discount in TD error +LR_A = 0.001 # learning rate for actor +LR_C = 0.01 # learning rate for critic + +env = gym.make('CartPole-v0') +env.seed(1) # reproducible +env = env.unwrapped + +N_F = env.observation_space.shape[0] +N_A = env.action_space.n + + +class Actor(object): + def __init__(self, sess, n_features, n_actions, lr=0.001): + self.sess = sess + + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.a = tf.placeholder(tf.int32, None, "act") + self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error + + with tf.variable_scope('Actor'): + l1 = tf.layers.dense( + inputs=self.s, + units=20, # number of hidden units + activation=tf.nn.relu, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + self.acts_prob = tf.layers.dense( + inputs=l1, + units=n_actions, # output units + activation=tf.nn.softmax, # get action probabilities + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='acts_prob' + ) + + with tf.variable_scope('exp_v'): + log_prob = tf.log(self.acts_prob[0, self.a]) + self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss + + with tf.variable_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) + + def learn(self, s, a, td): + s = s[np.newaxis, :] + feed_dict = {self.s: s, self.a: a, self.td_error: td} + _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict) + return exp_v + + def choose_action(self, s): + s = s[np.newaxis, :] + probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions + return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return a int + + +class Critic(object): + def __init__(self, sess, n_features, lr=0.01): + self.sess = sess + + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next") + self.r = tf.placeholder(tf.float32, None, 'r') + + with tf.variable_scope('Critic'): + l1 = tf.layers.dense( + inputs=self.s, + units=20, # number of hidden units + activation=tf.nn.relu, # None + # have to be linear to make sure the convergence of actor. + # But linear approximator seems hardly learns the correct Q. 
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + self.v = tf.layers.dense( + inputs=l1, + units=1, # output units + activation=None, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='V' + ) + + with tf.variable_scope('squared_TD_error'): + self.td_error = self.r + GAMMA * self.v_ - self.v + self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval + with tf.variable_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) + + def learn(self, s, r, s_): + s, s_ = s[np.newaxis, :], s_[np.newaxis, :] + + v_ = self.sess.run(self.v, {self.s: s_}) + td_error, _ = self.sess.run([self.td_error, self.train_op], + {self.s: s, self.v_: v_, self.r: r}) + return td_error + + +sess = tf.Session() + +actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A) +critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor + +sess.run(tf.global_variables_initializer()) + +if OUTPUT_GRAPH: + tf.summary.FileWriter("logs/", sess.graph) + +for i_episode in range(MAX_EPISODE): + s = env.reset() + t = 0 + track_r = [] + while True: + if RENDER: env.render() + + a = actor.choose_action(s) + + s_, r, done, info = env.step(a) + + if done: r = -20 + + track_r.append(r) + + td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] + actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] + + s = s_ + t += 1 + + if done or t >= MAX_EP_STEPS: + ep_rs_sum = sum(track_r) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + print("episode:", i_episode, " reward:", int(running_reward)) + break + diff --git a/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py new file mode 100644 index 0000000..07fc378 --- /dev/null +++ b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py @@ -0,0 +1,179 @@ +""" +Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning. + +The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) + +Cannot converge!!! oscillate!!! 
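The continuous action is drawn from a Gaussian policy; a rough NumPy sketch of what choose_action()
below amounts to (names here are illustrative):

import numpy as np

def sample_action(mu, sigma, action_bound):
    # mu comes from a tanh head scaled to the action range, sigma from a softplus head
    a = np.random.normal(loc=mu, scale=sigma)
    return np.clip(a, action_bound[0], action_bound[1])

# e.g. sample_action(mu=0.3, sigma=0.5, action_bound=(-2.0, 2.0))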
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import tensorflow as tf +import numpy as np +import gym + +np.random.seed(2) +tf.set_random_seed(2) # reproducible + + +class Actor(object): + def __init__(self, sess, n_features, action_bound, lr=0.0001): + self.sess = sess + + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.a = tf.placeholder(tf.float32, None, name="act") + self.td_error = tf.placeholder(tf.float32, None, name="td_error") # TD_error + + l1 = tf.layers.dense( + inputs=self.s, + units=30, # number of hidden units + activation=tf.nn.relu, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + mu = tf.layers.dense( + inputs=l1, + units=1, # number of hidden units + activation=tf.nn.tanh, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='mu' + ) + + sigma = tf.layers.dense( + inputs=l1, + units=1, # output units + activation=tf.nn.softplus, # get action probabilities + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(1.), # biases + name='sigma' + ) + global_step = tf.Variable(0, trainable=False) + # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9) + self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1) + self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma) + + self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1]) + + with tf.name_scope('exp_v'): + log_prob = self.normal_dist.log_prob(self.a) # loss without advantage + self.exp_v = log_prob * self.td_error # advantage (TD_error) guided loss + # Add cross entropy cost to encourage exploration + self.exp_v += self.normal_dist.entropy() + + with tf.name_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step) # min(v) = max(-v) + + def learn(self, s, a, td): + s = s[np.newaxis, :] + feed_dict = {self.s: s, self.a: a, self.td_error: td} + _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict) + return exp_v + + def choose_action(self, s): + s = s[np.newaxis, :] + return self.sess.run(self.action, {self.s: s}) # get probabilities for all actions + + +class Critic(object): + def __init__(self, sess, n_features, lr=0.01): + self.sess = sess + with tf.name_scope('inputs'): + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next") + self.r = tf.placeholder(tf.float32, name='r') + + with tf.variable_scope('Critic'): + l1 = tf.layers.dense( + inputs=self.s, + units=30, # number of hidden units + activation=tf.nn.relu, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + self.v = tf.layers.dense( + inputs=l1, + units=1, # output units + activation=None, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='V' + ) + + with tf.variable_scope('squared_TD_error'): + self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v) + self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval + with tf.variable_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) + + def learn(self, s, r, s_): + s, s_ = 
s[np.newaxis, :], s_[np.newaxis, :] + + v_ = self.sess.run(self.v, {self.s: s_}) + td_error, _ = self.sess.run([self.td_error, self.train_op], + {self.s: s, self.v_: v_, self.r: r}) + return td_error + + +OUTPUT_GRAPH = False +MAX_EPISODE = 1000 +MAX_EP_STEPS = 300 +DISPLAY_REWARD_THRESHOLD = -550 # renders environment if total episode reward is greater then this threshold +RENDER = False # rendering wastes time +GAMMA = 0.9 +LR_A = 0.001 # learning rate for actor +LR_C = 0.01 # learning rate for critic + +env = gym.make('Pendulum-v0') +env.seed(1) # reproducible +env = env.unwrapped + +N_S = env.observation_space.shape[0] +A_BOUND = env.action_space.high + +sess = tf.Session() + +actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND]) +critic = Critic(sess, n_features=N_S, lr=LR_C) + +sess.run(tf.global_variables_initializer()) + +if OUTPUT_GRAPH: + tf.summary.FileWriter("logs/", sess.graph) + +for i_episode in range(MAX_EPISODE): + s = env.reset() + t = 0 + ep_rs = [] + while True: + # if RENDER: + env.render() + a = actor.choose_action(s) + + s_, r, done, info = env.step(a) + r /= 10 + + td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] + actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] + + s = s_ + t += 1 + ep_rs.append(r) + if t > MAX_EP_STEPS: + ep_rs_sum = sum(ep_rs) + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.9 + ep_rs_sum * 0.1 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + print("episode:", i_episode, " reward:", int(running_reward)) + break + diff --git a/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py b/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py new file mode 100644 index 0000000..629eb6e --- /dev/null +++ b/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py @@ -0,0 +1,252 @@ +""" +Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. +DDPG is Actor Critic based algorithm. +Pendulum example. 
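A compressed sketch of the two updates built below (illustration only; the script wires them up as
TensorFlow ops):

GAMMA = 0.9   # same reward discount as below

def critic_target(r, q_next):
    # q_next is Q'(s_, mu'(s_)): the target critic evaluated at the target actor's action
    return r + GAMMA * q_next

# The actor is improved by chaining gradients, d Q(s, a) / d a  *  d mu(s) / d theta,
# which is what Critic.a_grads together with Actor.add_grad_to_graph() build with tf.gradients.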
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import tensorflow as tf +import numpy as np +import gym + +np.random.seed(1) +tf.set_random_seed(1) + +##################### hyper parameters #################### + +MAX_EPISODES = 70 +MAX_EP_STEPS = 400 +LR_A = 0.01 # learning rate for actor +LR_C = 0.01 # learning rate for critic +GAMMA = 0.9 # reward discount +TAU = 0.01 # Soft update for target param, but this is computationally expansive +# so we use replace_iter instead +REPLACE_ITER_A = 500 +REPLACE_ITER_C = 300 +MEMORY_CAPACITY = 7000 +BATCH_SIZE = 32 + +RENDER = False +OUTPUT_GRAPH = True +ENV_NAME = 'Pendulum-v0' + +############################### Actor #################################### + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.3) + init_b = tf.constant_initializer(0.1) + net = tf.layers.dense(s, 30, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', + trainable=trainable) + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + bias_initializer=init_b, name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + # the following method for soft replace target params is computational expansive + # target_params = (1-tau) * target_params + tau * eval_params + # self.sess.run([tf.assign(t, (1 - self.tau) * t + self.tau * e) for t, e in zip(self.t_params, self.e_params)]) + + # instead of above method, I use a hard replacement here + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + # ys = policy; + # xs = policy's parameters; + # self.a_grads = the gradients of the policy to get more Q + # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams + self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.AdamOptimizer(-self.lr / BATCH_SIZE) # (- learning rate) for ascent policy, div to take mean + self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params)) + + +############################### Critic #################################### + +class Critic(object): + def 
__init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss) + + with tf.variable_scope('a_grad'): + self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.1) + init_b = tf.constant_initializer(0.1) + + with tf.variable_scope('l1'): + n_l1 = 30 + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_): + self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_}) + # the following method for soft replace target params is computational expansive + # target_params = (1-tau) * target_params + tau * eval_params + # self.sess.run([tf.assign(t, (1 - self.tau) * t + self.tau * e) for t, e in zip(self.t_params, self.e_params)]) + + # instead of above method, we use a hard replacement here + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + +##################### Memory #################### + +class Memory(object): + def __init__(self, capacity, dims): + self.capacity = capacity + self.data = np.zeros((capacity, dims)) + self.pointer = 0 + + def store_transition(self, s, a, r, s_): + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % self.capacity # replace the old memory with new memory + self.data[index, :] = transition + self.pointer += 1 + + def sample(self, n): + assert self.pointer >= self.capacity, 'Memory has not been fulfilled' + indices = np.random.choice(self.capacity, size=n) + return self.data[indices, :] + + +env = gym.make(ENV_NAME) +env = env.unwrapped +env.seed(1) + +state_dim = env.observation_space.shape[0] +action_dim = env.action_space.shape[0] +action_bound = env.action_space.high + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, action_dim], name='a') +with tf.name_scope('R'): + R = 
tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_') + + +sess = tf.Session() + +# Create actor and critic. +# They are actually connected to each other, details can be seen in tensorboard or in this picture: +actor = Actor(sess, action_dim, action_bound, LR_A, REPLACE_ITER_A) +critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +sess.run(tf.global_variables_initializer()) + +M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1) + +if OUTPUT_GRAPH: + tf.summary.FileWriter("logs/", sess.graph) + +var = 3 # control exploration + +for i in range(MAX_EPISODES): + s = env.reset() + ep_reward = 0 + + for j in range(MAX_EP_STEPS): + + if RENDER: + env.render() + + # Added exploration noise + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration + s_, r, done, info = env.step(a) + + M.store_transition(s, a, r / 10, s_) + + if M.pointer > MEMORY_CAPACITY: + var *= .9995 # decay the action randomness + b_M = M.sample(BATCH_SIZE) + b_s = b_M[:, :state_dim] + b_a = b_M[:, state_dim: state_dim + action_dim] + b_r = b_M[:, -state_dim - 1: -state_dim] + b_s_ = b_M[:, -state_dim:] + + critic.learn(b_s, b_a, b_r, b_s_) + actor.learn(b_s, b_a) + + s = s_ + ep_reward += r + + if j == MAX_EP_STEPS-1: + print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, ) + if ep_reward > -1000: + RENDER = True + break \ No newline at end of file diff --git a/experiments/2D_car/DDPG.py b/experiments/2D_car/DDPG.py new file mode 100644 index 0000000..f91bfe0 --- /dev/null +++ b/experiments/2D_car/DDPG.py @@ -0,0 +1,269 @@ +""" +Environment is a 2D car. +Car has 5 sensors to obtain distance information. + +Car collision => reward = -1, otherwise => reward = 0. + +You can train this RL by using LOAD = False, after training, this model will be store in the a local folder. +Using LOAD = True to reload the trained model for playing. + +You can customize this script in a way you want. 
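Each transition in the replay memory below is stored as one flat row, np.hstack((s, a, [r], s_)),
and sampled batches are split back by column ranges. A small sketch of that layout (dimensions match
the 2D car environment: 5 sensor readings, 1 steering action):

import numpy as np

STATE_DIM, ACTION_DIM = 5, 1
b_M = np.zeros((16, 2 * STATE_DIM + ACTION_DIM + 1))   # dummy batch of 16 rows
b_s  = b_M[:, :STATE_DIM]                              # columns 0..4  -> s
b_a  = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]       # column 5      -> a
b_r  = b_M[:, -STATE_DIM - 1: -STATE_DIM]              # column 6      -> r
b_s_ = b_M[:, -STATE_DIM:]                             # columns 7..11 -> s_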
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +tensorflow >= 1.0.1 +""" + +import tensorflow as tf +import numpy as np +import os +import shutil +from car_env import CarEnv + + +np.random.seed(1) +tf.set_random_seed(1) + +MAX_EPISODES = 225 +MAX_EP_STEPS = 600 +LR_A = 1e-3 # learning rate for actor +LR_C = 1e-3 # learning rate for critic +GAMMA = 0.95 # reward discount +REPLACE_ITER_A = 800 +REPLACE_ITER_C = 700 +MEMORY_CAPACITY = 5000 +BATCH_SIZE = 16 +VAR_MIN = 0.1 +RENDER = True +LOAD = False +DISCRETE_ACTION = False + +env = CarEnv(discrete_action=DISCRETE_ACTION) +STATE_DIM = env.state_dim +ACTION_DIM = env.action_dim +ACTION_BOUND = env.action_bound + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a') +with tf.name_scope('R'): + R = tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_') + + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.001) + net = tf.layers.dense(s, 100, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', + trainable=trainable) + net = tf.layers.dense(net, 20, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE) # (- learning rate) for ascent policy, div to take mean + self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params)) + + +class Critic(object): + def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + 
self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + with tf.variable_scope('a_grad'): + self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.01) + + with tf.variable_scope('l1'): + n_l1 = 100 + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + net = tf.layers.dense(net, 20, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_): + self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + +class Memory(object): + def __init__(self, capacity, dims): + self.capacity = capacity + self.data = np.zeros((capacity, dims)) + self.pointer = 0 + + def store_transition(self, s, a, r, s_): + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % self.capacity # replace the old memory with new memory + self.data[index, :] = transition + self.pointer += 1 + + def sample(self, n): + assert self.pointer >= self.capacity, 'Memory has not been fulfilled' + indices = np.random.choice(self.capacity, size=n) + return self.data[indices, :] + + +sess = tf.Session() + +# Create actor and critic. +actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A) +critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1) + +saver = tf.train.Saver() +path = './discrete' if DISCRETE_ACTION else './continuous' + +if LOAD: + saver.restore(sess, tf.train.latest_checkpoint(path)) +else: + sess.run(tf.global_variables_initializer()) + + +def train(): + var = 2. 
# control exploration + for ep in range(MAX_EPISODES): + s = env.reset() + ep_step = 0 + + for t in range(MAX_EP_STEPS): + # while True: + if RENDER: + env.render() + + # Added exploration noise + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration + s_, r, done = env.step(a) + M.store_transition(s, a, r, s_) + + if M.pointer > MEMORY_CAPACITY: + var = max([var*.9995, VAR_MIN]) # decay the action randomness + b_M = M.sample(BATCH_SIZE) + b_s = b_M[:, :STATE_DIM] + b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM] + b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM] + b_s_ = b_M[:, -STATE_DIM:] + + critic.learn(b_s, b_a, b_r, b_s_) + actor.learn(b_s, b_a) + + s = s_ + ep_step += 1 + + if done or t == MAX_EP_STEPS - 1: + # if done: + print('Ep:', ep, + '| Steps: %i' % int(ep_step), + '| Explore: %.2f' % var, + ) + break + + if os.path.isdir(path): shutil.rmtree(path) + os.mkdir(path) + ckpt_path = os.path.join(path, 'DDPG.ckpt') + save_path = saver.save(sess, ckpt_path, write_meta_graph=False) + print("\nSave Model %s\n" % save_path) + + +def eval(): + env.set_fps(30) + while True: + s = env.reset() + while True: + env.render() + a = actor.choose_action(s) + s_, r, done = env.step(a) + s = s_ + if done: + break + +if __name__ == '__main__': + if LOAD: + eval() + else: + train() \ No newline at end of file diff --git a/experiments/2D_car/car_env.py b/experiments/2D_car/car_env.py new file mode 100644 index 0000000..8a8ed2b --- /dev/null +++ b/experiments/2D_car/car_env.py @@ -0,0 +1,234 @@ +""" +Environment for 2D car driving. +You can customize this script in a way you want. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +""" +import numpy as np +import pyglet + + +pyglet.clock.set_fps_limit(10000) + + +class CarEnv(object): + n_sensor = 5 + action_dim = 1 + state_dim = n_sensor + viewer = None + viewer_xy = (500, 500) + sensor_max = 150. + start_point = [450, 300] + speed = 50. 
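    # Summary of the interface below: _get_state() returns the 5 sensor distances divided by
    # sensor_max, step() takes a single steering action (clipped to [-1, 1] in continuous mode,
    # or one of {-1, 0, 1} in discrete mode) and returns (state, reward, terminal),
    # where reward is -1 on a collision and 0 otherwise.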
+ dt = 0.1 + + def __init__(self, discrete_action=False): + self.is_discrete_action = discrete_action + if discrete_action: + self.actions = [-1, 0, 1] + else: + self.action_bound = [-1, 1] + + self.terminal = False + # node1 (x, y, r, w, l), + self.car_info = np.array([0, 0, 0, 20, 40], dtype=np.float64) # car coordination + self.obstacle_coords = np.array([ + [120, 120], + [380, 120], + [380, 380], + [120, 380], + ]) + self.sensor_info = self.sensor_max + np.zeros((self.n_sensor, 3)) # n sensors, (distance, end_x, end_y) + + def step(self, action): + if self.is_discrete_action: + action = self.actions[action] + else: + action = np.clip(action, *self.action_bound)[0] + self.car_info[2] += action * np.pi/30 # max r = 6 degree + self.car_info[:2] = self.car_info[:2] + \ + self.speed * self.dt * np.array([np.cos(self.car_info[2]), np.sin(self.car_info[2])]) + + self._update_sensor() + s = self._get_state() + r = -1 if self.terminal else 0 + return s, r, self.terminal + + def reset(self): + self.terminal = False + self.car_info[:3] = np.array([*self.start_point, -np.pi/2]) + self._update_sensor() + return self._get_state() + + def render(self): + if self.viewer is None: + self.viewer = Viewer(*self.viewer_xy, self.car_info, self.sensor_info, self.obstacle_coords) + self.viewer.render() + + def sample_action(self): + if self.is_discrete_action: + a = np.random.choice(list(range(3))) + else: + a = np.random.uniform(*self.action_bound, size=self.action_dim) + return a + + def set_fps(self, fps=30): + pyglet.clock.set_fps_limit(fps) + + def _get_state(self): + s = self.sensor_info[:, 0].flatten()/self.sensor_max + return s + + def _update_sensor(self): + cx, cy, rotation = self.car_info[:3] + + n_sensors = len(self.sensor_info) + sensor_theta = np.linspace(-np.pi / 2, np.pi / 2, n_sensors) + xs = cx + (np.zeros((n_sensors, ))+self.sensor_max) * np.cos(sensor_theta) + ys = cy + (np.zeros((n_sensors, ))+self.sensor_max) * np.sin(sensor_theta) + xys = np.array([[x, y] for x, y in zip(xs, ys)]) # shape (5 sensors, 2) + + # sensors + tmp_x = xys[:, 0] - cx + tmp_y = xys[:, 1] - cy + # apply rotation + rotated_x = tmp_x * np.cos(rotation) - tmp_y * np.sin(rotation) + rotated_y = tmp_x * np.sin(rotation) + tmp_y * np.cos(rotation) + # rotated x y + self.sensor_info[:, -2:] = np.vstack([rotated_x+cx, rotated_y+cy]).T + + q = np.array([cx, cy]) + for si in range(len(self.sensor_info)): + s = self.sensor_info[si, -2:] - q + possible_sensor_distance = [self.sensor_max] + possible_intersections = [self.sensor_info[si, -2:]] + + # obstacle collision + for oi in range(len(self.obstacle_coords)): + p = self.obstacle_coords[oi] + r = self.obstacle_coords[(oi + 1) % len(self.obstacle_coords)] - self.obstacle_coords[oi] + if np.cross(r, s) != 0: # may collision + t = np.cross((q - p), s) / np.cross(r, s) + u = np.cross((q - p), r) / np.cross(r, s) + if 0 <= t <= 1 and 0 <= u <= 1: + intersection = q + u * s + possible_intersections.append(intersection) + possible_sensor_distance.append(np.linalg.norm(u*s)) + + # window collision + win_coord = np.array([ + [0, 0], + [self.viewer_xy[0], 0], + [*self.viewer_xy], + [0, self.viewer_xy[1]], + [0, 0], + ]) + for oi in range(4): + p = win_coord[oi] + r = win_coord[(oi + 1) % len(win_coord)] - win_coord[oi] + if np.cross(r, s) != 0: # may collision + t = np.cross((q - p), s) / np.cross(r, s) + u = np.cross((q - p), r) / np.cross(r, s) + if 0 <= t <= 1 and 0 <= u <= 1: + intersection = p + t * r + possible_intersections.append(intersection) + 
possible_sensor_distance.append(np.linalg.norm(intersection - q)) + + distance = np.min(possible_sensor_distance) + distance_index = np.argmin(possible_sensor_distance) + self.sensor_info[si, 0] = distance + self.sensor_info[si, -2:] = possible_intersections[distance_index] + if distance < self.car_info[-1]/2: + self.terminal = True + + +class Viewer(pyglet.window.Window): + color = { + 'background': [1]*3 + [1] + } + fps_display = pyglet.clock.ClockDisplay() + bar_thc = 5 + + def __init__(self, width, height, car_info, sensor_info, obstacle_coords): + super(Viewer, self).__init__(width, height, resizable=False, caption='2D car', vsync=False) # vsync=False to not use the monitor FPS + self.set_location(x=80, y=10) + pyglet.gl.glClearColor(*self.color['background']) + + self.car_info = car_info + self.sensor_info = sensor_info + + self.batch = pyglet.graphics.Batch() + background = pyglet.graphics.OrderedGroup(0) + foreground = pyglet.graphics.OrderedGroup(1) + + self.sensors = [] + line_coord = [0, 0] * 2 + c = (73, 73, 73) * 2 + for i in range(len(self.sensor_info)): + self.sensors.append(self.batch.add(2, pyglet.gl.GL_LINES, foreground, ('v2f', line_coord), ('c3B', c))) + + car_box = [0, 0] * 4 + c = (249, 86, 86) * 4 + self.car = self.batch.add(4, pyglet.gl.GL_QUADS, foreground, ('v2f', car_box), ('c3B', c)) + + c = (134, 181, 244) * 4 + self.obstacle = self.batch.add(4, pyglet.gl.GL_QUADS, background, ('v2f', obstacle_coords.flatten()), ('c3B', c)) + + def render(self): + pyglet.clock.tick() + self._update() + self.switch_to() + self.dispatch_events() + self.dispatch_event('on_draw') + self.flip() + + def on_draw(self): + self.clear() + self.batch.draw() + # self.fps_display.draw() + + def _update(self): + cx, cy, r, w, l = self.car_info + + # sensors + for i, sensor in enumerate(self.sensors): + sensor.vertices = [cx, cy, *self.sensor_info[i, -2:]] + + # car + xys = [ + [cx + l / 2, cy + w / 2], + [cx - l / 2, cy + w / 2], + [cx - l / 2, cy - w / 2], + [cx + l / 2, cy - w / 2], + ] + r_xys = [] + for x, y in xys: + tempX = x - cx + tempY = y - cy + # apply rotation + rotatedX = tempX * np.cos(r) - tempY * np.sin(r) + rotatedY = tempX * np.sin(r) + tempY * np.cos(r) + # rotated x y + x = rotatedX + cx + y = rotatedY + cy + r_xys += [x, y] + self.car.vertices = r_xys + + +if __name__ == '__main__': + np.random.seed(1) + env = CarEnv() + env.set_fps(30) + for ep in range(20): + s = env.reset() + # for t in range(100): + while True: + env.render() + s, r, done = env.step(env.sample_action()) + if done: + break \ No newline at end of file diff --git a/experiments/2D_car/collision.py b/experiments/2D_car/collision.py new file mode 100644 index 0000000..1b77643 --- /dev/null +++ b/experiments/2D_car/collision.py @@ -0,0 +1,57 @@ +import numpy as np + +def intersection(): + p = np.array([0, 0]) + r = np.array([1, 1]) + q = np.array([0.1, 0.1]) + s = np.array([.1, .1]) + + if np.cross(r, s) == 0 and np.cross((q-p), r) == 0: # collinear + # t0 = (q − p) · r / (r · r) + # t1 = (q + s − p) · r / (r · r) = t0 + s · r / (r · r) + t0 = np.dot(q-p, r)/np.dot(r, r) + t1 = t0 + np.dot(s, r)/np.dot(r, r) + print(t1, t0) + if ((np.dot(s, r) > 0) and (0 <= t1 - t0 <= 1)) or ((np.dot(s, r) <= 0) and (0 <= t0 - t1 <= 1)): + print('collinear and overlapping, q_s in p_r') + else: + print('collinear and disjoint') + elif np.cross(r, s) == 0 and np.cross((q-p), r) != 0: # parallel r × s = 0 and (q − p) × r ≠ 0, + print('parallel') + else: + t = np.cross((q - p), s) / np.cross(r, s) + u = np.cross((q - p), r) / 
np.cross(r, s) + if 0 <= t <= 1 and 0 <= u <= 1: + # If r × s ≠ 0 and 0 ≤ t ≤ 1 and 0 ≤ u ≤ 1, the two line segments meet at the point p + t r = q + u s + print('intersection: ', p + t*r) + else: + print('not parallel and not intersect') + + +def point2segment(): + p = np.array([-1, 1]) # coordination of point + a = np.array([0, 1]) # coordination of line segment end 1 + b = np.array([1, 0]) # coordination of line segment end 2 + ab = b-a # line ab + ap = p-a + distance = np.abs(np.cross(ab, ap)/np.linalg.norm(ab)) # d = (AB x AC)/|AB| + print(distance) + + # angle Cos(θ) = A dot B /(|A||B|) + bp = p-b + cosTheta1 = np.dot(ap, ab) / (np.linalg.norm(ap) * np.linalg.norm(ab)) + theta1 = np.arccos(cosTheta1) + cosTheta2 = np.dot(bp, ab) / (np.linalg.norm(bp) * np.linalg.norm(ab)) + theta2 = np.arccos(cosTheta2) + if np.pi/2 <= (theta1 % (np.pi*2)) <= 3/2 * np.pi: + print('out of a') + elif -np.pi/2 <= (theta2 % (np.pi*2)) <= np.pi/2: + print('out of b') + else: + print('between a and b') + + + +if __name__ == '__main__': + point2segment() + # intersection() diff --git a/experiments/Robot_arm/A3C.py b/experiments/Robot_arm/A3C.py new file mode 100644 index 0000000..89150db --- /dev/null +++ b/experiments/Robot_arm/A3C.py @@ -0,0 +1,214 @@ +""" +Environment is a Robot Arm. The arm tries to get to the blue point. +The environment will return a geographic (distance) information for the arm to learn. + +The far away from blue point the less reward; touch blue r+=1; stop at blue for a while then get r=+10. + +You can train this RL by using LOAD = False, after training, this model will be store in the a local folder. +Using LOAD = True to reload the trained model for playing. + +You can customize this script in a way you want. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +tensorflow >= 1.0.1 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +from arm_env import ArmEnv + + +# np.random.seed(1) +# tf.set_random_seed(1) + +MAX_GLOBAL_EP = 2000 +MAX_EP_STEP = 300 +UPDATE_GLOBAL_ITER = 5 +N_WORKERS = multiprocessing.cpu_count() +LR_A = 1e-4 # learning rate for actor +LR_C = 2e-4 # learning rate for critic +GAMMA = 0.9 # reward discount +MODE = ['easy', 'hard'] +n_model = 1 +GLOBAL_NET_SCOPE = 'Global_Net' +ENTROPY_BETA = 0.01 +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + + +env = ArmEnv(mode=MODE[n_model]) +N_S = env.state_dim +N_A = env.action_dim +A_BOUND = env.action_bound +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + self.test = sigma[0] + mu, sigma = mu * A_BOUND[1], sigma + 1e-5 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + 
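                # The policy over the arm's continuous actions is a diagonal Gaussian built from
                # mu and sigma above. The a_loss block that follows maximizes
                # log_prob(a) * TD-advantage plus an ENTROPY_BETA-weighted entropy bonus, and the
                # resulting gradients are pushed onto the *global* network in the 'push' ops below.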
with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.contrib.layers.xavier_initializer() + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='la') + l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='lc') + l_c = tf.layers.dense(l_c, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc2') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net + return t + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + s = s[np.newaxis, :] + return SESS.run(self.A, {self.s: s})[0] + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = ArmEnv(mode=MODE[n_model]) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + for ep_t in range(MAX_EP_STEP): + if self.name == 'W_0': + self.env.render() + a = self.AC.choose_action(s) + s_, r, done = self.env.step(a) + if ep_t == MAX_EP_STEP - 1: done = True + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: 
buffer_v_target, + } + test = self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + '| Var:', test, + + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + diff --git a/experiments/Robot_arm/DDPG.py b/experiments/Robot_arm/DDPG.py new file mode 100644 index 0000000..0eb1b8a --- /dev/null +++ b/experiments/Robot_arm/DDPG.py @@ -0,0 +1,277 @@ +""" +Environment is a Robot Arm. The arm tries to get to the blue point. +The environment will return a geographic (distance) information for the arm to learn. + +The far away from blue point the less reward; touch blue r+=1; stop at blue for a while then get r=+10. + +You can train this RL by using LOAD = False, after training, this model will be store in the a local folder. +Using LOAD = True to reload the trained model for playing. + +You can customize this script in a way you want. 
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +tensorflow >= 1.0.1 +""" + +import tensorflow as tf +import numpy as np +import os +import shutil +from arm_env import ArmEnv + + +np.random.seed(1) +tf.set_random_seed(1) + +MAX_EPISODES = 600 +MAX_EP_STEPS = 200 +LR_A = 1e-4 # learning rate for actor +LR_C = 1e-4 # learning rate for critic +GAMMA = 0.999 # reward discount +REPLACE_ITER_A = 1100 +REPLACE_ITER_C = 1000 +MEMORY_CAPACITY = 10000 +BATCH_SIZE = 16 +VAR_MIN = 0.1 +RENDER = True +LOAD = False +MODE = ['easy', 'hard'] +n_model = 1 + +env = ArmEnv(mode=MODE[n_model]) +STATE_DIM = env.state_dim +ACTION_DIM = env.action_dim +ACTION_BOUND = env.action_bound + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a') +with tf.name_scope('R'): + R = tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_') + + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.001) + net = tf.layers.dense(s, 200, activation=tf.nn.relu6, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', + trainable=trainable) + net = tf.layers.dense(net, 200, activation=tf.nn.relu6, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + net = tf.layers.dense(net, 10, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l3', + trainable=trainable) + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE) # (- learning rate) for ascent policy, div to take mean + self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params)) + + +class 
Critic(object): + def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + with tf.variable_scope('a_grad'): + self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.01) + + with tf.variable_scope('l1'): + n_l1 = 200 + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + net = tf.layers.dense(net, 200, activation=tf.nn.relu6, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + net = tf.layers.dense(net, 10, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l3', + trainable=trainable) + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_): + self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + +class Memory(object): + def __init__(self, capacity, dims): + self.capacity = capacity + self.data = np.zeros((capacity, dims)) + self.pointer = 0 + + def store_transition(self, s, a, r, s_): + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % self.capacity # replace the old memory with new memory + self.data[index, :] = transition + self.pointer += 1 + + def sample(self, n): + assert self.pointer >= self.capacity, 'Memory has not been fulfilled' + indices = np.random.choice(self.capacity, size=n) + return self.data[indices, :] + + +sess = tf.Session() + +# Create actor and critic. 
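+# The wiring below implements the deterministic policy gradient update: the critic exposes
+# a_grads = dQ(s, a)/da (tf.gradients(self.q, A)), and actor.add_grad_to_graph feeds it into
+# tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads), which by the chain rule yields
+# dQ/dtheta = dQ/da * da/dtheta for the actor's parameters. Because the actor's optimizer is
+# built with a negative learning rate (-LR_A / BATCH_SIZE), applying these "gradients"
+# performs gradient ascent on Q, averaged over the batch.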
+actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A) +critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1) + +saver = tf.train.Saver() +path = './'+MODE[n_model] + +if LOAD: + saver.restore(sess, tf.train.latest_checkpoint(path)) +else: + sess.run(tf.global_variables_initializer()) + + +def train(): + var = 2. # control exploration + + for ep in range(MAX_EPISODES): + s = env.reset() + ep_reward = 0 + + for t in range(MAX_EP_STEPS): + # while True: + if RENDER: + env.render() + + # Added exploration noise + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration + s_, r, done = env.step(a) + M.store_transition(s, a, r, s_) + + if M.pointer > MEMORY_CAPACITY: + var = max([var*.99995, VAR_MIN]) # decay the action randomness + b_M = M.sample(BATCH_SIZE) + b_s = b_M[:, :STATE_DIM] + b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM] + b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM] + b_s_ = b_M[:, -STATE_DIM:] + + critic.learn(b_s, b_a, b_r, b_s_) + actor.learn(b_s, b_a) + + s = s_ + ep_reward += r + + if t == MAX_EP_STEPS-1 or done: + # if done: + result = '| done' if done else '| ----' + print('Ep:', ep, + result, + '| R: %i' % int(ep_reward), + '| Explore: %.2f' % var, + ) + break + + if os.path.isdir(path): shutil.rmtree(path) + os.mkdir(path) + ckpt_path = os.path.join('./'+MODE[n_model], 'DDPG.ckpt') + save_path = saver.save(sess, ckpt_path, write_meta_graph=False) + print("\nSave Model %s\n" % save_path) + + +def eval(): + env.set_fps(30) + s = env.reset() + while True: + if RENDER: + env.render() + a = actor.choose_action(s) + s_, r, done = env.step(a) + s = s_ + +if __name__ == '__main__': + if LOAD: + eval() + else: + train() \ No newline at end of file diff --git a/experiments/Robot_arm/arm_env.py b/experiments/Robot_arm/arm_env.py new file mode 100644 index 0000000..a0eb0fd --- /dev/null +++ b/experiments/Robot_arm/arm_env.py @@ -0,0 +1,218 @@ +""" +Environment for Robot Arm. +You can customize this script in a way you want. 
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +""" +import numpy as np +import pyglet + + +pyglet.clock.set_fps_limit(10000) + + +class ArmEnv(object): + action_bound = [-1, 1] + action_dim = 2 + state_dim = 7 + dt = .1 # refresh rate + arm1l = 100 + arm2l = 100 + viewer = None + viewer_xy = (400, 400) + get_point = False + mouse_in = np.array([False]) + point_l = 15 + grab_counter = 0 + + def __init__(self, mode='easy'): + # node1 (l, d_rad, x, y), + # node2 (l, d_rad, x, y) + self.mode = mode + self.arm_info = np.zeros((2, 4)) + self.arm_info[0, 0] = self.arm1l + self.arm_info[1, 0] = self.arm2l + self.point_info = np.array([250, 303]) + self.point_info_init = self.point_info.copy() + self.center_coord = np.array(self.viewer_xy)/2 + + def step(self, action): + # action = (node1 angular v, node2 angular v) + action = np.clip(action, *self.action_bound) + self.arm_info[:, 1] += action * self.dt + self.arm_info[:, 1] %= np.pi * 2 + + arm1rad = self.arm_info[0, 1] + arm2rad = self.arm_info[1, 1] + arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)]) + arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)]) + self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1) + self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2) + + s, arm2_distance = self._get_state() + r = self._r_func(arm2_distance) + + return s, r, self.get_point + + def reset(self): + self.get_point = False + self.grab_counter = 0 + + if self.mode == 'hard': + pxy = np.clip(np.random.rand(2) * self.viewer_xy[0], 100, 300) + self.point_info[:] = pxy + else: + arm1rad, arm2rad = np.random.rand(2) * np.pi * 2 + self.arm_info[0, 1] = arm1rad + self.arm_info[1, 1] = arm2rad + arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)]) + arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)]) + self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1) + self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2) + + self.point_info[:] = self.point_info_init + return self._get_state()[0] + + def render(self): + if self.viewer is None: + self.viewer = Viewer(*self.viewer_xy, self.arm_info, self.point_info, self.point_l, self.mouse_in) + self.viewer.render() + + def sample_action(self): + return np.random.uniform(*self.action_bound, size=self.action_dim) + + def set_fps(self, fps=30): + pyglet.clock.set_fps_limit(fps) + + def _get_state(self): + # return the distance (dx, dy) between arm finger point with blue point + arm_end = self.arm_info[:, 2:4] + t_arms = np.ravel(arm_end - self.point_info) + center_dis = (self.center_coord - self.point_info)/200 + in_point = 1 if self.grab_counter > 0 else 0 + return np.hstack([in_point, t_arms/200, center_dis, + # arm1_distance_p, arm1_distance_b, + ]), t_arms[-2:] + + def _r_func(self, distance): + t = 50 + abs_distance = np.sqrt(np.sum(np.square(distance))) + r = -abs_distance/200 + if abs_distance < self.point_l and (not self.get_point): + r += 1. + self.grab_counter += 1 + if self.grab_counter > t: + r += 10. 
+ self.get_point = True + elif abs_distance > self.point_l: + self.grab_counter = 0 + self.get_point = False + return r + + +class Viewer(pyglet.window.Window): + color = { + 'background': [1]*3 + [1] + } + fps_display = pyglet.clock.ClockDisplay() + bar_thc = 5 + + def __init__(self, width, height, arm_info, point_info, point_l, mouse_in): + super(Viewer, self).__init__(width, height, resizable=False, caption='Arm', vsync=False) # vsync=False to not use the monitor FPS + self.set_location(x=80, y=10) + pyglet.gl.glClearColor(*self.color['background']) + + self.arm_info = arm_info + self.point_info = point_info + self.mouse_in = mouse_in + self.point_l = point_l + + self.center_coord = np.array((min(width, height)/2, ) * 2) + self.batch = pyglet.graphics.Batch() + + arm1_box, arm2_box, point_box = [0]*8, [0]*8, [0]*8 + c1, c2, c3 = (249, 86, 86)*4, (86, 109, 249)*4, (249, 39, 65)*4 + self.point = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', point_box), ('c3B', c2)) + self.arm1 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm1_box), ('c3B', c1)) + self.arm2 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm2_box), ('c3B', c1)) + + def render(self): + pyglet.clock.tick() + self._update_arm() + self.switch_to() + self.dispatch_events() + self.dispatch_event('on_draw') + self.flip() + + def on_draw(self): + self.clear() + self.batch.draw() + # self.fps_display.draw() + + def _update_arm(self): + point_l = self.point_l + point_box = (self.point_info[0] - point_l, self.point_info[1] - point_l, + self.point_info[0] + point_l, self.point_info[1] - point_l, + self.point_info[0] + point_l, self.point_info[1] + point_l, + self.point_info[0] - point_l, self.point_info[1] + point_l) + self.point.vertices = point_box + + arm1_coord = (*self.center_coord, *(self.arm_info[0, 2:4])) # (x0, y0, x1, y1) + arm2_coord = (*(self.arm_info[0, 2:4]), *(self.arm_info[1, 2:4])) # (x1, y1, x2, y2) + arm1_thick_rad = np.pi / 2 - self.arm_info[0, 1] + x01, y01 = arm1_coord[0] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] + np.sin( + arm1_thick_rad) * self.bar_thc + x02, y02 = arm1_coord[0] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] - np.sin( + arm1_thick_rad) * self.bar_thc + x11, y11 = arm1_coord[2] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] - np.sin( + arm1_thick_rad) * self.bar_thc + x12, y12 = arm1_coord[2] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] + np.sin( + arm1_thick_rad) * self.bar_thc + arm1_box = (x01, y01, x02, y02, x11, y11, x12, y12) + arm2_thick_rad = np.pi / 2 - self.arm_info[1, 1] + x11_, y11_ = arm2_coord[0] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] - np.sin( + arm2_thick_rad) * self.bar_thc + x12_, y12_ = arm2_coord[0] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] + np.sin( + arm2_thick_rad) * self.bar_thc + x21, y21 = arm2_coord[2] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] + np.sin( + arm2_thick_rad) * self.bar_thc + x22, y22 = arm2_coord[2] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] - np.sin( + arm2_thick_rad) * self.bar_thc + arm2_box = (x11_, y11_, x12_, y12_, x21, y21, x22, y22) + self.arm1.vertices = arm1_box + self.arm2.vertices = arm2_box + + def on_key_press(self, symbol, modifiers): + if symbol == pyglet.window.key.UP: + self.arm_info[0, 1] += .1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.DOWN: + self.arm_info[0, 1] -= .1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.LEFT: + self.arm_info[1, 1] += 
.1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.RIGHT: + self.arm_info[1, 1] -= .1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.Q: + pyglet.clock.set_fps_limit(1000) + elif symbol == pyglet.window.key.A: + pyglet.clock.set_fps_limit(30) + + def on_mouse_motion(self, x, y, dx, dy): + self.point_info[:] = [x, y] + + def on_mouse_enter(self, x, y): + self.mouse_in[0] = True + + def on_mouse_leave(self, x, y): + self.mouse_in[0] = False + + + diff --git a/experiments/Solve_BipedalWalker/A3C.py b/experiments/Solve_BipedalWalker/A3C.py new file mode 100644 index 0000000..7f4bc45 --- /dev/null +++ b/experiments/Solve_BipedalWalker/A3C.py @@ -0,0 +1,209 @@ +""" +Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning. + +The BipedalWalker example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil + + +GAME = 'BipedalWalker-v2' +OUTPUT_GRAPH = False +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 8000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 10 +GAMMA = 0.999 +ENTROPY_BETA = 0.005 +LR_A = 0.00002 # learning rate for actor +LR_C = 0.0001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + self.test = sigma[0] + mu, sigma = mu * A_BOUND[1], sigma + 1e-5 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = 
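+# Each Worker below owns its own gym environment and its own local ACNet; only GLOBAL_AC's
+# parameters are ever trained directly. The worker threads run Worker.work() concurrently:
+# every UPDATE_GLOBAL_ITER steps (or at episode end) a worker pushes its local gradients to
+# the global net through OPT_A/OPT_C.apply_gradients (the 'push' ops) and then copies the
+# fresh global weights back with the 'pull' assign ops.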
[l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.contrib.layers.xavier_initializer() + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='la') + l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='lc') + l_c = tf.layers.dense(l_c, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc2') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net + return t + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + s = s[np.newaxis, :] + return SESS.run(self.A, {self.s: s})[0] + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + while True: + if self.name == 'W_0' and total_step % 30 == 0: + self.env.render() + a = self.AC.choose_action(s) + s_, r, done, info = self.env.step(a) + if r == -100: r = -2 + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + } + test = self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------' + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + achieve, + "| Pos: %i" % self.env.unwrapped.hull.position[0], + "| RR: %.1f" % GLOBAL_RUNNING_R[-1], + '| EpR: %.1f' % ep_r, + '| var:', test, + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + 
for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + diff --git a/experiments/Solve_BipedalWalker/A3C_rnn.py b/experiments/Solve_BipedalWalker/A3C_rnn.py new file mode 100644 index 0000000..acdc951 --- /dev/null +++ b/experiments/Solve_BipedalWalker/A3C_rnn.py @@ -0,0 +1,235 @@ +""" +Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning. + +The BipedalWalker example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil + + +GAME = 'BipedalWalker-v2' +OUTPUT_GRAPH = False +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 8000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 10 +GAMMA = 0.99 +ENTROPY_BETA = 0.005 +LR_A = 0.00001 # learning rate for actor +LR_C = 0.0001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net(N_A) + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net(N_A) + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + self.test = sigma[0] + mu, sigma = mu * A_BOUND[1], sigma + 1e-5 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1]) + + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in + zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in + zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + 
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self, n_a): + w_init = tf.random_normal_initializer(0., .01) + with tf.variable_scope('critic'): # only critic controls the rnn update + cell_size = 128 + s = tf.expand_dims(self.s, axis=1, + name='timely_input') # [time_step, feature] => [time_step, batch, feature] + rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size) + self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32) + outputs, self.final_state = tf.nn.dynamic_rnn( + cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True) + cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation + l_c = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + + with tf.variable_scope('actor'): # state representation is based on critic + cell_out = tf.stop_gradient(cell_out, name='c_cell_out') # from what critic think it is + l_a = tf.layers.dense(cell_out, 400, tf.nn.relu6, kernel_initializer=w_init, name='la') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') # restrict variance + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net + return t + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s, cell_state): # run by a local + s = s[np.newaxis, :] + a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state}) + return a[0], cell_state + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning + keep_state = rnn_state.copy() # keep rnn state for updating global net + while True: + if self.name == 'W_0' and total_step % 30 == 0: + self.env.render() + + a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state + s_, r, done, info = self.env.step(a) + if r == -100: r = -2 + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[ + 0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack( + buffer_v_target) + + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + self.AC.init_state: keep_state, + } + + test = self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + keep_state = 
rnn_state_.copy() # replace the keep_state as the new initial rnn state_ + + s = s_ + rnn_state = rnn_state_ # renew rnn state + total_step += 1 + + if done: + achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------' + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + achieve, + "| Pos: %i" % self.env.unwrapped.hull.position[0], + "| RR: %.1f" % GLOBAL_RUNNING_R[-1], + '| EpR: %.1f' % ep_r, + '| var:', test, + ) + GLOBAL_EP += 1 + break + + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA', decay=0.95) + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC', decay=0.95) + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + t = threading.Thread(target=worker.work) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) diff --git a/experiments/Solve_BipedalWalker/DDPG.py b/experiments/Solve_BipedalWalker/DDPG.py new file mode 100644 index 0000000..9f0a824 --- /dev/null +++ b/experiments/Solve_BipedalWalker/DDPG.py @@ -0,0 +1,390 @@ +import tensorflow as tf +import numpy as np +import gym +import os +import shutil + +np.random.seed(1) +tf.set_random_seed(1) + +MAX_EPISODES = 2000 +LR_A = 0.0005 # learning rate for actor +LR_C = 0.0005 # learning rate for critic +GAMMA = 0.999 # reward discount +REPLACE_ITER_A = 1700 +REPLACE_ITER_C = 1500 +MEMORY_CAPACITY = 200000 +BATCH_SIZE = 32 +DISPLAY_THRESHOLD = 100 # display until the running reward > 100 +DATA_PATH = './data' +LOAD_MODEL = False +SAVE_MODEL_ITER = 100000 +RENDER = False +OUTPUT_GRAPH = False +ENV_NAME = 'BipedalWalker-v2' + +GLOBAL_STEP = tf.Variable(0, trainable=False) +INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1)) +LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True) +LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True) +END_POINT = (200 - 10) * (14/30) # from game + +env = gym.make(ENV_NAME) +env.seed(1) + +STATE_DIM = env.observation_space.shape[0] # 24 +ACTION_DIM = env.action_space.shape[0] # 4 +ACTION_BOUND = env.action_space.high # [1, 1, 1, 1] + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a') +with tf.name_scope('R'): + R = tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_') + +############################### Actor #################################### + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, 
scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.01) + init_b = tf.constant_initializer(0.01) + net = tf.layers.dense(s, 500, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable) + net = tf.layers.dense(net, 200, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable) + + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + bias_initializer=init_b, name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + # ys = policy; + # xs = policy's parameters; + # self.a_grads = the gradients of the policy to get more Q + # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams + self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.AdamOptimizer(-self.lr/BATCH_SIZE) # (- learning rate) for ascent policy + self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params), global_step=GLOBAL_STEP) + + +############################### Critic #################################### + +class Critic(object): + def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('abs_TD'): + self.abs_td = tf.abs(self.target_q - self.q) + self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP) + + with tf.variable_scope('a_grad'): + 
self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.01) + init_b = tf.constant_initializer(0.01) + + with tf.variable_scope('l1'): + n_l1 = 700 + # combine the action and states together in this way + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + with tf.variable_scope('l2'): + net = tf.layers.dense(net, 20, activation=tf.nn.relu, kernel_initializer=init_w, + bias_initializer=init_b, name='l2', trainable=trainable) + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_, ISW): + _, abs_td = self.sess.run([self.train_op, self.abs_td], feed_dict={S: s, A: a, R: r, S_: s_, self.ISWeights: ISW}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + return abs_td + + +class SumTree(object): + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/SumTree.py + + Story the data with it priority in tree and data frameworks. + """ + data_pointer = 0 + + def __init__(self, capacity): + self.capacity = capacity # for all priority values + self.tree = np.zeros(2 * capacity - 1)+1e-5 + # [--------------Parent nodes-------------][-------leaves to recode priority-------] + # size: capacity - 1 size: capacity + self.data = np.zeros(capacity, dtype=object) # for all transitions + # [--------------data frame-------------] + # size: capacity + + def add_new_priority(self, p, data): + leaf_idx = self.data_pointer + self.capacity - 1 + + self.data[self.data_pointer] = data # update data_frame + self.update(leaf_idx, p) # update tree_frame + self.data_pointer += 1 + if self.data_pointer >= self.capacity: # replace when exceed the capacity + self.data_pointer = 0 + + def update(self, tree_idx, p): + change = p - self.tree[tree_idx] + + self.tree[tree_idx] = p + self._propagate_change(tree_idx, change) + + def _propagate_change(self, tree_idx, change): + """change the sum of priority value in all parent nodes""" + parent_idx = (tree_idx - 1) // 2 + self.tree[parent_idx] += change + if parent_idx != 0: + self._propagate_change(parent_idx, change) + + def get_leaf(self, lower_bound): + leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound + data_idx = leaf_idx - self.capacity + 1 + return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]] + + def _retrieve(self, lower_bound, parent_idx=0): + """ + Tree structure and array storage: + + Tree index: + 0 -> storing priority sum + / \ + 1 2 + / \ / \ + 3 4 5 6 -> storing priority for transitions + + Array type for storing: + [0,1,2,3,4,5,6] + """ + left_child_idx = 2 * parent_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= len(self.tree): # end search when no more child + return parent_idx + + if self.tree[left_child_idx] == self.tree[right_child_idx]: + return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx])) + if 
lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node + return self._retrieve(lower_bound, left_child_idx) + else: + return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx) + + @property + def root_priority(self): + return self.tree[0] # the root + + +class Memory(object): # stored as ( s, a, r, s_ ) in SumTree + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py + """ + epsilon = 0.001 # small amount to avoid zero priority + alpha = 0.6 # [0~1] convert the importance of TD error to priority + beta = 0.4 # importance-sampling, from initial value increasing to 1 + beta_increment_per_sampling = 1e-5 # annealing the bias + abs_err_upper = 1 # for stability refer to paper + + def __init__(self, capacity): + self.tree = SumTree(capacity) + + def store(self, error, transition): + p = self._get_priority(error) + self.tree.add_new_priority(p, transition) + + def prio_sample(self, n): + batch_idx, batch_memory, ISWeights = [], [], [] + segment = self.tree.root_priority / n + self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1 + + min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority + maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights + for i in range(n): + a = segment * i + b = segment * (i + 1) + lower_bound = np.random.uniform(a, b) + while True: + idx, p, data = self.tree.get_leaf(lower_bound) + if type(data) is int: + i -= 1 + lower_bound = np.random.uniform(segment * i, segment * (i+1)) + else: + break + prob = p / self.tree.root_priority + ISWeights.append(self.tree.capacity * prob) + batch_idx.append(idx) + batch_memory.append(data) + + ISWeights = np.vstack(ISWeights) + ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize + return batch_idx, np.vstack(batch_memory), ISWeights + + def random_sample(self, n): + idx = np.random.randint(0, self.tree.capacity, size=n, dtype=np.int) + return np.vstack(self.tree.data[idx]) + + def update(self, idx, error): + p = self._get_priority(error) + self.tree.update(idx, p) + + def _get_priority(self, error): + error += self.epsilon # avoid 0 + clipped_error = np.clip(error, 0, self.abs_err_upper) + return np.power(clipped_error, self.alpha) + + +sess = tf.Session() + +# Create actor and critic. +actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A) +critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +M = Memory(MEMORY_CAPACITY) + +saver = tf.train.Saver(max_to_keep=100) + +if LOAD_MODEL: + all_ckpt = tf.train.get_checkpoint_state('./data', 'checkpoint').all_model_checkpoint_paths + saver.restore(sess, all_ckpt[-1]) +else: + if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH) + os.mkdir(DATA_PATH) + sess.run(tf.global_variables_initializer()) + +if OUTPUT_GRAPH: + tf.summary.FileWriter('logs', graph=sess.graph) + +var = 3 # control exploration +var_min = 0.01 + +for i_episode in range(MAX_EPISODES): + # s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.) 
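+# Note on the prioritized replay used in this loop: new transitions are stored with the
+# current maximum leaf priority so they tend to be replayed soon, and Memory.prio_sample
+# splits the total priority mass into BATCH_SIZE equal segments and draws one leaf per
+# segment, approximating sampling with probability P(i) = p_i / sum_k p_k. The returned
+# ISWeights, (capacity * P(i)) ** (-beta) normalized by the largest weight, rescale the
+# critic's TD loss to correct the bias of non-uniform sampling (beta is annealed toward 1).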
+ s = env.reset() + ep_r = 0 + while True: + if RENDER: + env.render() + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), -1, 1) # add randomness to action selection for exploration + s_, r, done, _ = env.step(a) # r = total 300+ points up to the far end. If the robot falls, it gets -100. + + if r == -100: r = -2 + ep_r += r + + transition = np.hstack((s, a, [r], s_)) + max_p = np.max(M.tree.tree[-M.tree.capacity:]) + M.store(max_p, transition) + + if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY/20: + var = max([var*0.9999, var_min]) # decay the action randomness + tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE) # for critic update + b_s = b_M[:, :STATE_DIM] + b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM] + b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM] + b_s_ = b_M[:, -STATE_DIM:] + + abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights) + actor.learn(b_s, b_a) + for i in range(len(tree_idx)): # update priority + idx = tree_idx[i] + M.update(idx, abs_td[i]) + if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0: + ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt') + save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False) + print("\nSave Model %s\n" % save_path) + + if done: + if "running_r" not in globals(): + running_r = ep_r + else: + running_r = 0.95*running_r + 0.05*ep_r + if running_r > DISPLAY_THRESHOLD: RENDER = True + else: RENDER = False + + done = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----' + print('Episode:', i_episode, + done, + '| Running_r: %i' % int(running_r), + '| Epi_r: %.2f' % ep_r, + '| Exploration: %.3f' % var, + '| Pos: %.i' % int(env.unwrapped.hull.position[0]), + '| LR_A: %.6f' % sess.run(LR_A), + '| LR_C: %.6f' % sess.run(LR_C), + ) + break + + s = s_ + sess.run(INCREASE_GS) \ No newline at end of file diff --git a/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan b/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan new file mode 100644 index 0000000..7746ab0 Binary files /dev/null and b/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan differ diff --git a/experiments/Solve_LunarLander/A3C.py b/experiments/Solve_LunarLander/A3C.py new file mode 100644 index 0000000..a57a547 --- /dev/null +++ b/experiments/Solve_LunarLander/A3C.py @@ -0,0 +1,224 @@ +""" +Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning. + +The LunarLander example. This is a difficult environment, so convergence is not guaranteed; this code converges only slowly, if at all.
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + + +GAME = 'LunarLander-v2' +OUTPUT_GRAPH = False +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 5000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 5 +GAMMA = 0.99 +ENTROPY_BETA = 0.001 # not useful in this case +LR_A = 0.0005 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.n +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net(N_A) + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.int32, [None, ], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + self.a_prob, self.v = self._build_net(N_A) + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('a_loss'): + log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True) + exp_v = log_prob * td + entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob), axis=1, keep_dims=True) # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self, n_a): + w_init = tf.random_normal_initializer(0., .01) + with tf.variable_scope('critic'): + cell_size = 64 + s = tf.expand_dims(self.s, axis=1, + name='timely_input') # [time_step, feature] => [time_step, batch, feature] + rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size) + self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32) + outputs, self.final_state = tf.nn.dynamic_rnn( + cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True) + cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation + l_c = tf.layers.dense(cell_out, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + with 
tf.variable_scope('actor'): + cell_out = tf.stop_gradient(cell_out, name='c_cell_out') + l_a = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='la') + a_prob = tf.layers.dense(l_a, n_a, tf.nn.softmax, kernel_initializer=w_init, name='ap') + + return a_prob, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s, cell_state): # run by a local + prob_weights, cell_state = SESS.run([self.a_prob, self.final_state], feed_dict={self.s: s[np.newaxis, :], + self.init_state: cell_state}) + action = np.random.choice(range(prob_weights.shape[1]), + p=prob_weights.ravel()) # select action w.r.t the actions prob + return action, cell_state + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + r_scale = 100 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + ep_t = 0 + rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning + keep_state = rnn_state.copy() # keep rnn state for updating global net + while True: + # if self.name == 'W_0' and total_step % 10 == 0: + # self.env.render() + a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state + s_, r, done, info = self.env.step(a) + if r == -100: r = -10 + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r/r_scale) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0,0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + self.AC.init_state: keep_state, + } + + self.AC.update_global(feed_dict) + + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_ + + s = s_ + total_step += 1 + rnn_state = rnn_state_ # renew rnn state + ep_t += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r) + if not self.env.unwrapped.lander.awake: solve = '| Landed' + else: solve = '| ------' + print( + self.name, + "Ep:", GLOBAL_EP, + solve, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + 
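+# The v_target computation in Worker.work above builds bootstrapped n-step returns by
+# scanning the reward buffer backwards: starting from v_s_ = V(s_) at the cut point (or 0 at
+# a terminal state), each step does v_s_ = r + GAMMA * v_s_. For example (illustrative
+# numbers only), with buffer_r = [1, 0, 2], GAMMA = 0.99 and bootstrap value 5, the backward
+# scan produces 2 + 0.99*5, then 0 + 0.99*(2 + 0.99*5), then 1 + 0.99*(0 + 0.99*(2 + 0.99*5)),
+# and the final reverse() lines buffer_v_target up with buffer_s and buffer_a again.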
if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() diff --git a/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py b/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py new file mode 100644 index 0000000..3d6ed1b --- /dev/null +++ b/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py @@ -0,0 +1,307 @@ +""" +The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class SumTree(object): + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/SumTree.py + + Story the data with it priority in tree and data frameworks. + """ + data_pointer = 0 + + def __init__(self, capacity): + self.capacity = capacity # for all priority values + self.tree = np.zeros(2 * capacity - 1) + # [--------------Parent nodes-------------][-------leaves to recode priority-------] + # size: capacity - 1 size: capacity + self.data = np.zeros(capacity, dtype=object) # for all transitions + # [--------------data frame-------------] + # size: capacity + + def add_new_priority(self, p, data): + leaf_idx = self.data_pointer + self.capacity - 1 + + self.data[self.data_pointer] = data # update data_frame + self.update(leaf_idx, p) # update tree_frame + self.data_pointer += 1 + if self.data_pointer >= self.capacity: # replace when exceed the capacity + self.data_pointer = 0 + + def update(self, tree_idx, p): + change = p - self.tree[tree_idx] + + self.tree[tree_idx] = p + self._propagate_change(tree_idx, change) + + def _propagate_change(self, tree_idx, change): + """change the sum of priority value in all parent nodes""" + parent_idx = (tree_idx - 1) // 2 + self.tree[parent_idx] += change + if parent_idx != 0: + self._propagate_change(parent_idx, change) + + def get_leaf(self, lower_bound): + leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound + data_idx = leaf_idx - self.capacity + 1 + return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]] + + def _retrieve(self, lower_bound, parent_idx=0): + """ + Tree structure and array storage: + + Tree index: + 0 -> storing priority sum + / \ + 1 2 + / \ / \ + 3 4 5 6 -> storing priority for transitions + + Array type for storing: + [0,1,2,3,4,5,6] + """ + left_child_idx = 2 * parent_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= len(self.tree): # end search when no more child + return parent_idx + + if self.tree[left_child_idx] == self.tree[right_child_idx]: + return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx])) + if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node + return self._retrieve(lower_bound, left_child_idx) + else: + return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx) + + @property + def root_priority(self): + return self.tree[0] # the root + + +class Memory(object): # stored as ( s, a, r, s_ ) 
in SumTree + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py + """ + epsilon = 0.001 # small amount to avoid zero priority + alpha = 0.6 # [0~1] convert the importance of TD error to priority + beta = 0.4 # importance-sampling, from initial value increasing to 1 + beta_increment_per_sampling = 1e-4 # annealing the bias + abs_err_upper = 1 # for stability refer to paper + + def __init__(self, capacity): + self.tree = SumTree(capacity) + + def store(self, error, transition): + p = self._get_priority(error) + self.tree.add_new_priority(p, transition) + + def sample(self, n): + batch_idx, batch_memory, ISWeights = [], [], [] + segment = self.tree.root_priority / n + self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1 + + min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority + maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights + for i in range(n): + a = segment * i + b = segment * (i + 1) + lower_bound = np.random.uniform(a, b) + idx, p, data = self.tree.get_leaf(lower_bound) + prob = p / self.tree.root_priority + ISWeights.append(self.tree.capacity * prob) + batch_idx.append(idx) + batch_memory.append(data) + + ISWeights = np.vstack(ISWeights) + ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize + return batch_idx, np.vstack(batch_memory), ISWeights + + def update(self, idx, error): + p = self._get_priority(error) + self.tree.update(idx, p) + + def _get_priority(self, error): + error += self.epsilon # avoid 0 + clipped_error = np.clip(error, 0, self.abs_err_upper) + return np.power(clipped_error, self.alpha) + + +class DuelingDQNPrioritizedReplay: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.005, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=500, + memory_size=10000, + batch_size=32, + e_greedy_increment=None, + hidden=[100, 50], + output_graph=False, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.hidden = hidden + self.epsilon_increment = e_greedy_increment + self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max + + self.learn_step_counter = 0 + self._build_net() + self.memory = Memory(capacity=memory_size) + + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, w_initializer, b_initializer): + for i, h in enumerate(self.hidden): + if i == 0: + in_units, out_units, inputs = self.n_features, self.hidden[i], s + else: + in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l + with tf.variable_scope('l%i' % i): + w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names) + b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names) + l = tf.nn.relu(tf.matmul(inputs, w) + b) + + with tf.variable_scope('Value'): + w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names) + b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names) + self.V = tf.matmul(l, 
+class DuelingDQNPrioritizedReplay:
+    def __init__(
+            self,
+            n_actions,
+            n_features,
+            learning_rate=0.005,
+            reward_decay=0.9,
+            e_greedy=0.9,
+            replace_target_iter=500,
+            memory_size=10000,
+            batch_size=32,
+            e_greedy_increment=None,
+            hidden=[100, 50],
+            output_graph=False,
+            sess=None,
+    ):
+        self.n_actions = n_actions
+        self.n_features = n_features
+        self.lr = learning_rate
+        self.gamma = reward_decay
+        self.epsilon_max = e_greedy
+        self.replace_target_iter = replace_target_iter
+        self.memory_size = memory_size
+        self.batch_size = batch_size
+        self.hidden = hidden
+        self.epsilon_increment = e_greedy_increment
+        self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max
+
+        self.learn_step_counter = 0
+        self._build_net()
+        self.memory = Memory(capacity=memory_size)
+
+        if sess is None:
+            self.sess = tf.Session()
+            self.sess.run(tf.global_variables_initializer())
+        else:
+            self.sess = sess
+
+        if output_graph:
+            tf.summary.FileWriter("logs/", self.sess.graph)
+
+        self.cost_his = []
+
+    def _build_net(self):
+        def build_layers(s, c_names, w_initializer, b_initializer):
+            for i, h in enumerate(self.hidden):
+                if i == 0:
+                    in_units, out_units, inputs = self.n_features, self.hidden[i], s
+                else:
+                    in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l
+                with tf.variable_scope('l%i' % i):
+                    w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names)
+                    b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names)
+                    l = tf.nn.relu(tf.matmul(inputs, w) + b)
+
+            with tf.variable_scope('Value'):
+                w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names)
+                b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names)
+                self.V = tf.matmul(l, w) + b
+
+            with tf.variable_scope('Advantage'):
+                w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
+                b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+                self.A = tf.matmul(l, w) + b
+
+            with tf.variable_scope('Q'):
+                out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))  # Q = V(s) + (A(s,a) - mean_a A(s,a))
+
+            # single-stream (non-dueling) head, kept for reference:
+            # with tf.variable_scope('out'):
+            #     w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
+            #     b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+            #     out = tf.matmul(l, w) + b
+            return out
+
+        # ------------------ build evaluate_net ------------------
+        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
+        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
+        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
+        with tf.variable_scope('eval_net'):
+            c_names, w_initializer, b_initializer = \
+                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], \
+                tf.random_normal_initializer(0., 0.01), tf.constant_initializer(0.01)  # config of layers
+
+            self.q_eval = build_layers(self.s, c_names, w_initializer, b_initializer)
+
+        with tf.variable_scope('loss'):
+            self.abs_errors = tf.abs(tf.reduce_sum(self.q_target - self.q_eval, axis=1))  # per-sample TD error, used to update the SumTree priorities
+            self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
+
+        with tf.variable_scope('train'):
+            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
+
+        # ------------------ build target_net ------------------
+        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
+        with tf.variable_scope('target_net'):
+            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+            self.q_next = build_layers(self.s_, c_names, w_initializer, b_initializer)
+
+    def store_transition(self, s, a, r, s_):
+        transition = np.hstack((s, [a, r], s_))
+        max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])  # give new transitions the current max priority
+        self.memory.store(max_p, transition)
+
+    def choose_action(self, observation):
+        observation = observation[np.newaxis, :]
+        if np.random.uniform() < self.epsilon:
+            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+            action = np.argmax(actions_value)
+        else:
+            action = np.random.randint(0, self.n_actions)
+        return action
+
+    def _replace_target_params(self):
+        t_params = tf.get_collection('target_net_params')
+        e_params = tf.get_collection('eval_net_params')
+        self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+    def learn(self):
+        if self.learn_step_counter % self.replace_target_iter == 0:
+            self._replace_target_params()
+
+        tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
+
+        # double DQN
+        q_next, q_eval4next = self.sess.run(
+            [self.q_next, self.q_eval],
+            feed_dict={self.s_: batch_memory[:, -self.n_features:],   # target net on next observation
+                       self.s: batch_memory[:, -self.n_features:]})   # eval net on next observation (selects Double DQN actions)
+        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
+
+        q_target = q_eval.copy()
+
+        batch_index = np.arange(self.batch_size, dtype=np.int32)
+        eval_act_index = batch_memory[:, self.n_features].astype(int)
+        reward = batch_memory[:, self.n_features + 1]
+        max_act4next = np.argmax(q_eval4next, axis=1)  # greedy actions w.r.t. the eval (online) net
+        selected_q_next = q_next[batch_index, max_act4next]  # Double DQN: target net evaluates the eval net's greedy actions
+
+        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
+
+        # natural DQN target (kept for reference):
+        # q_next, q_eval = self.sess.run(
+        #     [self.q_next, self.q_eval],
+        #     feed_dict={self.s_: batch_memory[:, -self.n_features:],
+        #                self.s: batch_memory[:, :self.n_features]})
+        #
+        # q_target = q_eval.copy()
+        # batch_index = np.arange(self.batch_size, dtype=np.int32)
+        # eval_act_index = batch_memory[:, self.n_features].astype(int)
+        # reward = batch_memory[:, self.n_features + 1]
+        #
+        # q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
+        _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
+                                                 feed_dict={self.s: batch_memory[:, :self.n_features],
+                                                            self.q_target: q_target,
+                                                            self.ISWeights: ISWeights})
+        for i in range(len(tree_idx)):  # update priorities with the new absolute TD errors
+            idx = tree_idx[i]
+            self.memory.update(idx, abs_errors[i])
+
+        self.cost_his.append(self.cost)
+
+        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+        self.learn_step_counter += 1
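# ---------------------------------------------------------------------------
# Editor's sketch (not part of either file): the Double DQN target used in
# learn() above, computed on a tiny made-up batch with plain NumPy.  The
# online (eval) net picks the next action, the target net evaluates it, which
# is what decouples action selection from value estimation.
import numpy as np

gamma = 0.99
q_eval_next = np.array([[1.0, 3.0, 2.0],      # online net on s'  -> picks action 1
                        [0.5, 0.2, 0.9]])     #                   -> picks action 2
q_target_next = np.array([[1.5, 2.0, 4.0],    # target net on s'  -> evaluates those picks
                          [0.3, 0.1, 0.7]])
rewards = np.array([1.0, -1.0])
taken_actions = np.array([0, 2])              # actions stored in the replay memory

batch_index = np.arange(2)
a_max = q_eval_next.argmax(axis=1)                        # [1, 2]
selected_q_next = q_target_next[batch_index, a_max]       # [2.0, 0.7]
td_target = rewards + gamma * selected_q_next             # [2.98, -0.307]

# Only the Q-value of the action actually taken is pushed toward td_target;
# the other entries of q_target stay equal to q_eval, so their error is 0.
q_eval = np.array([[0.9, 1.1, 0.2],
                   [0.4, 0.6, 0.8]])
q_target = q_eval.copy()
q_target[batch_index, taken_actions] = td_target
print(q_target.round(3))
# ---------------------------------------------------------------------------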
diff --git a/experiments/Solve_LunarLander/run_LunarLander.py b/experiments/Solve_LunarLander/run_LunarLander.py
new file mode 100644
index 0000000..b286109
--- /dev/null
+++ b/experiments/Solve_LunarLander/run_LunarLander.py
@@ -0,0 +1,68 @@
+"""
+Deep Q network,
+
+LunarLander-v2 example
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+
+import gym
+from gym import wrappers
+from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay
+
+env = gym.make('LunarLander-v2')
+# env = env.unwrapped
+env.seed(1)
+
+N_A = env.action_space.n
+N_S = env.observation_space.shape[0]
+MEMORY_CAPACITY = 50000
+TARGET_REP_ITER = 2000
+MAX_EPISODES = 900
+E_GREEDY = 0.95
+E_INCREMENT = 0.00001
+GAMMA = 0.99
+LR = 0.0001
+BATCH_SIZE = 32
+HIDDEN = [400, 400]
+RENDER = True
+
+RL = DuelingDQNPrioritizedReplay(
+    n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA,
+    hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER,
+    memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,)
+
+
+total_steps = 0
+running_r = 0
+r_scale = 100
+for i_episode in range(MAX_EPISODES):
+    s = env.reset()  # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
+    ep_r = 0
+    while True:
+        if total_steps > MEMORY_CAPACITY: env.render()
+        a = RL.choose_action(s)
+        s_, r, done, _ = env.step(a)
+        if r == -100: r = -30
+        r /= r_scale
+
+        ep_r += r
+        RL.store_transition(s, a, r, s_)
+        if total_steps > MEMORY_CAPACITY:
+            RL.learn()
+        if done:
+            land = '| Landed' if r == 100/r_scale else '| ------'
+            running_r = 0.99 * running_r + 0.01 * ep_r
+            print('Epi: ', i_episode,
+                  land,
+                  '| Epi_R: ', round(ep_r, 2),
+                  '| Running_R: ', round(running_r, 2),
+                  '| Epsilon: ', round(RL.epsilon, 3))
+            break
+
+        s = s_
+        total_steps += 1
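# ---------------------------------------------------------------------------
# Editor's sketch (not part of run_LunarLander.py): the same driver pattern on
# CartPole-v0 with a random policy standing in for the agent, to isolate the
# loop structure -- fill the replay memory first, only then start learning and
# rendering, and keep an exponential running reward for logging.  The
# environment name, WARMUP size and the random "agent" are illustrative
# choices only, not part of the original experiment.
import gym
import numpy as np

env = gym.make('CartPole-v0')
WARMUP = 500                                  # stands in for MEMORY_CAPACITY above
total_steps, running_r = 0, 0.0

for i_episode in range(20):
    s = env.reset()
    ep_r = 0.0
    while True:
        a = env.action_space.sample()         # RL.choose_action(s) in the real script
        s_, r, done, _ = env.step(a)
        ep_r += r
        # RL.store_transition(s, a, r, s_) would go here
        if total_steps > WARMUP:
            pass                              # RL.learn() would go here
        if done:
            running_r = 0.99 * running_r + 0.01 * ep_r
            print('Epi:', i_episode, '| Epi_R:', round(ep_r, 2), '| Running_R:', round(running_r, 2))
            break
        s = s_
        total_steps += 1
# ---------------------------------------------------------------------------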