diff --git a/README.md b/README.md new file mode 100644 index 0000000..5f17bfe --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +

+ + + +

+ +--- + +
+ +# Reinforcement Learning Methods and Tutorials + +These reinforcement learning tutorials cover everything from the basic RL algorithms to advanced algorithms developed in recent years. + +**For Chinese speakers, visit [莫烦 Python](https://morvanzhou.github.io/tutorials/) or my [YouTube channel](https://www.youtube.com/channel/UCdyjiB5H8Pu7aDTNVXTTpcg) for more.** + +**By popular request, English versions of these tutorials are available in this playlist:** ([https://www.youtube.com/playlist?list=PLXO45tsB95cIplu-fLMpUEEZTwrDNh6Ba](https://www.youtube.com/playlist?list=PLXO45tsB95cIplu-fLMpUEEZTwrDNh6Ba)) + + +* [Simple entry example](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/1_command_line_reinforcement_learning) +* Tabular Methods + * [Q-learning](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/2_Q_Learning_maze) + * [Sarsa](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/3_Sarsa_maze) + * [Sarsa(lambda)](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/4_Sarsa_lambda_maze) +* Function Approximation (DQN) + * [Deep Q Network](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5_Deep_Q_Network) +* [Using OpenAI Gym](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/6_OpenAI_gym) +* DQN-based methods + * [Double DQN](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.1_Double_DQN) + * [DQN with Prioritized Experience Replay](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.2_Prioritized_Replay_DQN) + * [Dueling DQN](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/5.3_Dueling_DQN) +* [Policy Gradients](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/7_Policy_gradient_softmax) +* [Actor Critic](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/8_Actor_Critic_Advantage) + * [Deep Deterministic Policy Gradient](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/9_Deep_Deterministic_Policy_Gradient_DDPG) + * [A3C](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/10_A3C) +* Model-based RL (WIP) + * [Dyna-Q](https://github.com/MorvanZhou/tutorials/tree/master/Reinforcement_learning_TUT/11_Dyna_Q) + + +# Donation + +*If these tutorials help you, please consider donating to support better tutorials. Any contribution is greatly appreciated!* + +
+ + Paypal +
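The tabular tutorials listed in the README above (Q-learning, Sarsa, Sarsa(lambda), Dyna-Q) all revolve around the same one-line value update. The sketch below is not part of this commit; it is a minimal, illustrative version of the Q-learning and Sarsa updates, using a plain dict in place of the pandas `DataFrame` the tutorial code uses, to show the off-policy vs. on-policy difference in the bootstrap target.

```python
# Minimal tabular update sketch (illustrative only, not repo code).
import random
from collections import defaultdict

ACTIONS = ['left', 'right']
ALPHA, GAMMA, EPSILON = 0.1, 0.9, 0.9   # learning rate, discount factor, greediness

# Q(s, a) estimates, initialized lazily to 0 for unseen states
q_table = defaultdict(lambda: {a: 0.0 for a in ACTIONS})

def choose_action(state):
    # epsilon-greedy: exploit the current estimate most of the time, explore otherwise
    if random.random() < EPSILON:
        return max(q_table[state], key=q_table[state].get)
    return random.choice(ACTIONS)

def q_learning_update(s, a, r, s_, terminal):
    # off-policy target: bootstrap from the best action in the next state
    target = r if terminal else r + GAMMA * max(q_table[s_].values())
    q_table[s][a] += ALPHA * (target - q_table[s][a])

def sarsa_update(s, a, r, s_, a_, terminal):
    # on-policy target: bootstrap from the action actually taken next
    target = r if terminal else r + GAMMA * q_table[s_][a_]
    q_table[s][a] += ALPHA * (target - q_table[s][a])
```

The only difference between the two methods is the target term: Q-learning maximizes over next actions, while Sarsa follows the action its own policy actually picked, which is why the Sarsa agent in the maze tutorials behaves more cautiously near punishment.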
diff --git a/RL_cover.jpg b/RL_cover.jpg new file mode 100644 index 0000000..8a47adc Binary files /dev/null and b/RL_cover.jpg differ diff --git a/contents/10_A3C/A3C_RNN.py b/contents/10_A3C/A3C_RNN.py new file mode 100644 index 0000000..82ea6bb --- /dev/null +++ b/contents/10_A3C/A3C_RNN.py @@ -0,0 +1,230 @@ +""" +Asynchronous Advantage Actor Critic (A3C) + RNN with continuous action space, Reinforcement Learning. + +The Pendulum example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + +GAME = 'Pendulum-v0' +OUTPUT_GRAPH = True +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_EP_STEP = 400 +MAX_GLOBAL_EP = 800 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 5 +GAMMA = 0.9 +ENTROPY_BETA = 0.01 +LR_A = 0.0001 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + mu, sigma = mu * A_BOUND[1], sigma + 1e-4 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1]) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.random_normal_initializer(0., .1) + with tf.variable_scope('critic'): # only 
critic controls the rnn update + cell_size = 32 + s = tf.expand_dims(self.s, axis=1, + name='timely_input') # [time_step, feature] => [time_step, batch, feature] + rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size) + self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32) + outputs, self.final_state = tf.nn.dynamic_rnn( + cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True) + cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation + l_c = tf.layers.dense(cell_out, 50, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + + with tf.variable_scope('actor'): # state representation is based on critic + cell_out = tf.stop_gradient(cell_out, name='c_cell_out') # from what critic think it is + l_a = tf.layers.dense(cell_out, 80, tf.nn.relu6, kernel_initializer=w_init, name='la') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s, cell_state): # run by a local + s = s[np.newaxis, :] + a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state}) + return a[0], cell_state + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME).unwrapped + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning + keep_state = rnn_state.copy() # keep rnn state for updating global net + for ep_t in range(MAX_EP_STEP): + if self.name == 'W_0': + self.env.render() + + a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state + s_, r, done, info = self.env.step(a) + done = True if ep_t == MAX_EP_STEP - 1 else False + r /= 10 # normalize reward + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + self.AC.init_state: keep_state, + } + + self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_ + + s = s_ + rnn_state = rnn_state_ # renew rnn state + total_step += 1 + + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.9 * 
GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() + diff --git a/contents/10_A3C/A3C_continuous_action.py b/contents/10_A3C/A3C_continuous_action.py new file mode 100644 index 0000000..4cd534a --- /dev/null +++ b/contents/10_A3C/A3C_continuous_action.py @@ -0,0 +1,210 @@ +""" +Asynchronous Advantage Actor Critic (A3C) with continuous action space, Reinforcement Learning. + +The Pendulum example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + +GAME = 'Pendulum-v0' +OUTPUT_GRAPH = True +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_EP_STEP = 400 +MAX_GLOBAL_EP = 800 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 5 +GAMMA = 0.9 +ENTROPY_BETA = 0.01 +LR_A = 0.0001 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + mu, sigma = mu * A_BOUND[1], sigma + 1e-4 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = 
tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1]) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self ): + w_init = tf.random_normal_initializer(0., .1) + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + s = s[np.newaxis, :] + return SESS.run(self.A, {self.s: s})[0] + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME).unwrapped + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + for ep_t in range(MAX_EP_STEP): + if self.name == 'W_0': + self.env.render() + a = self.AC.choose_action(s) + s_, r, done, info = self.env.step(a) + done = True if ep_t == MAX_EP_STEP - 1 else False + r /= 10 # normalize reward + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + } + self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + 
SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() + diff --git a/contents/10_A3C/A3C_discrete_action.py b/contents/10_A3C/A3C_discrete_action.py new file mode 100644 index 0000000..f17352a --- /dev/null +++ b/contents/10_A3C/A3C_discrete_action.py @@ -0,0 +1,201 @@ +""" +Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning. + +The Cartpole example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + + +GAME = 'CartPole-v0' +OUTPUT_GRAPH = True +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 1000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 20 +GAMMA = 0.9 +ENTROPY_BETA = 0.001 +LR_A = 0.001 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.n + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.int32, [None, ], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + self.a_prob, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('a_loss'): + log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True) + exp_v = log_prob * td + entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob), axis=1, keep_dims=True) # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with 
tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.random_normal_initializer(0., .1) + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la') + a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return a_prob, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + prob_weights = SESS.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]}) + action = np.random.choice(range(prob_weights.shape[1]), + p=prob_weights.ravel()) # select action w.r.t the actions prob + return action + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME).unwrapped + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + while True: + if self.name == 'W_0': + self.env.render() + a = self.AC.choose_action(s) + s_, r, done, info = self.env.step(a) + if done: r = -5 + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + } + self.AC.update_global(feed_dict) + + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if 
os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() diff --git a/contents/11_Dyna_Q/RL_brain.py b/contents/11_Dyna_Q/RL_brain.py new file mode 100644 index 0000000..f4be936 --- /dev/null +++ b/contents/11_Dyna_Q/RL_brain.py @@ -0,0 +1,79 @@ +""" +This part of code is the Dyna-Q learning brain, which is a brain of the agent. +All decisions and learning processes are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class QLearningTable: + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = actions # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + self.q_table = pd.DataFrame(columns=self.actions) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.uniform() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, s, a, r, s_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) + + +class EnvModel: + """Similar to the memory buffer in DQN, you can store past experiences in here. + Alternatively, the model can generate next state and reward signal accurately.""" + def __init__(self, actions): + # the simplest case is to think about the model is a memory which has all past transition information + self.actions = actions + self.database = pd.DataFrame(columns=actions, dtype=np.object) + + def store_transition(self, s, a, r, s_): + if s not in self.database.index: + self.database = self.database.append( + pd.Series( + [None] * len(self.actions), + index=self.database.columns, + name=s, + )) + self.database.set_value(s, a, (r, s_)) + + def sample_s_a(self): + s = np.random.choice(self.database.index) + a = np.random.choice(self.database.ix[s].dropna().index) # filter out the None value + return s, a + + def get_r_s_(self, s, a): + r, s_ = self.database.ix[s, a] + return r, s_ diff --git a/contents/11_Dyna_Q/maze_env.py b/contents/11_Dyna_Q/maze_env.py new file mode 100644 index 0000000..5ec5370 --- /dev/null +++ b/contents/11_Dyna_Q/maze_env.py @@ -0,0 +1,129 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk, object): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + # time.sleep(0.1) + self.update() + + diff --git a/contents/11_Dyna_Q/run_this.py b/contents/11_Dyna_Q/run_this.py new file mode 100644 index 0000000..d784bc4 --- /dev/null +++ b/contents/11_Dyna_Q/run_this.py @@ -0,0 +1,51 @@ +""" +Simplest model-based RL, Dyna-Q. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the main part which controls the update method of this example. +The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +from maze_env import Maze +from RL_brain import QLearningTable, EnvModel + + +def update(): + for episode in range(40): + s = env.reset() + while True: + env.render() + a = RL.choose_action(str(s)) + s_, r, done = env.step(a) + RL.learn(str(s), a, r, str(s_)) + + # use a model to output (r, s_) by inputting (s, a) + # the model in dyna Q version is just like a memory replay buffer + env_model.store_transition(str(s), a, r, s_) + for n in range(10): # learn 10 more times using the env_model + ms, ma = env_model.sample_s_a() # ms in here is a str + mr, ms_ = env_model.get_r_s_(ms, ma) + RL.learn(ms, ma, mr, str(ms_)) + + s = s_ + if done: + break + + # end of game + print('game over') + env.destroy() + + +if __name__ == "__main__": + env = Maze() + RL = QLearningTable(actions=list(range(env.n_actions))) + env_model = EnvModel(actions=list(range(env.n_actions))) + + env.after(0, update) + env.mainloop() \ No newline at end of file diff --git a/contents/1_command_line_reinforcement_learning/treasure_on_right.py b/contents/1_command_line_reinforcement_learning/treasure_on_right.py new file mode 100644 index 0000000..5970860 --- /dev/null +++ b/contents/1_command_line_reinforcement_learning/treasure_on_right.py @@ -0,0 +1,107 @@ +""" +A simple example for Reinforcement Learning using table lookup Q-learning method. +An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location. +Run this program and to see how the agent will improve its strategy of finding the treasure. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd +import time + +np.random.seed(2) # reproducible + + +N_STATES = 6 # the length of the 1 dimensional world +ACTIONS = ['left', 'right'] # available actions +EPSILON = 0.9 # greedy police +ALPHA = 0.1 # learning rate +GAMMA = 0.9 # discount factor +MAX_EPISODES = 13 # maximum episodes +FRESH_TIME = 0.3 # fresh time for one move + + +def build_q_table(n_states, actions): + table = pd.DataFrame( + np.zeros((n_states, len(actions))), # q_table initial values + columns=actions, # actions's name + ) + # print(table) # show table + return table + + +def choose_action(state, q_table): + # This is how to choose an action + state_actions = q_table.iloc[state, :] + if (np.random.uniform() > EPSILON) or (state_actions.all() == 0): # act non-greedy or state-action have no value + action_name = np.random.choice(ACTIONS) + else: # act greedy + action_name = state_actions.argmax() + return action_name + + +def get_env_feedback(S, A): + # This is how agent will interact with the environment + if A == 'right': # move right + if S == N_STATES - 2: # terminate + S_ = 'terminal' + R = 1 + else: + S_ = S + 1 + R = 0 + else: # move left + R = 0 + if S == 0: + S_ = S # reach the wall + else: + S_ = S - 1 + return S_, R + + +def update_env(S, episode, step_counter): + # This is how environment be updated + env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment + if S == 'terminal': + interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter) + print('\r{}'.format(interaction), end='') + time.sleep(2) + print('\r ', end='') + else: + env_list[S] = 'o' + interaction = ''.join(env_list) + print('\r{}'.format(interaction), end='') + time.sleep(FRESH_TIME) + + +def rl(): + # main part of RL loop + q_table = build_q_table(N_STATES, ACTIONS) + for episode in range(MAX_EPISODES): + step_counter = 0 + S = 0 + 
is_terminated = False + update_env(S, episode, step_counter) + while not is_terminated: + + A = choose_action(S, q_table) + S_, R = get_env_feedback(S, A) # take action & get next state and reward + q_predict = q_table.ix[S, A] + if S_ != 'terminal': + q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal + else: + q_target = R # next state is terminal + is_terminated = True # terminate this episode + + q_table.ix[S, A] += ALPHA * (q_target - q_predict) # update + S = S_ # move to next state + + update_env(S, episode, step_counter+1) + step_counter += 1 + return q_table + + +if __name__ == "__main__": + q_table = rl() + print('\r\nQ-table:\n') + print(q_table) diff --git a/contents/2_Q_Learning_maze/RL_brain.py b/contents/2_Q_Learning_maze/RL_brain.py new file mode 100644 index 0000000..844c475 --- /dev/null +++ b/contents/2_Q_Learning_maze/RL_brain.py @@ -0,0 +1,51 @@ +""" +This part of code is the Q learning brain, which is a brain of the agent. +All decisions are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class QLearningTable: + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = actions # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + self.q_table = pd.DataFrame(columns=self.actions) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.uniform() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, s, a, r, s_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) \ No newline at end of file diff --git a/contents/2_Q_Learning_maze/maze_env.py b/contents/2_Q_Learning_maze/maze_env.py new file mode 100644 index 0000000..d7b8b0a --- /dev/null +++ b/contents/2_Q_Learning_maze/maze_env.py @@ -0,0 +1,129 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk, object): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + time.sleep(0.1) + self.update() + + diff --git a/contents/2_Q_Learning_maze/run_this.py b/contents/2_Q_Learning_maze/run_this.py new file mode 100644 index 0000000..f817d1e --- /dev/null +++ b/contents/2_Q_Learning_maze/run_this.py @@ -0,0 +1,53 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the main part which controls the update method of this example. +The RL is in RL_brain.py. 
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +from maze_env import Maze +from RL_brain import QLearningTable + + +def update(): + for episode in range(100): + # initial observation + observation = env.reset() + + while True: + # fresh env + env.render() + + # RL choose action based on observation + action = RL.choose_action(str(observation)) + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + # RL learn from this transition + RL.learn(str(observation), action, reward, str(observation_)) + + # swap observation + observation = observation_ + + # break while loop when end of this episode + if done: + break + + # end of game + print('game over') + env.destroy() + +if __name__ == "__main__": + env = Maze() + RL = QLearningTable(actions=list(range(env.n_actions))) + + env.after(100, update) + env.mainloop() \ No newline at end of file diff --git a/contents/3_Sarsa_maze/RL_brain.py b/contents/3_Sarsa_maze/RL_brain.py new file mode 100644 index 0000000..3b8b5da --- /dev/null +++ b/contents/3_Sarsa_maze/RL_brain.py @@ -0,0 +1,77 @@ +""" +This part of code is the Q learning brain, which is a brain of the agent. +All decisions are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class RL(object): + def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = action_space # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + + self.q_table = pd.DataFrame(columns=self.actions) + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.rand() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, *args): + pass + + +# off-policy +class QLearningTable(RL): + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + super(QLearningTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) + + def learn(self, s, a, r, s_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update + + +# on-policy +class SarsaTable(RL): + + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) + + def learn(self, s, a, r, s_, a_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, a_] # next state is not terminal + else: + q_target = r # next state is terminal + self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update diff --git a/contents/3_Sarsa_maze/maze_env.py b/contents/3_Sarsa_maze/maze_env.py new file mode 100644 index 
0000000..fc31521 --- /dev/null +++ b/contents/3_Sarsa_maze/maze_env.py @@ -0,0 +1,130 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. +The RL is in RL_brain.py. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + time.sleep(0.1) + self.update() + + diff --git a/contents/3_Sarsa_maze/run_this.py b/contents/3_Sarsa_maze/run_this.py new file mode 100644 index 0000000..fc2bd1a --- /dev/null +++ b/contents/3_Sarsa_maze/run_this.py @@ -0,0 +1,52 @@ +""" +Sarsa is a online updating method for Reinforcement learning. 
+ +Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory. + +You will see the sarsa is more coward when punishment is close because it cares about all behaviours, +while q learning is more brave because it only cares about maximum behaviour. +""" + +from maze_env import Maze +from RL_brain import SarsaTable + + +def update(): + for episode in range(100): + # initial observation + observation = env.reset() + + # RL choose action based on observation + action = RL.choose_action(str(observation)) + + while True: + # fresh env + env.render() + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + # RL choose action based on next observation + action_ = RL.choose_action(str(observation_)) + + # RL learn from this transition (s, a, r, s, a) ==> Sarsa + RL.learn(str(observation), action, reward, str(observation_), action_) + + # swap observation and action + observation = observation_ + action = action_ + + # break while loop when end of this episode + if done: + break + + # end of game + print('game over') + env.destroy() + +if __name__ == "__main__": + env = Maze() + RL = SarsaTable(actions=list(range(env.n_actions))) + + env.after(100, update) + env.mainloop() \ No newline at end of file diff --git a/contents/4_Sarsa_lambda_maze/RL_brain.py b/contents/4_Sarsa_lambda_maze/RL_brain.py new file mode 100644 index 0000000..6ad65a9 --- /dev/null +++ b/contents/4_Sarsa_lambda_maze/RL_brain.py @@ -0,0 +1,93 @@ +""" +This part of code is the Q learning brain, which is a brain of the agent. +All decisions are made in here. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + +import numpy as np +import pandas as pd + + +class RL(object): + def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): + self.actions = action_space # a list + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon = e_greedy + + self.q_table = pd.DataFrame(columns=self.actions) + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + self.q_table = self.q_table.append( + pd.Series( + [0]*len(self.actions), + index=self.q_table.columns, + name=state, + ) + ) + + def choose_action(self, observation): + self.check_state_exist(observation) + # action selection + if np.random.rand() < self.epsilon: + # choose best action + state_action = self.q_table.ix[observation, :] + state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value + action = state_action.argmax() + else: + # choose random action + action = np.random.choice(self.actions) + return action + + def learn(self, *args): + pass + + +# backward eligibility traces +class SarsaLambdaTable(RL): + def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9): + super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) + + # backward view, eligibility trace. 
+ self.lambda_ = trace_decay + self.eligibility_trace = self.q_table.copy() + + def check_state_exist(self, state): + if state not in self.q_table.index: + # append new state to q table + to_be_append = pd.Series( + [0] * len(self.actions), + index=self.q_table.columns, + name=state, + ) + self.q_table = self.q_table.append(to_be_append) + + # also update eligibility trace + self.eligibility_trace = self.eligibility_trace.append(to_be_append) + + def learn(self, s, a, r, s_, a_): + self.check_state_exist(s_) + q_predict = self.q_table.ix[s, a] + if s_ != 'terminal': + q_target = r + self.gamma * self.q_table.ix[s_, a_] # next state is not terminal + else: + q_target = r # next state is terminal + error = q_target - q_predict + + # increase trace amount for visited state-action pair + + # Method 1: + # self.eligibility_trace.ix[s, a] += 1 + + # Method 2: + self.eligibility_trace.ix[s, :] *= 0 + self.eligibility_trace.ix[s, a] = 1 + + # Q update + self.q_table += self.lr * error * self.eligibility_trace + + # decay eligibility trace after update + self.eligibility_trace *= self.gamma*self.lambda_ diff --git a/contents/4_Sarsa_lambda_maze/maze_env.py b/contents/4_Sarsa_lambda_maze/maze_env.py new file mode 100644 index 0000000..9fe6acb --- /dev/null +++ b/contents/4_Sarsa_lambda_maze/maze_env.py @@ -0,0 +1,130 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. +The RL is in RL_brain.py. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +np.random.seed(1) +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + hell2_center = origin + np.array([UNIT, UNIT * 2]) + self.hell2 = self.canvas.create_rectangle( + hell2_center[0] - 15, hell2_center[1] - 15, + hell2_center[0] + 15, hell2_center[1] + 15, + fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + 
origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return self.canvas.coords(self.rect) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + s_ = self.canvas.coords(self.rect) # next state + + # reward function + if s_ == self.canvas.coords(self.oval): + reward = 1 + done = True + elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: + reward = -1 + done = True + else: + reward = 0 + done = False + + return s_, reward, done + + def render(self): + time.sleep(0.05) + self.update() + + diff --git a/contents/4_Sarsa_lambda_maze/run_this.py b/contents/4_Sarsa_lambda_maze/run_this.py new file mode 100644 index 0000000..a0c5afc --- /dev/null +++ b/contents/4_Sarsa_lambda_maze/run_this.py @@ -0,0 +1,52 @@ +""" +Sarsa is a online updating method for Reinforcement learning. + +Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory. + +You will see the sarsa is more coward when punishment is close because it cares about all behaviours, +while q learning is more brave because it only cares about maximum behaviour. +""" + +from maze_env import Maze +from RL_brain import SarsaLambdaTable + + +def update(): + for episode in range(100): + # initial observation + observation = env.reset() + + # RL choose action based on observation + action = RL.choose_action(str(observation)) + + while True: + # fresh env + env.render() + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + # RL choose action based on next observation + action_ = RL.choose_action(str(observation_)) + + # RL learn from this transition (s, a, r, s, a) ==> Sarsa + RL.learn(str(observation), action, reward, str(observation_), action_) + + # swap observation and action + observation = observation_ + action = action_ + + # break while loop when end of this episode + if done: + break + + # end of game + print('game over') + env.destroy() + +if __name__ == "__main__": + env = Maze() + RL = SarsaLambdaTable(actions=list(range(env.n_actions))) + + env.after(100, update) + env.mainloop() \ No newline at end of file diff --git a/contents/5.1_Double_DQN/RL_brain.py b/contents/5.1_Double_DQN/RL_brain.py new file mode 100644 index 0000000..15053eb --- /dev/null +++ b/contents/5.1_Double_DQN/RL_brain.py @@ -0,0 +1,163 @@ +""" +The double DQN based on this paper: https://arxiv.org/abs/1509.06461 + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class DoubleDQN: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.005, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=200, + memory_size=3000, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + double_q=True, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = 
memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + self.double_q = double_q # decide to use double q or not + + self.learn_step_counter = 0 + self.memory = np.zeros((self.memory_size, n_features*2+2)) + self._build_net() + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + return out + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + + with tf.variable_scope('eval_net'): + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + observation = observation[np.newaxis, :] + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + + if not hasattr(self, 'q'): # record action value it gets + self.q = [] + self.running_q = 0 + self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value) + self.q.append(self.running_q) + + if np.random.uniform() > self.epsilon: # choosing action + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + if self.memory_counter > self.memory_size: + sample_index = 
np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval4next = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation + self.s: batch_memory[:, -self.n_features:]}) # next observation + q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]}) + + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + if self.double_q: + max_act4next = np.argmax(q_eval4next, axis=1) # the action that brings the highest value is evaluated by q_eval + selected_q_next = q_next[batch_index, max_act4next] # Double DQN, select q_next depending on above actions + else: + selected_q_next = np.max(q_next, axis=1) # the natural DQN + + q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next + + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + + + diff --git a/contents/5.1_Double_DQN/run_Pendulum.py b/contents/5.1_Double_DQN/run_Pendulum.py new file mode 100644 index 0000000..d60a362 --- /dev/null +++ b/contents/5.1_Double_DQN/run_Pendulum.py @@ -0,0 +1,77 @@ +""" +Double DQN & Natural DQN comparison, +The Pendulum example. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DoubleDQN +import numpy as np +import matplotlib.pyplot as plt +import tensorflow as tf + + +env = gym.make('Pendulum-v0') +env = env.unwrapped +env.seed(1) +MEMORY_SIZE = 3000 +ACTION_SPACE = 11 + +sess = tf.Session() +with tf.variable_scope('Natural_DQN'): + natural_DQN = DoubleDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, double_q=False, sess=sess + ) + +with tf.variable_scope('Double_DQN'): + double_DQN = DoubleDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, double_q=True, sess=sess, output_graph=True) + +sess.run(tf.global_variables_initializer()) + + +def train(RL): + total_steps = 0 + observation = env.reset() + while True: + # if total_steps - MEMORY_SIZE > 8000: env.render() + + action = RL.choose_action(observation) + + f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # convert to [-2 ~ 2] float actions + observation_, reward, done, info = env.step(np.array([f_action])) + + reward /= 10 # normalize to a range of (-1, 0). r = 0 when get upright + # the Q target at upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0 + # so when Q at this state is greater than 0, the agent overestimates the Q. Please refer to the final result. 
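The split between action selection and action evaluation that learn() implements above is the core of Double DQN; a small NumPy sketch of the two targets, with illustrative array names:

import numpy as np

def dqn_targets(reward, gamma, q_next, q_eval_next):
    # q_next:      target-net Q values at s_, shape (batch, n_actions)
    # q_eval_next: online-net Q values at s_, same shape
    batch_index = np.arange(q_next.shape[0])
    a_star = np.argmax(q_eval_next, axis=1)                        # online net selects the action
    double_target = reward + gamma * q_next[batch_index, a_star]   # target net evaluates it
    natural_target = reward + gamma * q_next.max(axis=1)           # natural DQN: max over one net
    return double_target, natural_target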
+ + RL.store_transition(observation, action, reward, observation_) + + if total_steps > MEMORY_SIZE: # learning + RL.learn() + + if total_steps - MEMORY_SIZE > 20000: # stop game + break + + observation = observation_ + total_steps += 1 + return RL.q + +q_natural = train(natural_DQN) +q_double = train(double_DQN) + +plt.plot(np.array(q_natural), c='r', label='natural') +plt.plot(np.array(q_double), c='b', label='double') +plt.legend(loc='best') +plt.ylabel('Q eval') +plt.xlabel('training steps') +plt.grid() +plt.show() diff --git a/contents/5.2_Prioritized_Replay_DQN/RL_brain.py b/contents/5.2_Prioritized_Replay_DQN/RL_brain.py new file mode 100644 index 0000000..27d0e50 --- /dev/null +++ b/contents/5.2_Prioritized_Replay_DQN/RL_brain.py @@ -0,0 +1,300 @@ +""" +The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class SumTree(object): + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/SumTree.py + + Story the data with it priority in tree and data frameworks. + """ + data_pointer = 0 + + def __init__(self, capacity): + self.capacity = capacity # for all priority values + self.tree = np.zeros(2*capacity - 1) + # [--------------Parent nodes-------------][-------leaves to recode priority-------] + # size: capacity - 1 size: capacity + self.data = np.zeros(capacity, dtype=object) # for all transitions + # [--------------data frame-------------] + # size: capacity + + def add_new_priority(self, p, data): + leaf_idx = self.data_pointer + self.capacity - 1 + + self.data[self.data_pointer] = data # update data_frame + self.update(leaf_idx, p) # update tree_frame + + self.data_pointer += 1 + if self.data_pointer >= self.capacity: # replace when exceed the capacity + self.data_pointer = 0 + + def update(self, tree_idx, p): + change = p - self.tree[tree_idx] + + self.tree[tree_idx] = p + self._propagate_change(tree_idx, change) + + def _propagate_change(self, tree_idx, change): + """change the sum of priority value in all parent nodes""" + parent_idx = (tree_idx - 1) // 2 + self.tree[parent_idx] += change + if parent_idx != 0: + self._propagate_change(parent_idx, change) + + def get_leaf(self, lower_bound): + leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound + data_idx = leaf_idx - self.capacity + 1 + return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]] + + def _retrieve(self, lower_bound, parent_idx=0): + """ + Tree structure and array storage: + + Tree index: + 0 -> storing priority sum + / \ + 1 2 + / \ / \ + 3 4 5 6 -> storing priority for transitions + + Array type for storing: + [0,1,2,3,4,5,6] + """ + left_child_idx = 2 * parent_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= len(self.tree): # end search when no more child + return parent_idx + + if self.tree[left_child_idx] == self.tree[right_child_idx]: + return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx])) + if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node + return self._retrieve(lower_bound, left_child_idx) + else: + return self._retrieve(lower_bound-self.tree[left_child_idx], right_child_idx) + + @property + def root_priority(self): + return self.tree[0] # 
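The proportional sampling this SumTree supports can be illustrated without the tree at all; a flat NumPy sketch of the same segment-based scheme (the tree only makes each lookup O(log n)), with illustrative names:

import numpy as np

def sample_proportional(priorities, n):
    # Split the total priority mass into n equal segments and draw one uniform
    # point per segment, so transition i is picked roughly in proportion to
    # priorities[i] / priorities.sum().
    cumsum = np.cumsum(priorities)
    segment = cumsum[-1] / n
    points = np.random.uniform(np.arange(n) * segment, (np.arange(n) + 1) * segment)
    return np.searchsorted(cumsum, points)

print(sample_proportional(np.array([1.0, 1.0, 8.0]), 4))   # most draws land on index 2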
the root + + +class Memory(object): # stored as ( s, a, r, s_ ) in SumTree + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py + """ + epsilon = 0.01 # small amount to avoid zero priority + alpha = 0.6 # [0~1] convert the importance of TD error to priority + beta = 0.4 # importance-sampling, from initial value increasing to 1 + beta_increment_per_sampling = 0.001 + abs_err_upper = 1 # clipped abs error + + def __init__(self, capacity): + self.tree = SumTree(capacity) + + def store(self, error, transition): + p = self._get_priority(error) + self.tree.add_new_priority(p, transition) + + def sample(self, n): + batch_idx, batch_memory, ISWeights = [], [], [] + segment = self.tree.root_priority / n + self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1 + + min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority + maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights + for i in range(n): + a = segment * i + b = segment * (i + 1) + lower_bound = np.random.uniform(a, b) + idx, p, data = self.tree.get_leaf(lower_bound) + prob = p / self.tree.root_priority + ISWeights.append(self.tree.capacity * prob) + batch_idx.append(idx) + batch_memory.append(data) + + ISWeights = np.vstack(ISWeights) + ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize + return batch_idx, np.vstack(batch_memory), ISWeights + + def update(self, idx, error): + p = self._get_priority(error) + self.tree.update(idx, p) + + def _get_priority(self, error): + error += self.epsilon # avoid 0 + clipped_error = np.clip(error, 0, self.abs_err_upper) + return np.power(clipped_error, self.alpha) + + +class DQNPrioritizedReplay: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.005, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=500, + memory_size=10000, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + prioritized=True, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + self.prioritized = prioritized # decide to use double q or not + + self.learn_step_counter = 0 + + self._build_net() + + if self.prioritized: + self.memory = Memory(capacity=memory_size) + else: + self.memory = np.zeros((self.memory_size, n_features*2+2)) + + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + return 
out + + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + if self.prioritized: + self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') + with tf.variable_scope('eval_net'): + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('loss'): + if self.prioritized: + self.abs_errors = tf.reduce_sum(tf.abs(self.q_target - self.q_eval), axis=1) # for updating Sumtree + self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval)) + else: + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + def store_transition(self, s, a, r, s_): + if self.prioritized: # prioritized replay + transition = np.hstack((s, [a, r], s_)) + max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:]) # have high priority for newly arrived transition + self.memory.store(max_p, transition) + else: # random replay + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + observation = observation[np.newaxis, :] + if np.random.uniform() < self.epsilon: + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + if self.prioritized: + tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size) + else: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={self.s_: batch_memory[:, -self.n_features:], + self.s: batch_memory[:, :self.n_features]}) + + q_target = q_eval.copy() + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + if self.prioritized: + _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: 
q_target, + self.ISWeights: ISWeights}) + for i in range(len(tree_idx)): # update priority + idx = tree_idx[i] + self.memory.update(idx, abs_errors[i]) + else: + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + + self.cost_his.append(self.cost) + + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 diff --git a/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py b/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py new file mode 100644 index 0000000..08c2562 --- /dev/null +++ b/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py @@ -0,0 +1,80 @@ +""" +The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DQNPrioritizedReplay +import matplotlib.pyplot as plt +import tensorflow as tf +import numpy as np + +env = gym.make('MountainCar-v0') +env = env.unwrapped +env.seed(21) +MEMORY_SIZE = 10000 + +sess = tf.Session() +with tf.variable_scope('natural_DQN'): + RL_natural = DQNPrioritizedReplay( + n_actions=3, n_features=2, memory_size=MEMORY_SIZE, + e_greedy_increment=0.00005, sess=sess, prioritized=False, + ) + +with tf.variable_scope('DQN_with_prioritized_replay'): + RL_prio = DQNPrioritizedReplay( + n_actions=3, n_features=2, memory_size=MEMORY_SIZE, + e_greedy_increment=0.00005, sess=sess, prioritized=True, output_graph=True, + ) +sess.run(tf.global_variables_initializer()) + + +def train(RL): + total_steps = 0 + steps = [] + episodes = [] + for i_episode in range(20): + observation = env.reset() + while True: + # env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + if done: reward = 10 + + RL.store_transition(observation, action, reward, observation_) + + if total_steps > MEMORY_SIZE: + RL.learn() + + if done: + print('episode ', i_episode, ' finished') + steps.append(total_steps) + episodes.append(i_episode) + break + + observation = observation_ + total_steps += 1 + return np.vstack((episodes, steps)) + +his_natural = train(RL_natural) +his_prio = train(RL_prio) + +# compare based on first success +plt.plot(his_natural[0, :], his_natural[1, :] - his_natural[1, 0], c='b', label='natural DQN') +plt.plot(his_prio[0, :], his_prio[1, :] - his_prio[1, 0], c='r', label='DQN with prioritized replay') +plt.legend(loc='best') +plt.ylabel('total training time') +plt.xlabel('episode') +plt.grid() +plt.show() + + diff --git a/contents/5.3_Dueling_DQN/RL_brain.py b/contents/5.3_Dueling_DQN/RL_brain.py new file mode 100644 index 0000000..fec458f --- /dev/null +++ b/contents/5.3_Dueling_DQN/RL_brain.py @@ -0,0 +1,165 @@ +""" +The Dueling DQN based on this paper: https://arxiv.org/abs/1511.06581 + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class DuelingDQN: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.001, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=200, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + dueling=True, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = 
reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + self.dueling = dueling # decide to use dueling DQN or not + + self.learn_step_counter = 0 + self.memory = np.zeros((self.memory_size, n_features*2+2)) + self._build_net() + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + if self.dueling: + # Dueling DQN + with tf.variable_scope('Value'): + w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names) + self.V = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('Advantage'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.A = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('Q'): + out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a) + else: + with tf.variable_scope('Q'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + + return out + + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + with tf.variable_scope('eval_net'): + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + observation = observation[np.newaxis, :] + if np.random.uniform() < self.epsilon: # choosing action + actions_value = 
self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval4next, = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation + self.s: batch_memory[:, -self.n_features:]}) # next observation + q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]}) + + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + + + + diff --git a/contents/5.3_Dueling_DQN/run_Pendulum.py b/contents/5.3_Dueling_DQN/run_Pendulum.py new file mode 100644 index 0000000..d7b2e70 --- /dev/null +++ b/contents/5.3_Dueling_DQN/run_Pendulum.py @@ -0,0 +1,86 @@ +""" +Dueling DQN & Natural DQN comparison + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DuelingDQN +import numpy as np +import matplotlib.pyplot as plt +import tensorflow as tf + + +env = gym.make('Pendulum-v0') +env = env.unwrapped +env.seed(1) +MEMORY_SIZE = 3000 +ACTION_SPACE = 25 + +sess = tf.Session() +with tf.variable_scope('natural'): + natural_DQN = DuelingDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, sess=sess, dueling=False) + +with tf.variable_scope('dueling'): + dueling_DQN = DuelingDQN( + n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, + e_greedy_increment=0.001, sess=sess, dueling=True, output_graph=True) + +sess.run(tf.global_variables_initializer()) + + +def train(RL): + acc_r = [0] + total_steps = 0 + observation = env.reset() + while True: + # if total_steps-MEMORY_SIZE > 9000: env.render() + + action = RL.choose_action(observation) + + f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # [-2 ~ 2] float actions + observation_, reward, done, info = env.step(np.array([f_action])) + + reward /= 10 # normalize to a range of (-1, 0) + acc_r.append(reward + acc_r[-1]) # accumulated reward + + RL.store_transition(observation, action, reward, observation_) + + if total_steps > MEMORY_SIZE: + RL.learn() + + if total_steps-MEMORY_SIZE > 15000: + break + + observation = observation_ + total_steps += 1 + return RL.cost_his, acc_r + +c_natural, r_natural = train(natural_DQN) +c_dueling, r_dueling = train(dueling_DQN) + +plt.figure(1) +plt.plot(np.array(c_natural), c='r', label='natural') +plt.plot(np.array(c_dueling), c='b', label='dueling') 
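The dueling head assembled in _build_net above recombines a scalar state value with mean-centred advantages into Q values; a minimal NumPy sketch of that aggregation (shapes and values are illustrative):

import numpy as np

def dueling_q(V, A):
    # V: (batch, 1) state value, A: (batch, n_actions) advantages
    return V + (A - A.mean(axis=1, keepdims=True))   # Q = V(s) + (A(s,a) - mean_a A(s,a))

print(dueling_q(np.array([[1.0]]), np.array([[0.5, -0.5, 1.0]])))   # ~[[1.17, 0.17, 1.67]]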
+plt.legend(loc='best') +plt.ylabel('cost') +plt.xlabel('training steps') +plt.grid() + +plt.figure(2) +plt.plot(np.array(r_natural), c='r', label='natural') +plt.plot(np.array(r_dueling), c='b', label='dueling') +plt.legend(loc='best') +plt.ylabel('accumulated reward') +plt.xlabel('training steps') +plt.grid() + +plt.show() + diff --git a/contents/5_Deep_Q_Network/DQN_modified.py b/contents/5_Deep_Q_Network/DQN_modified.py new file mode 100644 index 0000000..16712ec --- /dev/null +++ b/contents/5_Deep_Q_Network/DQN_modified.py @@ -0,0 +1,172 @@ +""" +This part of code is the DQN brain. + +view the tensorboard picture about this DQN structure on: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.7.3 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +# Deep Q Network off-policy +class DeepQNetwork: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=300, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + # total learning step + self.learn_step_counter = 0 + + # initialize zero memory [s, a, r, s_] + self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) + + # consist of [target_net, evaluate_net] + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, n_l1, w_initializer, b_initializer): + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(s, w1) + b1) + + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + out = tf.matmul(l1, w2) + b2 + return out + + # ------------------ all inputs ------------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input State + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input Next State + self.r = tf.placeholder(tf.float32, [None, ], name='r') # input Reward + self.a = tf.placeholder(tf.int32, [None, ], name='a') # input Action + + # ------------------ build evaluate_net ------------------ + with tf.variable_scope('eval_net'): + # c_names(collections_names) are the collections to store variables + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + self.q_eval = build_layers(self.s, c_names, n_l1, 
w_initializer, b_initializer) + + # ------------------ build target_net ------------------ + with tf.variable_scope('target_net'): + # c_names(collections_names) are the collections to store variables + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) + + with tf.variable_scope('q_target'): + self.q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_') # shape=(None, ) + + with tf.variable_scope('q_eval'): + a_one_hot = tf.one_hot(self.a, depth=self.n_actions, dtype=tf.float32) + self.q_eval_wrt_a = tf.reduce_sum(self.q_eval * a_one_hot, axis=1) # shape=(None, ) + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error')) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + transition = np.hstack((s, [a, r], s_)) + # replace the old memory with new memory + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + self.memory_counter += 1 + + def choose_action(self, observation): + # to have batch dimension when feed into tf placeholder + observation = observation[np.newaxis, :] + + if np.random.uniform() < self.epsilon: + # forward feed the observation and get q value for every actions + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + # check to replace target parameters + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + # sample batch memory from all memory + if self.memory_counter > self.memory_size: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + _, cost = self.sess.run( + [self._train_op, self.loss], + feed_dict={ + self.s: batch_memory[:, :self.n_features], + self.a: batch_memory[:, self.n_features], + self.r: batch_memory[:, self.n_features + 1], + self.s_: batch_memory[:, -self.n_features:], + }) + + self.cost_his.append(cost) + + # increasing epsilon + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + def plot_cost(self): + import matplotlib.pyplot as plt + plt.plot(np.arange(len(self.cost_his)), self.cost_his) + plt.ylabel('Cost') + plt.xlabel('training steps') + plt.show() + +if __name__ == '__main__': + DQN = DeepQNetwork(3,4, output_graph=True) \ No newline at end of file diff --git a/contents/5_Deep_Q_Network/RL_brain.py b/contents/5_Deep_Q_Network/RL_brain.py new file mode 100644 index 0000000..f2d0ef7 --- /dev/null +++ b/contents/5_Deep_Q_Network/RL_brain.py @@ -0,0 +1,213 @@ +""" +This part of code is the DQN brain, which is a brain of the agent. +All decisions are made in here. +Using Tensorflow to build the neural network. 
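The one_hot/reduce_sum pair in DQN_modified.py above is simply a per-row gather of Q(s, a) for the action actually taken; the same operation in NumPy, with illustrative values:

import numpy as np

q_eval = np.array([[1., 2., 3.],
                   [4., 5., 6.]])              # (batch, n_actions)
actions = np.array([0, 2])
mask = np.eye(q_eval.shape[1])[actions]        # plays the role of tf.one_hot
print((q_eval * mask).sum(axis=1))             # [1. 6.], i.e. Q(s0, a0) and Q(s1, a2)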
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.7.3 +""" + +import numpy as np +import pandas as pd +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +# Deep Q Network off-policy +class DeepQNetwork: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=300, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + # total learning step + self.learn_step_counter = 0 + + # initialize zero memory [s, a, r, s_] + self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) + + # consist of [target_net, evaluate_net] + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + self.cost_his = [] + + def _build_net(self): + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + with tf.variable_scope('eval_net'): + # c_names(collections_names) are the collections to store variables + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1) + + # second layer. collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_eval = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + # c_names(collections_names) are the collections to store variables + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1) + + # second layer. 
collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_next = tf.matmul(l1, w2) + b2 + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + + transition = np.hstack((s, [a, r], s_)) + + # replace the old memory with new memory + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + + self.memory_counter += 1 + + def choose_action(self, observation): + # to have batch dimension when feed into tf placeholder + observation = observation[np.newaxis, :] + + if np.random.uniform() < self.epsilon: + # forward feed the observation and get q value for every actions + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + # check to replace target parameters + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + # sample batch memory from all memory + if self.memory_counter > self.memory_size: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={ + self.s_: batch_memory[:, -self.n_features:], # fixed params + self.s: batch_memory[:, :self.n_features], # newest params + }) + + # change q_target w.r.t q_eval's action + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + """ + For example in this batch I have 2 samples and 3 actions: + q_eval = + [[1, 2, 3], + [4, 5, 6]] + + q_target = q_eval = + [[1, 2, 3], + [4, 5, 6]] + + Then change q_target with the real q_target value w.r.t the q_eval's action. + For example in: + sample 0, I took action 0, and the max q_target value is -1; + sample 1, I took action 2, and the max q_target value is -2: + q_target = + [[-1, 2, 3], + [4, 5, -2]] + + So the (q_target - q_eval) becomes: + [[(-1)-(1), 0, 0], + [0, 0, (-2)-(6)]] + + We then backpropagate this error w.r.t the corresponding action to network, + leave other action as error=0 cause we didn't choose it. 
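The worked example in the comment above can be reproduced directly; a short NumPy check using the same numbers:

import numpy as np

q_eval = np.array([[1., 2., 3.],
                   [4., 5., 6.]])
q_target = q_eval.copy()
actions = np.array([0, 2])             # actions taken in sample 0 and sample 1
targets = np.array([-1., -2.])         # r + gamma * max(q_next) for each sample
q_target[np.arange(2), actions] = targets
print(q_target - q_eval)               # [[-2.  0.  0.] [ 0.  0. -8.]]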
+ """ + + # train eval network + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + # increasing epsilon + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + def plot_cost(self): + import matplotlib.pyplot as plt + plt.plot(np.arange(len(self.cost_his)), self.cost_his) + plt.ylabel('Cost') + plt.xlabel('training steps') + plt.show() + + + diff --git a/contents/5_Deep_Q_Network/maze_env.py b/contents/5_Deep_Q_Network/maze_env.py new file mode 100644 index 0000000..5134df0 --- /dev/null +++ b/contents/5_Deep_Q_Network/maze_env.py @@ -0,0 +1,130 @@ +""" +Reinforcement learning maze example. + +Red rectangle: explorer. +Black rectangles: hells [reward = -1]. +Yellow bin circle: paradise [reward = +1]. +All other states: ground [reward = 0]. + +This script is the environment part of this example. +The RL is in RL_brain.py. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ +""" + + +import numpy as np +import tkinter as tk +import time + + +UNIT = 40 # pixels +MAZE_H = 4 # grid height +MAZE_W = 4 # grid width + + +class Maze(tk.Tk): + def __init__(self): + super(Maze, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.n_features = 2 + self.title('maze') + self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) + self._build_maze() + + def _build_maze(self): + self.canvas = tk.Canvas(self, bg='white', + height=MAZE_H * UNIT, + width=MAZE_W * UNIT) + + # create grids + for c in range(0, MAZE_W * UNIT, UNIT): + x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT + self.canvas.create_line(x0, y0, x1, y1) + for r in range(0, MAZE_H * UNIT, UNIT): + x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r + self.canvas.create_line(x0, y0, x1, y1) + + # create origin + origin = np.array([20, 20]) + + # hell + hell1_center = origin + np.array([UNIT * 2, UNIT]) + self.hell1 = self.canvas.create_rectangle( + hell1_center[0] - 15, hell1_center[1] - 15, + hell1_center[0] + 15, hell1_center[1] + 15, + fill='black') + # hell + # hell2_center = origin + np.array([UNIT, UNIT * 2]) + # self.hell2 = self.canvas.create_rectangle( + # hell2_center[0] - 15, hell2_center[1] - 15, + # hell2_center[0] + 15, hell2_center[1] + 15, + # fill='black') + + # create oval + oval_center = origin + UNIT * 2 + self.oval = self.canvas.create_oval( + oval_center[0] - 15, oval_center[1] - 15, + oval_center[0] + 15, oval_center[1] + 15, + fill='yellow') + + # create red rect + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + + # pack all + self.canvas.pack() + + def reset(self): + self.update() + time.sleep(0.1) + self.canvas.delete(self.rect) + origin = np.array([20, 20]) + self.rect = self.canvas.create_rectangle( + origin[0] - 15, origin[1] - 15, + origin[0] + 15, origin[1] + 15, + fill='red') + # return observation + return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) + + def step(self, action): + s = self.canvas.coords(self.rect) + base_action = np.array([0, 0]) + if action == 0: # up + if s[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if s[1] < (MAZE_H - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # right + if s[0] < (MAZE_W - 1) * UNIT: + base_action[0] += UNIT + elif action == 3: # left + 
if s[0] > UNIT: + base_action[0] -= UNIT + + self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent + + next_coords = self.canvas.coords(self.rect) # next state + + # reward function + if next_coords == self.canvas.coords(self.oval): + reward = 1 + done = True + elif next_coords in [self.canvas.coords(self.hell1)]: + reward = -1 + done = True + else: + reward = 0 + done = False + s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) + return s_, reward, done + + def render(self): + # time.sleep(0.01) + self.update() + + diff --git a/contents/5_Deep_Q_Network/run_this.py b/contents/5_Deep_Q_Network/run_this.py new file mode 100644 index 0000000..cec116b --- /dev/null +++ b/contents/5_Deep_Q_Network/run_this.py @@ -0,0 +1,61 @@ +""" +Sarsa is a online updating method for Reinforcement learning. + +Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory. + +You will see the sarsa is more coward when punishment is close because it cares about all behaviours, +while q learning is more brave because it only cares about maximum behaviour. +""" + +from maze_env import Maze +from RL_brain import DeepQNetwork + + +def run_maze(): + step = 0 + for episode in range(300): + # initial observation + observation = env.reset() + + while True: + # fresh env + env.render() + + # RL choose action based on observation + action = RL.choose_action(observation) + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + RL.store_transition(observation, action, reward, observation_) + + if (step > 200) and (step % 5 == 0): + RL.learn() + + # swap observation + observation = observation_ + + # break while loop when end of this episode + if done: + break + step += 1 + + # end of game + print('game over') + env.destroy() + + +if __name__ == "__main__": + # maze game + env = Maze() + RL = DeepQNetwork(env.n_actions, env.n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=200, + memory_size=2000, + # output_graph=True + ) + env.after(100, run_maze) + env.mainloop() + RL.plot_cost() \ No newline at end of file diff --git a/contents/6_OpenAI_gym/RL_brain.py b/contents/6_OpenAI_gym/RL_brain.py new file mode 100644 index 0000000..bc3796c --- /dev/null +++ b/contents/6_OpenAI_gym/RL_brain.py @@ -0,0 +1,213 @@ +""" +This part of code is the DQN brain, which is a brain of the agent. +All decisions are made in here. +Using Tensorflow to build the neural network. 
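One practical note on the e_greedy_increment argument used by these DQN brains: when it is set, epsilon starts at 0 and grows by that amount on every learn() call until it reaches e_greedy. A quick sketch of the timescale this implies, with illustrative values:

eps_max, inc = 0.9, 0.001              # illustrative: e_greedy=0.9, e_greedy_increment=0.001
calls_until_greedy = round(eps_max / inc)
print(calls_until_greedy)              # ~900 learn() calls before the policy becomes 90% greedy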
+ +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import pandas as pd +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +# Deep Q Network off-policy +class DeepQNetwork: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=300, + memory_size=500, + batch_size=32, + e_greedy_increment=None, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max + + # total learning step + self.learn_step_counter = 0 + + # initialize zero memory [s, a, r, s_] + self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) + + # consist of [target_net, evaluate_net] + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + self.cost_his = [] + + def _build_net(self): + # ------------------ build evaluate_net ------------------ + self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input + self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss + with tf.variable_scope('eval_net'): + # c_names(collections_names) are the collections to store variables + c_names, n_l1, w_initializer, b_initializer = \ + ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \ + tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1) + + # second layer. collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_eval = tf.matmul(l1, w2) + b2 + + with tf.variable_scope('loss'): + self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval)) + with tf.variable_scope('train'): + self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + # ------------------ build target_net ------------------ + self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input + with tf.variable_scope('target_net'): + # c_names(collections_names) are the collections to store variables + c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] + + # first layer. collections is used later when assign to target net + with tf.variable_scope('l1'): + w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names) + b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) + l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1) + + # second layer. 
collections is used later when assign to target net + with tf.variable_scope('l2'): + w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names) + b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names) + self.q_next = tf.matmul(l1, w2) + b2 + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + self.memory_counter = 0 + + transition = np.hstack((s, [a, r], s_)) + + # replace the old memory with new memory + index = self.memory_counter % self.memory_size + self.memory[index, :] = transition + + self.memory_counter += 1 + + def choose_action(self, observation): + # to have batch dimension when feed into tf placeholder + observation = observation[np.newaxis, :] + + if np.random.uniform() < self.epsilon: + # forward feed the observation and get q value for every actions + actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) + action = np.argmax(actions_value) + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + t_params = tf.get_collection('target_net_params') + e_params = tf.get_collection('eval_net_params') + self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) + + def learn(self): + # check to replace target parameters + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget_params_replaced\n') + + # sample batch memory from all memory + if self.memory_counter > self.memory_size: + sample_index = np.random.choice(self.memory_size, size=self.batch_size) + else: + sample_index = np.random.choice(self.memory_counter, size=self.batch_size) + batch_memory = self.memory[sample_index, :] + + q_next, q_eval = self.sess.run( + [self.q_next, self.q_eval], + feed_dict={ + self.s_: batch_memory[:, -self.n_features:], # fixed params + self.s: batch_memory[:, :self.n_features], # newest params + }) + + # change q_target w.r.t q_eval's action + q_target = q_eval.copy() + + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory[:, self.n_features].astype(int) + reward = batch_memory[:, self.n_features + 1] + + q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) + + """ + For example in this batch I have 2 samples and 3 actions: + q_eval = + [[1, 2, 3], + [4, 5, 6]] + + q_target = q_eval = + [[1, 2, 3], + [4, 5, 6]] + + Then change q_target with the real q_target value w.r.t the q_eval's action. + For example in: + sample 0, I took action 0, and the max q_target value is -1; + sample 1, I took action 2, and the max q_target value is -2: + q_target = + [[-1, 2, 3], + [4, 5, -2]] + + So the (q_target - q_eval) becomes: + [[(-1)-(1), 0, 0], + [0, 0, (-2)-(6)]] + + We then backpropagate this error w.r.t the corresponding action to network, + leave other action as error=0 cause we didn't choose it. 
+ """ + + # train eval network + _, self.cost = self.sess.run([self._train_op, self.loss], + feed_dict={self.s: batch_memory[:, :self.n_features], + self.q_target: q_target}) + self.cost_his.append(self.cost) + + # increasing epsilon + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + def plot_cost(self): + import matplotlib.pyplot as plt + plt.plot(np.arange(len(self.cost_his)), self.cost_his) + plt.ylabel('Cost') + plt.xlabel('training steps') + plt.show() + + + diff --git a/contents/6_OpenAI_gym/run_CartPole.py b/contents/6_OpenAI_gym/run_CartPole.py new file mode 100644 index 0000000..104bde4 --- /dev/null +++ b/contents/6_OpenAI_gym/run_CartPole.py @@ -0,0 +1,62 @@ +""" +Deep Q network, + +Using: +Tensorflow: 1.0 +gym: 0.7.3 +""" + + +import gym +from RL_brain import DeepQNetwork + +env = gym.make('CartPole-v0') +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = DeepQNetwork(n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.01, e_greedy=0.9, + replace_target_iter=100, memory_size=2000, + e_greedy_increment=0.001,) + +total_steps = 0 + + +for i_episode in range(100): + + observation = env.reset() + ep_r = 0 + while True: + env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + # the smaller theta and closer to center the better + x, x_dot, theta, theta_dot = observation_ + r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8 + r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5 + reward = r1 + r2 + + RL.store_transition(observation, action, reward, observation_) + + ep_r += reward + if total_steps > 1000: + RL.learn() + + if done: + print('episode: ', i_episode, + 'ep_r: ', round(ep_r, 2), + ' epsilon: ', round(RL.epsilon, 2)) + break + + observation = observation_ + total_steps += 1 + +RL.plot_cost() diff --git a/contents/6_OpenAI_gym/run_MountainCar.py b/contents/6_OpenAI_gym/run_MountainCar.py new file mode 100644 index 0000000..cdda953 --- /dev/null +++ b/contents/6_OpenAI_gym/run_MountainCar.py @@ -0,0 +1,61 @@ +""" +Deep Q network, + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + + +import gym +from RL_brain import DeepQNetwork + +env = gym.make('MountainCar-v0') +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, e_greedy=0.9, + replace_target_iter=300, memory_size=3000, + e_greedy_increment=0.0002,) + +total_steps = 0 + + +for i_episode in range(10): + + observation = env.reset() + ep_r = 0 + while True: + env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + position, velocity = observation_ + + # the higher the better + reward = abs(position - (-0.5)) # r in [0, 1] + + RL.store_transition(observation, action, reward, observation_) + + if total_steps > 1000: + RL.learn() + + ep_r += reward + if done: + get = '| Get' if observation_[0] >= env.unwrapped.goal_position else '| ----' + print('Epi: ', i_episode, + get, + '| Ep_r: ', round(ep_r, 4), + '| Epsilon: ', round(RL.epsilon, 2)) + break + + observation = observation_ + total_steps += 1 + +RL.plot_cost() diff --git a/contents/7_Policy_gradient_softmax/RL_brain.py 
b/contents/7_Policy_gradient_softmax/RL_brain.py new file mode 100644 index 0000000..e76b234 --- /dev/null +++ b/contents/7_Policy_gradient_softmax/RL_brain.py @@ -0,0 +1,124 @@ +""" +This part of code is the reinforcement learning brain, which is a brain of the agent. +All decisions are made in here. + +Policy Gradient, Reinforcement Learning. + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import numpy as np +import tensorflow as tf + +# reproducible +np.random.seed(1) +tf.set_random_seed(1) + + +class PolicyGradient: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.95, + output_graph=False, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + + self.ep_obs, self.ep_as, self.ep_rs = [], [], [] + + self._build_net() + + self.sess = tf.Session() + + if output_graph: + # $ tensorboard --logdir=logs + # http://0.0.0.0:6006/ + # tf.train.SummaryWriter soon be deprecated, use following + tf.summary.FileWriter("logs/", self.sess.graph) + + self.sess.run(tf.global_variables_initializer()) + + def _build_net(self): + with tf.name_scope('inputs'): + self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations") + self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num") + self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value") + # fc1 + layer = tf.layers.dense( + inputs=self.tf_obs, + units=10, + activation=tf.nn.tanh, # tanh activation + kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3), + bias_initializer=tf.constant_initializer(0.1), + name='fc1' + ) + # fc2 + all_act = tf.layers.dense( + inputs=layer, + units=self.n_actions, + activation=None, + kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3), + bias_initializer=tf.constant_initializer(0.1), + name='fc2' + ) + + self.all_act_prob = tf.nn.softmax(all_act, name='act_prob') # use softmax to convert to probability + + with tf.name_scope('loss'): + # to maximize total reward (log_p * R) is to minimize -(log_p * R), and the tf only have minimize(loss) + neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts) # this is negative log of chosen action + # or in this way: + # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1) + loss = tf.reduce_mean(neg_log_prob * self.tf_vt) # reward guided loss + + with tf.name_scope('train'): + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) + + def choose_action(self, observation): + prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]}) + action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel()) # select action w.r.t the actions prob + return action + + def store_transition(self, s, a, r): + self.ep_obs.append(s) + self.ep_as.append(a) + self.ep_rs.append(r) + + def learn(self): + # discount and normalize episode reward + discounted_ep_rs_norm = self._discount_and_norm_rewards() + + # train on episode + self.sess.run(self.train_op, feed_dict={ + self.tf_obs: np.vstack(self.ep_obs), # shape=[None, n_obs] + self.tf_acts: np.array(self.ep_as), # shape=[None, ] + self.tf_vt: discounted_ep_rs_norm, # shape=[None, ] + }) + + self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data + return discounted_ep_rs_norm + + def _discount_and_norm_rewards(self): + # discount episode 
rewards + discounted_ep_rs = np.zeros_like(self.ep_rs) + running_add = 0 + for t in reversed(range(0, len(self.ep_rs))): + running_add = running_add * self.gamma + self.ep_rs[t] + discounted_ep_rs[t] = running_add + + # normalize episode rewards + discounted_ep_rs -= np.mean(discounted_ep_rs) + discounted_ep_rs /= np.std(discounted_ep_rs) + return discounted_ep_rs + + + diff --git a/contents/7_Policy_gradient_softmax/run_CartPole.py b/contents/7_Policy_gradient_softmax/run_CartPole.py new file mode 100644 index 0000000..7d46aee --- /dev/null +++ b/contents/7_Policy_gradient_softmax/run_CartPole.py @@ -0,0 +1,69 @@ +""" +Policy Gradient, Reinforcement Learning. + +The cart pole example + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import gym +from RL_brain import PolicyGradient +import matplotlib.pyplot as plt + +DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold +RENDER = False # rendering wastes time + +env = gym.make('CartPole-v0') +env.seed(1) # reproducible, general Policy gradient has high variance +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = PolicyGradient( + n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.02, + reward_decay=0.99, + # output_graph=True, +) + +for i_episode in range(3000): + + observation = env.reset() + + while True: + if RENDER: env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + RL.store_transition(observation, action, reward) + + if done: + ep_rs_sum = sum(RL.ep_rs) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + print("episode:", i_episode, " reward:", int(running_reward)) + + vt = RL.learn() + + if i_episode == 0: + plt.plot(vt) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.show() + break + + observation = observation_ diff --git a/contents/7_Policy_gradient_softmax/run_MountainCar.py b/contents/7_Policy_gradient_softmax/run_MountainCar.py new file mode 100644 index 0000000..926269d --- /dev/null +++ b/contents/7_Policy_gradient_softmax/run_MountainCar.py @@ -0,0 +1,76 @@ +""" +Policy Gradient, Reinforcement Learning. 
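This script relies on the PolicyGradient brain defined above; at the end of each episode it trains on the
discounted, normalized episode return. A rough standalone sketch of that computation (same logic as
_discount_and_norm_rewards above, function name here is illustrative):

import numpy as np

def discounted_normalized_returns(rewards, gamma=0.995):
    # accumulate reward-to-go from the end of the episode backwards
    returns = np.zeros(len(rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        returns[t] = running_add
    # normalizing reduces the variance of the policy-gradient estimate
    returns -= returns.mean()
    returns /= returns.std()
    return returns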
+ +The cart pole example + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +gym: 0.8.0 +""" + +import gym +from RL_brain import PolicyGradient +import matplotlib.pyplot as plt + +DISPLAY_REWARD_THRESHOLD = -2000 # renders environment if total episode reward is greater then this threshold +# episode: 154 reward: -10667 +# episode: 387 reward: -2009 +# episode: 489 reward: -1006 +# episode: 628 reward: -502 + +RENDER = False # rendering wastes time + +env = gym.make('MountainCar-v0') +env.seed(1) # reproducible, general Policy gradient has high variance +env = env.unwrapped + +print(env.action_space) +print(env.observation_space) +print(env.observation_space.high) +print(env.observation_space.low) + +RL = PolicyGradient( + n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.02, + reward_decay=0.995, + # output_graph=True, +) + +for i_episode in range(1000): + + observation = env.reset() + + while True: + if RENDER: env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) # reward = -1 in all cases + + RL.store_transition(observation, action, reward) + + if done: + # calculate running reward + ep_rs_sum = sum(RL.ep_rs) + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + + print("episode:", i_episode, " reward:", int(running_reward)) + + vt = RL.learn() # train + + if i_episode == 30: + plt.plot(vt) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.show() + + break + + observation = observation_ diff --git a/contents/8_Actor_Critic_Advantage/AC_CartPole.py b/contents/8_Actor_Critic_Advantage/AC_CartPole.py new file mode 100644 index 0000000..b01a4b9 --- /dev/null +++ b/contents/8_Actor_Critic_Advantage/AC_CartPole.py @@ -0,0 +1,169 @@ +""" +Actor-Critic using TD-error as the Advantage, Reinforcement Learning. + +The cart pole example. Policy is oscillated. 
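In compressed form, the update implemented by the Actor and Critic classes below is (a scalar sketch,
illustration only, not the actual TensorFlow graph):

GAMMA = 0.9   # same reward discount as in the script below

def actor_critic_step(r, v_s, v_s_next, log_prob_a):
    # the one-step TD error doubles as the advantage estimate
    td_error = r + GAMMA * v_s_next - v_s
    critic_loss = td_error ** 2                # Critic minimizes the squared TD error
    actor_objective = log_prob_a * td_error    # Actor maximizes log pi(a|s) * TD error
    return td_error, critic_loss, actor_objective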
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import numpy as np +import tensorflow as tf +import gym + +np.random.seed(2) +tf.set_random_seed(2) # reproducible + +# Superparameters +OUTPUT_GRAPH = False +MAX_EPISODE = 3000 +DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this threshold +MAX_EP_STEPS = 1000 # maximum time step in one episode +RENDER = False # rendering wastes time +GAMMA = 0.9 # reward discount in TD error +LR_A = 0.001 # learning rate for actor +LR_C = 0.01 # learning rate for critic + +env = gym.make('CartPole-v0') +env.seed(1) # reproducible +env = env.unwrapped + +N_F = env.observation_space.shape[0] +N_A = env.action_space.n + + +class Actor(object): + def __init__(self, sess, n_features, n_actions, lr=0.001): + self.sess = sess + + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.a = tf.placeholder(tf.int32, None, "act") + self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error + + with tf.variable_scope('Actor'): + l1 = tf.layers.dense( + inputs=self.s, + units=20, # number of hidden units + activation=tf.nn.relu, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + self.acts_prob = tf.layers.dense( + inputs=l1, + units=n_actions, # output units + activation=tf.nn.softmax, # get action probabilities + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='acts_prob' + ) + + with tf.variable_scope('exp_v'): + log_prob = tf.log(self.acts_prob[0, self.a]) + self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss + + with tf.variable_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) + + def learn(self, s, a, td): + s = s[np.newaxis, :] + feed_dict = {self.s: s, self.a: a, self.td_error: td} + _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict) + return exp_v + + def choose_action(self, s): + s = s[np.newaxis, :] + probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions + return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return a int + + +class Critic(object): + def __init__(self, sess, n_features, lr=0.01): + self.sess = sess + + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next") + self.r = tf.placeholder(tf.float32, None, 'r') + + with tf.variable_scope('Critic'): + l1 = tf.layers.dense( + inputs=self.s, + units=20, # number of hidden units + activation=tf.nn.relu, # None + # have to be linear to make sure the convergence of actor. + # But linear approximator seems hardly learns the correct Q. 
+ kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + self.v = tf.layers.dense( + inputs=l1, + units=1, # output units + activation=None, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='V' + ) + + with tf.variable_scope('squared_TD_error'): + self.td_error = self.r + GAMMA * self.v_ - self.v + self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval + with tf.variable_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) + + def learn(self, s, r, s_): + s, s_ = s[np.newaxis, :], s_[np.newaxis, :] + + v_ = self.sess.run(self.v, {self.s: s_}) + td_error, _ = self.sess.run([self.td_error, self.train_op], + {self.s: s, self.v_: v_, self.r: r}) + return td_error + + +sess = tf.Session() + +actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A) +critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor + +sess.run(tf.global_variables_initializer()) + +if OUTPUT_GRAPH: + tf.summary.FileWriter("logs/", sess.graph) + +for i_episode in range(MAX_EPISODE): + s = env.reset() + t = 0 + track_r = [] + while True: + if RENDER: env.render() + + a = actor.choose_action(s) + + s_, r, done, info = env.step(a) + + if done: r = -20 + + track_r.append(r) + + td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] + actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] + + s = s_ + t += 1 + + if done or t >= MAX_EP_STEPS: + ep_rs_sum = sum(track_r) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + print("episode:", i_episode, " reward:", int(running_reward)) + break + diff --git a/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py new file mode 100644 index 0000000..07fc378 --- /dev/null +++ b/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py @@ -0,0 +1,179 @@ +""" +Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning. + +The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) + +Cannot converge!!! oscillate!!! 
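The continuous action is drawn from a Gaussian policy; a rough NumPy sketch of what choose_action()
below amounts to (names here are illustrative):

import numpy as np

def sample_action(mu, sigma, action_bound):
    # mu comes from a tanh head scaled to the action range, sigma from a softplus head
    a = np.random.normal(loc=mu, scale=sigma)
    return np.clip(a, action_bound[0], action_bound[1])

# e.g. sample_action(mu=0.3, sigma=0.5, action_bound=(-2.0, 2.0))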
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import tensorflow as tf +import numpy as np +import gym + +np.random.seed(2) +tf.set_random_seed(2) # reproducible + + +class Actor(object): + def __init__(self, sess, n_features, action_bound, lr=0.0001): + self.sess = sess + + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.a = tf.placeholder(tf.float32, None, name="act") + self.td_error = tf.placeholder(tf.float32, None, name="td_error") # TD_error + + l1 = tf.layers.dense( + inputs=self.s, + units=30, # number of hidden units + activation=tf.nn.relu, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + mu = tf.layers.dense( + inputs=l1, + units=1, # number of hidden units + activation=tf.nn.tanh, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='mu' + ) + + sigma = tf.layers.dense( + inputs=l1, + units=1, # output units + activation=tf.nn.softplus, # get action probabilities + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(1.), # biases + name='sigma' + ) + global_step = tf.Variable(0, trainable=False) + # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9) + self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1) + self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma) + + self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1]) + + with tf.name_scope('exp_v'): + log_prob = self.normal_dist.log_prob(self.a) # loss without advantage + self.exp_v = log_prob * self.td_error # advantage (TD_error) guided loss + # Add cross entropy cost to encourage exploration + self.exp_v += self.normal_dist.entropy() + + with tf.name_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step) # min(v) = max(-v) + + def learn(self, s, a, td): + s = s[np.newaxis, :] + feed_dict = {self.s: s, self.a: a, self.td_error: td} + _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict) + return exp_v + + def choose_action(self, s): + s = s[np.newaxis, :] + return self.sess.run(self.action, {self.s: s}) # get probabilities for all actions + + +class Critic(object): + def __init__(self, sess, n_features, lr=0.01): + self.sess = sess + with tf.name_scope('inputs'): + self.s = tf.placeholder(tf.float32, [1, n_features], "state") + self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next") + self.r = tf.placeholder(tf.float32, name='r') + + with tf.variable_scope('Critic'): + l1 = tf.layers.dense( + inputs=self.s, + units=30, # number of hidden units + activation=tf.nn.relu, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='l1' + ) + + self.v = tf.layers.dense( + inputs=l1, + units=1, # output units + activation=None, + kernel_initializer=tf.random_normal_initializer(0., .1), # weights + bias_initializer=tf.constant_initializer(0.1), # biases + name='V' + ) + + with tf.variable_scope('squared_TD_error'): + self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v) + self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval + with tf.variable_scope('train'): + self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) + + def learn(self, s, r, s_): + s, s_ = 
s[np.newaxis, :], s_[np.newaxis, :] + + v_ = self.sess.run(self.v, {self.s: s_}) + td_error, _ = self.sess.run([self.td_error, self.train_op], + {self.s: s, self.v_: v_, self.r: r}) + return td_error + + +OUTPUT_GRAPH = False +MAX_EPISODE = 1000 +MAX_EP_STEPS = 300 +DISPLAY_REWARD_THRESHOLD = -550 # renders environment if total episode reward is greater then this threshold +RENDER = False # rendering wastes time +GAMMA = 0.9 +LR_A = 0.001 # learning rate for actor +LR_C = 0.01 # learning rate for critic + +env = gym.make('Pendulum-v0') +env.seed(1) # reproducible +env = env.unwrapped + +N_S = env.observation_space.shape[0] +A_BOUND = env.action_space.high + +sess = tf.Session() + +actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND]) +critic = Critic(sess, n_features=N_S, lr=LR_C) + +sess.run(tf.global_variables_initializer()) + +if OUTPUT_GRAPH: + tf.summary.FileWriter("logs/", sess.graph) + +for i_episode in range(MAX_EPISODE): + s = env.reset() + t = 0 + ep_rs = [] + while True: + # if RENDER: + env.render() + a = actor.choose_action(s) + + s_, r, done, info = env.step(a) + r /= 10 + + td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] + actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] + + s = s_ + t += 1 + ep_rs.append(r) + if t > MAX_EP_STEPS: + ep_rs_sum = sum(ep_rs) + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.9 + ep_rs_sum * 0.1 + if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering + print("episode:", i_episode, " reward:", int(running_reward)) + break + diff --git a/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py b/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py new file mode 100644 index 0000000..629eb6e --- /dev/null +++ b/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py @@ -0,0 +1,252 @@ +""" +Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. +DDPG is Actor Critic based algorithm. +Pendulum example. 
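A compressed sketch of the two updates built below (illustration only; the script wires them up as
TensorFlow ops):

GAMMA = 0.9   # same reward discount as below

def critic_target(r, q_next):
    # q_next is Q'(s_, mu'(s_)): the target critic evaluated at the target actor's action
    return r + GAMMA * q_next

# The actor is improved by chaining gradients, d Q(s, a) / d a  *  d mu(s) / d theta,
# which is what Critic.a_grads together with Actor.add_grad_to_graph() build with tf.gradients.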
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import tensorflow as tf +import numpy as np +import gym + +np.random.seed(1) +tf.set_random_seed(1) + +##################### hyper parameters #################### + +MAX_EPISODES = 70 +MAX_EP_STEPS = 400 +LR_A = 0.01 # learning rate for actor +LR_C = 0.01 # learning rate for critic +GAMMA = 0.9 # reward discount +TAU = 0.01 # Soft update for target param, but this is computationally expansive +# so we use replace_iter instead +REPLACE_ITER_A = 500 +REPLACE_ITER_C = 300 +MEMORY_CAPACITY = 7000 +BATCH_SIZE = 32 + +RENDER = False +OUTPUT_GRAPH = True +ENV_NAME = 'Pendulum-v0' + +############################### Actor #################################### + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.3) + init_b = tf.constant_initializer(0.1) + net = tf.layers.dense(s, 30, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', + trainable=trainable) + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + bias_initializer=init_b, name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + # the following method for soft replace target params is computational expansive + # target_params = (1-tau) * target_params + tau * eval_params + # self.sess.run([tf.assign(t, (1 - self.tau) * t + self.tau * e) for t, e in zip(self.t_params, self.e_params)]) + + # instead of above method, I use a hard replacement here + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + # ys = policy; + # xs = policy's parameters; + # self.a_grads = the gradients of the policy to get more Q + # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams + self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.AdamOptimizer(-self.lr / BATCH_SIZE) # (- learning rate) for ascent policy, div to take mean + self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params)) + + +############################### Critic #################################### + +class Critic(object): + def 
__init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss) + + with tf.variable_scope('a_grad'): + self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.1) + init_b = tf.constant_initializer(0.1) + + with tf.variable_scope('l1'): + n_l1 = 30 + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_): + self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_}) + # the following method for soft replace target params is computational expansive + # target_params = (1-tau) * target_params + tau * eval_params + # self.sess.run([tf.assign(t, (1 - self.tau) * t + self.tau * e) for t, e in zip(self.t_params, self.e_params)]) + + # instead of above method, we use a hard replacement here + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + +##################### Memory #################### + +class Memory(object): + def __init__(self, capacity, dims): + self.capacity = capacity + self.data = np.zeros((capacity, dims)) + self.pointer = 0 + + def store_transition(self, s, a, r, s_): + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % self.capacity # replace the old memory with new memory + self.data[index, :] = transition + self.pointer += 1 + + def sample(self, n): + assert self.pointer >= self.capacity, 'Memory has not been fulfilled' + indices = np.random.choice(self.capacity, size=n) + return self.data[indices, :] + + +env = gym.make(ENV_NAME) +env = env.unwrapped +env.seed(1) + +state_dim = env.observation_space.shape[0] +action_dim = env.action_space.shape[0] +action_bound = env.action_space.high + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, action_dim], name='a') +with tf.name_scope('R'): + R = 
tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_') + + +sess = tf.Session() + +# Create actor and critic. +# They are actually connected to each other, details can be seen in tensorboard or in this picture: +actor = Actor(sess, action_dim, action_bound, LR_A, REPLACE_ITER_A) +critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +sess.run(tf.global_variables_initializer()) + +M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1) + +if OUTPUT_GRAPH: + tf.summary.FileWriter("logs/", sess.graph) + +var = 3 # control exploration + +for i in range(MAX_EPISODES): + s = env.reset() + ep_reward = 0 + + for j in range(MAX_EP_STEPS): + + if RENDER: + env.render() + + # Added exploration noise + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration + s_, r, done, info = env.step(a) + + M.store_transition(s, a, r / 10, s_) + + if M.pointer > MEMORY_CAPACITY: + var *= .9995 # decay the action randomness + b_M = M.sample(BATCH_SIZE) + b_s = b_M[:, :state_dim] + b_a = b_M[:, state_dim: state_dim + action_dim] + b_r = b_M[:, -state_dim - 1: -state_dim] + b_s_ = b_M[:, -state_dim:] + + critic.learn(b_s, b_a, b_r, b_s_) + actor.learn(b_s, b_a) + + s = s_ + ep_reward += r + + if j == MAX_EP_STEPS-1: + print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, ) + if ep_reward > -1000: + RENDER = True + break \ No newline at end of file diff --git a/experiments/2D_car/DDPG.py b/experiments/2D_car/DDPG.py new file mode 100644 index 0000000..f91bfe0 --- /dev/null +++ b/experiments/2D_car/DDPG.py @@ -0,0 +1,269 @@ +""" +Environment is a 2D car. +Car has 5 sensors to obtain distance information. + +Car collision => reward = -1, otherwise => reward = 0. + +You can train this RL by using LOAD = False, after training, this model will be store in the a local folder. +Using LOAD = True to reload the trained model for playing. + +You can customize this script in a way you want. 
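Each transition in the replay memory below is stored as one flat row, np.hstack((s, a, [r], s_)),
and sampled batches are split back by column ranges. A small sketch of that layout (dimensions match
the 2D car environment: 5 sensor readings, 1 steering action):

import numpy as np

STATE_DIM, ACTION_DIM = 5, 1
b_M = np.zeros((16, 2 * STATE_DIM + ACTION_DIM + 1))   # dummy batch of 16 rows
b_s  = b_M[:, :STATE_DIM]                              # columns 0..4  -> s
b_a  = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]       # column 5      -> a
b_r  = b_M[:, -STATE_DIM - 1: -STATE_DIM]              # column 6      -> r
b_s_ = b_M[:, -STATE_DIM:]                             # columns 7..11 -> s_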
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +tensorflow >= 1.0.1 +""" + +import tensorflow as tf +import numpy as np +import os +import shutil +from car_env import CarEnv + + +np.random.seed(1) +tf.set_random_seed(1) + +MAX_EPISODES = 225 +MAX_EP_STEPS = 600 +LR_A = 1e-3 # learning rate for actor +LR_C = 1e-3 # learning rate for critic +GAMMA = 0.95 # reward discount +REPLACE_ITER_A = 800 +REPLACE_ITER_C = 700 +MEMORY_CAPACITY = 5000 +BATCH_SIZE = 16 +VAR_MIN = 0.1 +RENDER = True +LOAD = False +DISCRETE_ACTION = False + +env = CarEnv(discrete_action=DISCRETE_ACTION) +STATE_DIM = env.state_dim +ACTION_DIM = env.action_dim +ACTION_BOUND = env.action_bound + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a') +with tf.name_scope('R'): + R = tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_') + + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.001) + net = tf.layers.dense(s, 100, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', + trainable=trainable) + net = tf.layers.dense(net, 20, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE) # (- learning rate) for ascent policy, div to take mean + self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params)) + + +class Critic(object): + def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + 
self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + with tf.variable_scope('a_grad'): + self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.01) + + with tf.variable_scope('l1'): + n_l1 = 100 + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + net = tf.layers.dense(net, 20, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_): + self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + +class Memory(object): + def __init__(self, capacity, dims): + self.capacity = capacity + self.data = np.zeros((capacity, dims)) + self.pointer = 0 + + def store_transition(self, s, a, r, s_): + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % self.capacity # replace the old memory with new memory + self.data[index, :] = transition + self.pointer += 1 + + def sample(self, n): + assert self.pointer >= self.capacity, 'Memory has not been fulfilled' + indices = np.random.choice(self.capacity, size=n) + return self.data[indices, :] + + +sess = tf.Session() + +# Create actor and critic. +actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A) +critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1) + +saver = tf.train.Saver() +path = './discrete' if DISCRETE_ACTION else './continuous' + +if LOAD: + saver.restore(sess, tf.train.latest_checkpoint(path)) +else: + sess.run(tf.global_variables_initializer()) + + +def train(): + var = 2. 
# control exploration + for ep in range(MAX_EPISODES): + s = env.reset() + ep_step = 0 + + for t in range(MAX_EP_STEPS): + # while True: + if RENDER: + env.render() + + # Added exploration noise + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration + s_, r, done = env.step(a) + M.store_transition(s, a, r, s_) + + if M.pointer > MEMORY_CAPACITY: + var = max([var*.9995, VAR_MIN]) # decay the action randomness + b_M = M.sample(BATCH_SIZE) + b_s = b_M[:, :STATE_DIM] + b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM] + b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM] + b_s_ = b_M[:, -STATE_DIM:] + + critic.learn(b_s, b_a, b_r, b_s_) + actor.learn(b_s, b_a) + + s = s_ + ep_step += 1 + + if done or t == MAX_EP_STEPS - 1: + # if done: + print('Ep:', ep, + '| Steps: %i' % int(ep_step), + '| Explore: %.2f' % var, + ) + break + + if os.path.isdir(path): shutil.rmtree(path) + os.mkdir(path) + ckpt_path = os.path.join(path, 'DDPG.ckpt') + save_path = saver.save(sess, ckpt_path, write_meta_graph=False) + print("\nSave Model %s\n" % save_path) + + +def eval(): + env.set_fps(30) + while True: + s = env.reset() + while True: + env.render() + a = actor.choose_action(s) + s_, r, done = env.step(a) + s = s_ + if done: + break + +if __name__ == '__main__': + if LOAD: + eval() + else: + train() \ No newline at end of file diff --git a/experiments/2D_car/car_env.py b/experiments/2D_car/car_env.py new file mode 100644 index 0000000..8a8ed2b --- /dev/null +++ b/experiments/2D_car/car_env.py @@ -0,0 +1,234 @@ +""" +Environment for 2D car driving. +You can customize this script in a way you want. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +""" +import numpy as np +import pyglet + + +pyglet.clock.set_fps_limit(10000) + + +class CarEnv(object): + n_sensor = 5 + action_dim = 1 + state_dim = n_sensor + viewer = None + viewer_xy = (500, 500) + sensor_max = 150. + start_point = [450, 300] + speed = 50. 
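    # Summary of the interface below: _get_state() returns the 5 sensor distances divided by
    # sensor_max, step() takes a single steering action (clipped to [-1, 1] in continuous mode,
    # or one of {-1, 0, 1} in discrete mode) and returns (state, reward, terminal),
    # where reward is -1 on a collision and 0 otherwise.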
+ dt = 0.1 + + def __init__(self, discrete_action=False): + self.is_discrete_action = discrete_action + if discrete_action: + self.actions = [-1, 0, 1] + else: + self.action_bound = [-1, 1] + + self.terminal = False + # node1 (x, y, r, w, l), + self.car_info = np.array([0, 0, 0, 20, 40], dtype=np.float64) # car coordination + self.obstacle_coords = np.array([ + [120, 120], + [380, 120], + [380, 380], + [120, 380], + ]) + self.sensor_info = self.sensor_max + np.zeros((self.n_sensor, 3)) # n sensors, (distance, end_x, end_y) + + def step(self, action): + if self.is_discrete_action: + action = self.actions[action] + else: + action = np.clip(action, *self.action_bound)[0] + self.car_info[2] += action * np.pi/30 # max r = 6 degree + self.car_info[:2] = self.car_info[:2] + \ + self.speed * self.dt * np.array([np.cos(self.car_info[2]), np.sin(self.car_info[2])]) + + self._update_sensor() + s = self._get_state() + r = -1 if self.terminal else 0 + return s, r, self.terminal + + def reset(self): + self.terminal = False + self.car_info[:3] = np.array([*self.start_point, -np.pi/2]) + self._update_sensor() + return self._get_state() + + def render(self): + if self.viewer is None: + self.viewer = Viewer(*self.viewer_xy, self.car_info, self.sensor_info, self.obstacle_coords) + self.viewer.render() + + def sample_action(self): + if self.is_discrete_action: + a = np.random.choice(list(range(3))) + else: + a = np.random.uniform(*self.action_bound, size=self.action_dim) + return a + + def set_fps(self, fps=30): + pyglet.clock.set_fps_limit(fps) + + def _get_state(self): + s = self.sensor_info[:, 0].flatten()/self.sensor_max + return s + + def _update_sensor(self): + cx, cy, rotation = self.car_info[:3] + + n_sensors = len(self.sensor_info) + sensor_theta = np.linspace(-np.pi / 2, np.pi / 2, n_sensors) + xs = cx + (np.zeros((n_sensors, ))+self.sensor_max) * np.cos(sensor_theta) + ys = cy + (np.zeros((n_sensors, ))+self.sensor_max) * np.sin(sensor_theta) + xys = np.array([[x, y] for x, y in zip(xs, ys)]) # shape (5 sensors, 2) + + # sensors + tmp_x = xys[:, 0] - cx + tmp_y = xys[:, 1] - cy + # apply rotation + rotated_x = tmp_x * np.cos(rotation) - tmp_y * np.sin(rotation) + rotated_y = tmp_x * np.sin(rotation) + tmp_y * np.cos(rotation) + # rotated x y + self.sensor_info[:, -2:] = np.vstack([rotated_x+cx, rotated_y+cy]).T + + q = np.array([cx, cy]) + for si in range(len(self.sensor_info)): + s = self.sensor_info[si, -2:] - q + possible_sensor_distance = [self.sensor_max] + possible_intersections = [self.sensor_info[si, -2:]] + + # obstacle collision + for oi in range(len(self.obstacle_coords)): + p = self.obstacle_coords[oi] + r = self.obstacle_coords[(oi + 1) % len(self.obstacle_coords)] - self.obstacle_coords[oi] + if np.cross(r, s) != 0: # may collision + t = np.cross((q - p), s) / np.cross(r, s) + u = np.cross((q - p), r) / np.cross(r, s) + if 0 <= t <= 1 and 0 <= u <= 1: + intersection = q + u * s + possible_intersections.append(intersection) + possible_sensor_distance.append(np.linalg.norm(u*s)) + + # window collision + win_coord = np.array([ + [0, 0], + [self.viewer_xy[0], 0], + [*self.viewer_xy], + [0, self.viewer_xy[1]], + [0, 0], + ]) + for oi in range(4): + p = win_coord[oi] + r = win_coord[(oi + 1) % len(win_coord)] - win_coord[oi] + if np.cross(r, s) != 0: # may collision + t = np.cross((q - p), s) / np.cross(r, s) + u = np.cross((q - p), r) / np.cross(r, s) + if 0 <= t <= 1 and 0 <= u <= 1: + intersection = p + t * r + possible_intersections.append(intersection) + 
possible_sensor_distance.append(np.linalg.norm(intersection - q)) + + distance = np.min(possible_sensor_distance) + distance_index = np.argmin(possible_sensor_distance) + self.sensor_info[si, 0] = distance + self.sensor_info[si, -2:] = possible_intersections[distance_index] + if distance < self.car_info[-1]/2: + self.terminal = True + + +class Viewer(pyglet.window.Window): + color = { + 'background': [1]*3 + [1] + } + fps_display = pyglet.clock.ClockDisplay() + bar_thc = 5 + + def __init__(self, width, height, car_info, sensor_info, obstacle_coords): + super(Viewer, self).__init__(width, height, resizable=False, caption='2D car', vsync=False) # vsync=False to not use the monitor FPS + self.set_location(x=80, y=10) + pyglet.gl.glClearColor(*self.color['background']) + + self.car_info = car_info + self.sensor_info = sensor_info + + self.batch = pyglet.graphics.Batch() + background = pyglet.graphics.OrderedGroup(0) + foreground = pyglet.graphics.OrderedGroup(1) + + self.sensors = [] + line_coord = [0, 0] * 2 + c = (73, 73, 73) * 2 + for i in range(len(self.sensor_info)): + self.sensors.append(self.batch.add(2, pyglet.gl.GL_LINES, foreground, ('v2f', line_coord), ('c3B', c))) + + car_box = [0, 0] * 4 + c = (249, 86, 86) * 4 + self.car = self.batch.add(4, pyglet.gl.GL_QUADS, foreground, ('v2f', car_box), ('c3B', c)) + + c = (134, 181, 244) * 4 + self.obstacle = self.batch.add(4, pyglet.gl.GL_QUADS, background, ('v2f', obstacle_coords.flatten()), ('c3B', c)) + + def render(self): + pyglet.clock.tick() + self._update() + self.switch_to() + self.dispatch_events() + self.dispatch_event('on_draw') + self.flip() + + def on_draw(self): + self.clear() + self.batch.draw() + # self.fps_display.draw() + + def _update(self): + cx, cy, r, w, l = self.car_info + + # sensors + for i, sensor in enumerate(self.sensors): + sensor.vertices = [cx, cy, *self.sensor_info[i, -2:]] + + # car + xys = [ + [cx + l / 2, cy + w / 2], + [cx - l / 2, cy + w / 2], + [cx - l / 2, cy - w / 2], + [cx + l / 2, cy - w / 2], + ] + r_xys = [] + for x, y in xys: + tempX = x - cx + tempY = y - cy + # apply rotation + rotatedX = tempX * np.cos(r) - tempY * np.sin(r) + rotatedY = tempX * np.sin(r) + tempY * np.cos(r) + # rotated x y + x = rotatedX + cx + y = rotatedY + cy + r_xys += [x, y] + self.car.vertices = r_xys + + +if __name__ == '__main__': + np.random.seed(1) + env = CarEnv() + env.set_fps(30) + for ep in range(20): + s = env.reset() + # for t in range(100): + while True: + env.render() + s, r, done = env.step(env.sample_action()) + if done: + break \ No newline at end of file diff --git a/experiments/2D_car/collision.py b/experiments/2D_car/collision.py new file mode 100644 index 0000000..1b77643 --- /dev/null +++ b/experiments/2D_car/collision.py @@ -0,0 +1,57 @@ +import numpy as np + +def intersection(): + p = np.array([0, 0]) + r = np.array([1, 1]) + q = np.array([0.1, 0.1]) + s = np.array([.1, .1]) + + if np.cross(r, s) == 0 and np.cross((q-p), r) == 0: # collinear + # t0 = (q − p) · r / (r · r) + # t1 = (q + s − p) · r / (r · r) = t0 + s · r / (r · r) + t0 = np.dot(q-p, r)/np.dot(r, r) + t1 = t0 + np.dot(s, r)/np.dot(r, r) + print(t1, t0) + if ((np.dot(s, r) > 0) and (0 <= t1 - t0 <= 1)) or ((np.dot(s, r) <= 0) and (0 <= t0 - t1 <= 1)): + print('collinear and overlapping, q_s in p_r') + else: + print('collinear and disjoint') + elif np.cross(r, s) == 0 and np.cross((q-p), r) != 0: # parallel r × s = 0 and (q − p) × r ≠ 0, + print('parallel') + else: + t = np.cross((q - p), s) / np.cross(r, s) + u = np.cross((q - p), r) / 
np.cross(r, s) + if 0 <= t <= 1 and 0 <= u <= 1: + # If r × s ≠ 0 and 0 ≤ t ≤ 1 and 0 ≤ u ≤ 1, the two line segments meet at the point p + t r = q + u s + print('intersection: ', p + t*r) + else: + print('not parallel and not intersect') + + +def point2segment(): + p = np.array([-1, 1]) # coordination of point + a = np.array([0, 1]) # coordination of line segment end 1 + b = np.array([1, 0]) # coordination of line segment end 2 + ab = b-a # line ab + ap = p-a + distance = np.abs(np.cross(ab, ap)/np.linalg.norm(ab)) # d = (AB x AC)/|AB| + print(distance) + + # angle Cos(θ) = A dot B /(|A||B|) + bp = p-b + cosTheta1 = np.dot(ap, ab) / (np.linalg.norm(ap) * np.linalg.norm(ab)) + theta1 = np.arccos(cosTheta1) + cosTheta2 = np.dot(bp, ab) / (np.linalg.norm(bp) * np.linalg.norm(ab)) + theta2 = np.arccos(cosTheta2) + if np.pi/2 <= (theta1 % (np.pi*2)) <= 3/2 * np.pi: + print('out of a') + elif -np.pi/2 <= (theta2 % (np.pi*2)) <= np.pi/2: + print('out of b') + else: + print('between a and b') + + + +if __name__ == '__main__': + point2segment() + # intersection() diff --git a/experiments/Robot_arm/A3C.py b/experiments/Robot_arm/A3C.py new file mode 100644 index 0000000..89150db --- /dev/null +++ b/experiments/Robot_arm/A3C.py @@ -0,0 +1,214 @@ +""" +Environment is a Robot Arm. The arm tries to get to the blue point. +The environment will return a geographic (distance) information for the arm to learn. + +The far away from blue point the less reward; touch blue r+=1; stop at blue for a while then get r=+10. + +You can train this RL by using LOAD = False, after training, this model will be store in the a local folder. +Using LOAD = True to reload the trained model for playing. + +You can customize this script in a way you want. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +tensorflow >= 1.0.1 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +from arm_env import ArmEnv + + +# np.random.seed(1) +# tf.set_random_seed(1) + +MAX_GLOBAL_EP = 2000 +MAX_EP_STEP = 300 +UPDATE_GLOBAL_ITER = 5 +N_WORKERS = multiprocessing.cpu_count() +LR_A = 1e-4 # learning rate for actor +LR_C = 2e-4 # learning rate for critic +GAMMA = 0.9 # reward discount +MODE = ['easy', 'hard'] +n_model = 1 +GLOBAL_NET_SCOPE = 'Global_Net' +ENTROPY_BETA = 0.01 +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + + +env = ArmEnv(mode=MODE[n_model]) +N_S = env.state_dim +N_A = env.action_dim +A_BOUND = env.action_bound +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + self.test = sigma[0] + mu, sigma = mu * A_BOUND[1], sigma + 1e-5 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + 
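                # The policy over the arm's continuous actions is a diagonal Gaussian built from
                # mu and sigma above. The a_loss block that follows maximizes
                # log_prob(a) * TD-advantage plus an ENTROPY_BETA-weighted entropy bonus, and the
                # resulting gradients are pushed onto the *global* network in the 'push' ops below.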
with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.contrib.layers.xavier_initializer() + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='la') + l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='lc') + l_c = tf.layers.dense(l_c, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc2') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net + return t + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + s = s[np.newaxis, :] + return SESS.run(self.A, {self.s: s})[0] + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = ArmEnv(mode=MODE[n_model]) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + for ep_t in range(MAX_EP_STEP): + if self.name == 'W_0': + self.env.render() + a = self.AC.choose_action(s) + s_, r, done = self.env.step(a) + if ep_t == MAX_EP_STEP - 1: done = True + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: 
buffer_v_target, + } + test = self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + '| Var:', test, + + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + diff --git a/experiments/Robot_arm/DDPG.py b/experiments/Robot_arm/DDPG.py new file mode 100644 index 0000000..0eb1b8a --- /dev/null +++ b/experiments/Robot_arm/DDPG.py @@ -0,0 +1,277 @@ +""" +Environment is a Robot Arm. The arm tries to get to the blue point. +The environment will return a geographic (distance) information for the arm to learn. + +The far away from blue point the less reward; touch blue r+=1; stop at blue for a while then get r=+10. + +You can train this RL by using LOAD = False, after training, this model will be store in the a local folder. +Using LOAD = True to reload the trained model for playing. + +You can customize this script in a way you want. 
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +tensorflow >= 1.0.1 +""" + +import tensorflow as tf +import numpy as np +import os +import shutil +from arm_env import ArmEnv + + +np.random.seed(1) +tf.set_random_seed(1) + +MAX_EPISODES = 600 +MAX_EP_STEPS = 200 +LR_A = 1e-4 # learning rate for actor +LR_C = 1e-4 # learning rate for critic +GAMMA = 0.999 # reward discount +REPLACE_ITER_A = 1100 +REPLACE_ITER_C = 1000 +MEMORY_CAPACITY = 10000 +BATCH_SIZE = 16 +VAR_MIN = 0.1 +RENDER = True +LOAD = False +MODE = ['easy', 'hard'] +n_model = 1 + +env = ArmEnv(mode=MODE[n_model]) +STATE_DIM = env.state_dim +ACTION_DIM = env.action_dim +ACTION_BOUND = env.action_bound + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a') +with tf.name_scope('R'): + R = tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_') + + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.001) + net = tf.layers.dense(s, 200, activation=tf.nn.relu6, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', + trainable=trainable) + net = tf.layers.dense(net, 200, activation=tf.nn.relu6, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + net = tf.layers.dense(net, 10, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l3', + trainable=trainable) + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.RMSPropOptimizer(-self.lr / BATCH_SIZE) # (- learning rate) for ascent policy, div to take mean + self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params)) + + +class 
Critic(object): + def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) + + with tf.variable_scope('a_grad'): + self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.contrib.layers.xavier_initializer() + init_b = tf.constant_initializer(0.01) + + with tf.variable_scope('l1'): + n_l1 = 200 + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + net = tf.layers.dense(net, 200, activation=tf.nn.relu6, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', + trainable=trainable) + net = tf.layers.dense(net, 10, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l3', + trainable=trainable) + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_): + self.sess.run(self.train_op, feed_dict={S: s, A: a, R: r, S_: s_}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + +class Memory(object): + def __init__(self, capacity, dims): + self.capacity = capacity + self.data = np.zeros((capacity, dims)) + self.pointer = 0 + + def store_transition(self, s, a, r, s_): + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % self.capacity # replace the old memory with new memory + self.data[index, :] = transition + self.pointer += 1 + + def sample(self, n): + assert self.pointer >= self.capacity, 'Memory has not been fulfilled' + indices = np.random.choice(self.capacity, size=n) + return self.data[indices, :] + + +sess = tf.Session() + +# Create actor and critic. 
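+# The wiring below implements the deterministic policy gradient update: the critic exposes
+# a_grads = dQ(s, a)/da (tf.gradients(self.q, A)), and actor.add_grad_to_graph feeds it into
+# tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads), which by the chain rule yields
+# dQ/dtheta = dQ/da * da/dtheta for the actor's parameters. Because the actor's optimizer is
+# built with a negative learning rate (-LR_A / BATCH_SIZE), applying these "gradients"
+# performs gradient ascent on Q, averaged over the batch.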
+actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A) +critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1) + +saver = tf.train.Saver() +path = './'+MODE[n_model] + +if LOAD: + saver.restore(sess, tf.train.latest_checkpoint(path)) +else: + sess.run(tf.global_variables_initializer()) + + +def train(): + var = 2. # control exploration + + for ep in range(MAX_EPISODES): + s = env.reset() + ep_reward = 0 + + for t in range(MAX_EP_STEPS): + # while True: + if RENDER: + env.render() + + # Added exploration noise + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration + s_, r, done = env.step(a) + M.store_transition(s, a, r, s_) + + if M.pointer > MEMORY_CAPACITY: + var = max([var*.99995, VAR_MIN]) # decay the action randomness + b_M = M.sample(BATCH_SIZE) + b_s = b_M[:, :STATE_DIM] + b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM] + b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM] + b_s_ = b_M[:, -STATE_DIM:] + + critic.learn(b_s, b_a, b_r, b_s_) + actor.learn(b_s, b_a) + + s = s_ + ep_reward += r + + if t == MAX_EP_STEPS-1 or done: + # if done: + result = '| done' if done else '| ----' + print('Ep:', ep, + result, + '| R: %i' % int(ep_reward), + '| Explore: %.2f' % var, + ) + break + + if os.path.isdir(path): shutil.rmtree(path) + os.mkdir(path) + ckpt_path = os.path.join('./'+MODE[n_model], 'DDPG.ckpt') + save_path = saver.save(sess, ckpt_path, write_meta_graph=False) + print("\nSave Model %s\n" % save_path) + + +def eval(): + env.set_fps(30) + s = env.reset() + while True: + if RENDER: + env.render() + a = actor.choose_action(s) + s_, r, done = env.step(a) + s = s_ + +if __name__ == '__main__': + if LOAD: + eval() + else: + train() \ No newline at end of file diff --git a/experiments/Robot_arm/arm_env.py b/experiments/Robot_arm/arm_env.py new file mode 100644 index 0000000..a0eb0fd --- /dev/null +++ b/experiments/Robot_arm/arm_env.py @@ -0,0 +1,218 @@ +""" +Environment for Robot Arm. +You can customize this script in a way you want. 
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + + +Requirement: +pyglet >= 1.2.4 +numpy >= 1.12.1 +""" +import numpy as np +import pyglet + + +pyglet.clock.set_fps_limit(10000) + + +class ArmEnv(object): + action_bound = [-1, 1] + action_dim = 2 + state_dim = 7 + dt = .1 # refresh rate + arm1l = 100 + arm2l = 100 + viewer = None + viewer_xy = (400, 400) + get_point = False + mouse_in = np.array([False]) + point_l = 15 + grab_counter = 0 + + def __init__(self, mode='easy'): + # node1 (l, d_rad, x, y), + # node2 (l, d_rad, x, y) + self.mode = mode + self.arm_info = np.zeros((2, 4)) + self.arm_info[0, 0] = self.arm1l + self.arm_info[1, 0] = self.arm2l + self.point_info = np.array([250, 303]) + self.point_info_init = self.point_info.copy() + self.center_coord = np.array(self.viewer_xy)/2 + + def step(self, action): + # action = (node1 angular v, node2 angular v) + action = np.clip(action, *self.action_bound) + self.arm_info[:, 1] += action * self.dt + self.arm_info[:, 1] %= np.pi * 2 + + arm1rad = self.arm_info[0, 1] + arm2rad = self.arm_info[1, 1] + arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)]) + arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)]) + self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1) + self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2) + + s, arm2_distance = self._get_state() + r = self._r_func(arm2_distance) + + return s, r, self.get_point + + def reset(self): + self.get_point = False + self.grab_counter = 0 + + if self.mode == 'hard': + pxy = np.clip(np.random.rand(2) * self.viewer_xy[0], 100, 300) + self.point_info[:] = pxy + else: + arm1rad, arm2rad = np.random.rand(2) * np.pi * 2 + self.arm_info[0, 1] = arm1rad + self.arm_info[1, 1] = arm2rad + arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)]) + arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)]) + self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1) + self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2) + + self.point_info[:] = self.point_info_init + return self._get_state()[0] + + def render(self): + if self.viewer is None: + self.viewer = Viewer(*self.viewer_xy, self.arm_info, self.point_info, self.point_l, self.mouse_in) + self.viewer.render() + + def sample_action(self): + return np.random.uniform(*self.action_bound, size=self.action_dim) + + def set_fps(self, fps=30): + pyglet.clock.set_fps_limit(fps) + + def _get_state(self): + # return the distance (dx, dy) between arm finger point with blue point + arm_end = self.arm_info[:, 2:4] + t_arms = np.ravel(arm_end - self.point_info) + center_dis = (self.center_coord - self.point_info)/200 + in_point = 1 if self.grab_counter > 0 else 0 + return np.hstack([in_point, t_arms/200, center_dis, + # arm1_distance_p, arm1_distance_b, + ]), t_arms[-2:] + + def _r_func(self, distance): + t = 50 + abs_distance = np.sqrt(np.sum(np.square(distance))) + r = -abs_distance/200 + if abs_distance < self.point_l and (not self.get_point): + r += 1. + self.grab_counter += 1 + if self.grab_counter > t: + r += 10. 
+ self.get_point = True + elif abs_distance > self.point_l: + self.grab_counter = 0 + self.get_point = False + return r + + +class Viewer(pyglet.window.Window): + color = { + 'background': [1]*3 + [1] + } + fps_display = pyglet.clock.ClockDisplay() + bar_thc = 5 + + def __init__(self, width, height, arm_info, point_info, point_l, mouse_in): + super(Viewer, self).__init__(width, height, resizable=False, caption='Arm', vsync=False) # vsync=False to not use the monitor FPS + self.set_location(x=80, y=10) + pyglet.gl.glClearColor(*self.color['background']) + + self.arm_info = arm_info + self.point_info = point_info + self.mouse_in = mouse_in + self.point_l = point_l + + self.center_coord = np.array((min(width, height)/2, ) * 2) + self.batch = pyglet.graphics.Batch() + + arm1_box, arm2_box, point_box = [0]*8, [0]*8, [0]*8 + c1, c2, c3 = (249, 86, 86)*4, (86, 109, 249)*4, (249, 39, 65)*4 + self.point = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', point_box), ('c3B', c2)) + self.arm1 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm1_box), ('c3B', c1)) + self.arm2 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm2_box), ('c3B', c1)) + + def render(self): + pyglet.clock.tick() + self._update_arm() + self.switch_to() + self.dispatch_events() + self.dispatch_event('on_draw') + self.flip() + + def on_draw(self): + self.clear() + self.batch.draw() + # self.fps_display.draw() + + def _update_arm(self): + point_l = self.point_l + point_box = (self.point_info[0] - point_l, self.point_info[1] - point_l, + self.point_info[0] + point_l, self.point_info[1] - point_l, + self.point_info[0] + point_l, self.point_info[1] + point_l, + self.point_info[0] - point_l, self.point_info[1] + point_l) + self.point.vertices = point_box + + arm1_coord = (*self.center_coord, *(self.arm_info[0, 2:4])) # (x0, y0, x1, y1) + arm2_coord = (*(self.arm_info[0, 2:4]), *(self.arm_info[1, 2:4])) # (x1, y1, x2, y2) + arm1_thick_rad = np.pi / 2 - self.arm_info[0, 1] + x01, y01 = arm1_coord[0] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] + np.sin( + arm1_thick_rad) * self.bar_thc + x02, y02 = arm1_coord[0] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] - np.sin( + arm1_thick_rad) * self.bar_thc + x11, y11 = arm1_coord[2] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] - np.sin( + arm1_thick_rad) * self.bar_thc + x12, y12 = arm1_coord[2] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] + np.sin( + arm1_thick_rad) * self.bar_thc + arm1_box = (x01, y01, x02, y02, x11, y11, x12, y12) + arm2_thick_rad = np.pi / 2 - self.arm_info[1, 1] + x11_, y11_ = arm2_coord[0] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] - np.sin( + arm2_thick_rad) * self.bar_thc + x12_, y12_ = arm2_coord[0] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] + np.sin( + arm2_thick_rad) * self.bar_thc + x21, y21 = arm2_coord[2] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] + np.sin( + arm2_thick_rad) * self.bar_thc + x22, y22 = arm2_coord[2] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] - np.sin( + arm2_thick_rad) * self.bar_thc + arm2_box = (x11_, y11_, x12_, y12_, x21, y21, x22, y22) + self.arm1.vertices = arm1_box + self.arm2.vertices = arm2_box + + def on_key_press(self, symbol, modifiers): + if symbol == pyglet.window.key.UP: + self.arm_info[0, 1] += .1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.DOWN: + self.arm_info[0, 1] -= .1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.LEFT: + self.arm_info[1, 1] += 
.1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.RIGHT: + self.arm_info[1, 1] -= .1 + print(self.arm_info[:, 2:4] - self.point_info) + elif symbol == pyglet.window.key.Q: + pyglet.clock.set_fps_limit(1000) + elif symbol == pyglet.window.key.A: + pyglet.clock.set_fps_limit(30) + + def on_mouse_motion(self, x, y, dx, dy): + self.point_info[:] = [x, y] + + def on_mouse_enter(self, x, y): + self.mouse_in[0] = True + + def on_mouse_leave(self, x, y): + self.mouse_in[0] = False + + + diff --git a/experiments/Solve_BipedalWalker/A3C.py b/experiments/Solve_BipedalWalker/A3C.py new file mode 100644 index 0000000..7f4bc45 --- /dev/null +++ b/experiments/Solve_BipedalWalker/A3C.py @@ -0,0 +1,209 @@ +""" +Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning. + +The BipedalWalker example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil + + +GAME = 'BipedalWalker-v2' +OUTPUT_GRAPH = False +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 8000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 10 +GAMMA = 0.999 +ENTROPY_BETA = 0.005 +LR_A = 0.00002 # learning rate for actor +LR_C = 0.0001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net() + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net() + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + self.test = sigma[0] + mu, sigma = mu * A_BOUND[1], sigma + 1e-5 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = 
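+# Each Worker below owns its own gym environment and its own local ACNet; only GLOBAL_AC's
+# parameters are ever trained directly. The worker threads run Worker.work() concurrently:
+# every UPDATE_GLOBAL_ITER steps (or at episode end) a worker pushes its local gradients to
+# the global net through OPT_A/OPT_C.apply_gradients (the 'push' ops) and then copies the
+# fresh global weights back with the 'pull' assign ops.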
[l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self): + w_init = tf.contrib.layers.xavier_initializer() + with tf.variable_scope('actor'): + l_a = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='la') + l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') + with tf.variable_scope('critic'): + l_c = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='lc') + l_c = tf.layers.dense(l_c, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc2') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net + return t + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s): # run by a local + s = s[np.newaxis, :] + return SESS.run(self.A, {self.s: s})[0] + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + while True: + if self.name == 'W_0' and total_step % 30 == 0: + self.env.render() + a = self.AC.choose_action(s) + s_, r, done, info = self.env.step(a) + if r == -100: r = -2 + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + } + test = self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + + s = s_ + total_step += 1 + if done: + achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------' + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + achieve, + "| Pos: %i" % self.env.unwrapped.hull.position[0], + "| RR: %.1f" % GLOBAL_RUNNING_R[-1], + '| EpR: %.1f' % ep_r, + '| var:', test, + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + 
for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + diff --git a/experiments/Solve_BipedalWalker/A3C_rnn.py b/experiments/Solve_BipedalWalker/A3C_rnn.py new file mode 100644 index 0000000..acdc951 --- /dev/null +++ b/experiments/Solve_BipedalWalker/A3C_rnn.py @@ -0,0 +1,235 @@ +""" +Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning. + +The BipedalWalker example. + +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil + + +GAME = 'BipedalWalker-v2' +OUTPUT_GRAPH = False +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 8000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 10 +GAMMA = 0.99 +ENTROPY_BETA = 0.005 +LR_A = 0.00001 # learning rate for actor +LR_C = 0.0001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.shape[0] +A_BOUND = [env.action_space.low, env.action_space.high] +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net(N_A) + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + mu, sigma, self.v = self._build_net(N_A) + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('wrap_a_out'): + self.test = sigma[0] + mu, sigma = mu * A_BOUND[1], sigma + 1e-5 + + normal_dist = tf.contrib.distributions.Normal(mu, sigma) + + with tf.name_scope('a_loss'): + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('choose_a'): # use local params to choose action + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), A_BOUND[0], A_BOUND[1]) + + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in + zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in + zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + 
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self, n_a): + w_init = tf.random_normal_initializer(0., .01) + with tf.variable_scope('critic'): # only critic controls the rnn update + cell_size = 128 + s = tf.expand_dims(self.s, axis=1, + name='timely_input') # [time_step, feature] => [time_step, batch, feature] + rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size) + self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32) + outputs, self.final_state = tf.nn.dynamic_rnn( + cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True) + cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation + l_c = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + + with tf.variable_scope('actor'): # state representation is based on critic + cell_out = tf.stop_gradient(cell_out, name='c_cell_out') # from what critic think it is + l_a = tf.layers.dense(cell_out, 400, tf.nn.relu6, kernel_initializer=w_init, name='la') + mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') + sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') # restrict variance + return mu, sigma, v + + def update_global(self, feed_dict): # run by a local + _, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net + return t + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s, cell_state): # run by a local + s = s[np.newaxis, :] + a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state}) + return a[0], cell_state + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning + keep_state = rnn_state.copy() # keep rnn state for updating global net + while True: + if self.name == 'W_0' and total_step % 30 == 0: + self.env.render() + + a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state + s_, r, done, info = self.env.step(a) + if r == -100: r = -2 + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[ + 0, 0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack( + buffer_v_target) + + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + self.AC.init_state: keep_state, + } + + test = self.AC.update_global(feed_dict) + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + keep_state = 
rnn_state_.copy() # replace the keep_state as the new initial rnn state_ + + s = s_ + rnn_state = rnn_state_ # renew rnn state + total_step += 1 + + if done: + achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------' + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) + print( + self.name, + "Ep:", GLOBAL_EP, + achieve, + "| Pos: %i" % self.env.unwrapped.hull.position[0], + "| RR: %.1f" % GLOBAL_RUNNING_R[-1], + '| EpR: %.1f' % ep_r, + '| var:', test, + ) + GLOBAL_EP += 1 + break + + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA', decay=0.95) + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC', decay=0.95) + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + t = threading.Thread(target=worker.work) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) diff --git a/experiments/Solve_BipedalWalker/DDPG.py b/experiments/Solve_BipedalWalker/DDPG.py new file mode 100644 index 0000000..9f0a824 --- /dev/null +++ b/experiments/Solve_BipedalWalker/DDPG.py @@ -0,0 +1,390 @@ +import tensorflow as tf +import numpy as np +import gym +import os +import shutil + +np.random.seed(1) +tf.set_random_seed(1) + +MAX_EPISODES = 2000 +LR_A = 0.0005 # learning rate for actor +LR_C = 0.0005 # learning rate for critic +GAMMA = 0.999 # reward discount +REPLACE_ITER_A = 1700 +REPLACE_ITER_C = 1500 +MEMORY_CAPACITY = 200000 +BATCH_SIZE = 32 +DISPLAY_THRESHOLD = 100 # display until the running reward > 100 +DATA_PATH = './data' +LOAD_MODEL = False +SAVE_MODEL_ITER = 100000 +RENDER = False +OUTPUT_GRAPH = False +ENV_NAME = 'BipedalWalker-v2' + +GLOBAL_STEP = tf.Variable(0, trainable=False) +INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1)) +LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True) +LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True) +END_POINT = (200 - 10) * (14/30) # from game + +env = gym.make(ENV_NAME) +env.seed(1) + +STATE_DIM = env.observation_space.shape[0] # 24 +ACTION_DIM = env.action_space.shape[0] # 4 +ACTION_BOUND = env.action_space.high # [1, 1, 1, 1] + +# all placeholder for tf +with tf.name_scope('S'): + S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s') +with tf.name_scope('A'): + A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a') +with tf.name_scope('R'): + R = tf.placeholder(tf.float32, [None, 1], name='r') +with tf.name_scope('S_'): + S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_') + +############################### Actor #################################### + +class Actor(object): + def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter): + self.sess = sess + self.a_dim = action_dim + self.action_bound = action_bound + self.lr = learning_rate + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Actor'): + # input s, output a + self.a = self._build_net(S, 
scope='eval_net', trainable=True) + + # input s_, output a, get a_ for critic + self.a_ = self._build_net(S_, scope='target_net', trainable=False) + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net') + + def _build_net(self, s, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.01) + init_b = tf.constant_initializer(0.01) + net = tf.layers.dense(s, 500, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable) + net = tf.layers.dense(net, 200, activation=tf.nn.relu, + kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable) + + with tf.variable_scope('a'): + actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w, + bias_initializer=init_b, name='a', trainable=trainable) + scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound + return scaled_a + + def learn(self, s, a): # batch update + self.sess.run(self.train_op, feed_dict={S: s, A: a}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + + def choose_action(self, s): + s = s[np.newaxis, :] # single state + return self.sess.run(self.a, feed_dict={S: s})[0] # single action + + def add_grad_to_graph(self, a_grads): + with tf.variable_scope('policy_grads'): + # ys = policy; + # xs = policy's parameters; + # self.a_grads = the gradients of the policy to get more Q + # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams + self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads) + + with tf.variable_scope('A_train'): + opt = tf.train.AdamOptimizer(-self.lr/BATCH_SIZE) # (- learning rate) for ascent policy + self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params), global_step=GLOBAL_STEP) + + +############################### Critic #################################### + +class Critic(object): + def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a_): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.lr = learning_rate + self.gamma = gamma + self.t_replace_iter = t_replace_iter + self.t_replace_counter = 0 + + with tf.variable_scope('Critic'): + # Input (s, a), output q + self.q = self._build_net(S, A, 'eval_net', trainable=True) + + # Input (s_, a_), output q_ for q_target + self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net + + self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net') + self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net') + + with tf.variable_scope('target_q'): + self.target_q = R + self.gamma * self.q_ + + with tf.variable_scope('abs_TD'): + self.abs_td = tf.abs(self.target_q - self.q) + self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') + with tf.variable_scope('TD_error'): + self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.target_q, self.q)) + + with tf.variable_scope('C_train'): + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP) + + with tf.variable_scope('a_grad'): + 
self.a_grads = tf.gradients(self.q, A)[0] # tensor of gradients of each sample (None, a_dim) + + def _build_net(self, s, a, scope, trainable): + with tf.variable_scope(scope): + init_w = tf.random_normal_initializer(0., 0.01) + init_b = tf.constant_initializer(0.01) + + with tf.variable_scope('l1'): + n_l1 = 700 + # combine the action and states together in this way + w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable) + w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable) + b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable) + net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) + with tf.variable_scope('l2'): + net = tf.layers.dense(net, 20, activation=tf.nn.relu, kernel_initializer=init_w, + bias_initializer=init_b, name='l2', trainable=trainable) + with tf.variable_scope('q'): + q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a) + return q + + def learn(self, s, a, r, s_, ISW): + _, abs_td = self.sess.run([self.train_op, self.abs_td], feed_dict={S: s, A: a, R: r, S_: s_, self.ISWeights: ISW}) + if self.t_replace_counter % self.t_replace_iter == 0: + self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]) + self.t_replace_counter += 1 + return abs_td + + +class SumTree(object): + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/SumTree.py + + Story the data with it priority in tree and data frameworks. + """ + data_pointer = 0 + + def __init__(self, capacity): + self.capacity = capacity # for all priority values + self.tree = np.zeros(2 * capacity - 1)+1e-5 + # [--------------Parent nodes-------------][-------leaves to recode priority-------] + # size: capacity - 1 size: capacity + self.data = np.zeros(capacity, dtype=object) # for all transitions + # [--------------data frame-------------] + # size: capacity + + def add_new_priority(self, p, data): + leaf_idx = self.data_pointer + self.capacity - 1 + + self.data[self.data_pointer] = data # update data_frame + self.update(leaf_idx, p) # update tree_frame + self.data_pointer += 1 + if self.data_pointer >= self.capacity: # replace when exceed the capacity + self.data_pointer = 0 + + def update(self, tree_idx, p): + change = p - self.tree[tree_idx] + + self.tree[tree_idx] = p + self._propagate_change(tree_idx, change) + + def _propagate_change(self, tree_idx, change): + """change the sum of priority value in all parent nodes""" + parent_idx = (tree_idx - 1) // 2 + self.tree[parent_idx] += change + if parent_idx != 0: + self._propagate_change(parent_idx, change) + + def get_leaf(self, lower_bound): + leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound + data_idx = leaf_idx - self.capacity + 1 + return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]] + + def _retrieve(self, lower_bound, parent_idx=0): + """ + Tree structure and array storage: + + Tree index: + 0 -> storing priority sum + / \ + 1 2 + / \ / \ + 3 4 5 6 -> storing priority for transitions + + Array type for storing: + [0,1,2,3,4,5,6] + """ + left_child_idx = 2 * parent_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= len(self.tree): # end search when no more child + return parent_idx + + if self.tree[left_child_idx] == self.tree[right_child_idx]: + return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx])) + if 
lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node + return self._retrieve(lower_bound, left_child_idx) + else: + return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx) + + @property + def root_priority(self): + return self.tree[0] # the root + + +class Memory(object): # stored as ( s, a, r, s_ ) in SumTree + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py + """ + epsilon = 0.001 # small amount to avoid zero priority + alpha = 0.6 # [0~1] convert the importance of TD error to priority + beta = 0.4 # importance-sampling, from initial value increasing to 1 + beta_increment_per_sampling = 1e-5 # annealing the bias + abs_err_upper = 1 # for stability refer to paper + + def __init__(self, capacity): + self.tree = SumTree(capacity) + + def store(self, error, transition): + p = self._get_priority(error) + self.tree.add_new_priority(p, transition) + + def prio_sample(self, n): + batch_idx, batch_memory, ISWeights = [], [], [] + segment = self.tree.root_priority / n + self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1 + + min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority + maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights + for i in range(n): + a = segment * i + b = segment * (i + 1) + lower_bound = np.random.uniform(a, b) + while True: + idx, p, data = self.tree.get_leaf(lower_bound) + if type(data) is int: + i -= 1 + lower_bound = np.random.uniform(segment * i, segment * (i+1)) + else: + break + prob = p / self.tree.root_priority + ISWeights.append(self.tree.capacity * prob) + batch_idx.append(idx) + batch_memory.append(data) + + ISWeights = np.vstack(ISWeights) + ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize + return batch_idx, np.vstack(batch_memory), ISWeights + + def random_sample(self, n): + idx = np.random.randint(0, self.tree.capacity, size=n, dtype=np.int) + return np.vstack(self.tree.data[idx]) + + def update(self, idx, error): + p = self._get_priority(error) + self.tree.update(idx, p) + + def _get_priority(self, error): + error += self.epsilon # avoid 0 + clipped_error = np.clip(error, 0, self.abs_err_upper) + return np.power(clipped_error, self.alpha) + + +sess = tf.Session() + +# Create actor and critic. +actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A) +critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a_) +actor.add_grad_to_graph(critic.a_grads) + +M = Memory(MEMORY_CAPACITY) + +saver = tf.train.Saver(max_to_keep=100) + +if LOAD_MODEL: + all_ckpt = tf.train.get_checkpoint_state('./data', 'checkpoint').all_model_checkpoint_paths + saver.restore(sess, all_ckpt[-1]) +else: + if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH) + os.mkdir(DATA_PATH) + sess.run(tf.global_variables_initializer()) + +if OUTPUT_GRAPH: + tf.summary.FileWriter('logs', graph=sess.graph) + +var = 3 # control exploration +var_min = 0.01 + +for i_episode in range(MAX_EPISODES): + # s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.) 
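+# Note on the prioritized replay used in this loop: new transitions are stored with the
+# current maximum leaf priority so they tend to be replayed soon, and Memory.prio_sample
+# splits the total priority mass into BATCH_SIZE equal segments and draws one leaf per
+# segment, approximating sampling with probability P(i) = p_i / sum_k p_k. The returned
+# ISWeights, (capacity * P(i)) ** (-beta) normalized by the largest weight, rescale the
+# critic's TD loss to correct the bias of non-uniform sampling (beta is annealed toward 1).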
+ s = env.reset() + ep_r = 0 + while True: + if RENDER: + env.render() + a = actor.choose_action(s) + a = np.clip(np.random.normal(a, var), -1, 1) # add randomness to action selection for exploration + s_, r, done, _ = env.step(a) # r = total 300+ points up to the far end. If the robot falls, it gets -100. + + if r == -100: r = -2 + ep_r += r + + transition = np.hstack((s, a, [r], s_)) + max_p = np.max(M.tree.tree[-M.tree.capacity:]) + M.store(max_p, transition) + + if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY/20: + var = max([var*0.9999, var_min]) # decay the action randomness + tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE) # for critic update + b_s = b_M[:, :STATE_DIM] + b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM] + b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM] + b_s_ = b_M[:, -STATE_DIM:] + + abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights) + actor.learn(b_s, b_a) + for i in range(len(tree_idx)): # update priority + idx = tree_idx[i] + M.update(idx, abs_td[i]) + if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0: + ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt') + save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False) + print("\nSave Model %s\n" % save_path) + + if done: + if "running_r" not in globals(): + running_r = ep_r + else: + running_r = 0.95*running_r + 0.05*ep_r + if running_r > DISPLAY_THRESHOLD: RENDER = True + else: RENDER = False + + done = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----' + print('Episode:', i_episode, + done, + '| Running_r: %i' % int(running_r), + '| Epi_r: %.2f' % ep_r, + '| Exploration: %.3f' % var, + '| Pos: %.i' % int(env.unwrapped.hull.position[0]), + '| LR_A: %.6f' % sess.run(LR_A), + '| LR_C: %.6f' % sess.run(LR_C), + ) + break + + s = s_ + sess.run(INCREASE_GS) \ No newline at end of file diff --git a/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan b/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan new file mode 100644 index 0000000..7746ab0 Binary files /dev/null and b/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan differ diff --git a/experiments/Solve_LunarLander/A3C.py b/experiments/Solve_LunarLander/A3C.py new file mode 100644 index 0000000..a57a547 --- /dev/null +++ b/experiments/Solve_LunarLander/A3C.py @@ -0,0 +1,224 @@ +""" +Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning. + +The LunarLander example. This is a difficult environment, so convergence is not guaranteed; this code converges only slowly, if at all.
+ +View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/ + +Using: +tensorflow 1.0 +gym 0.8.0 +""" + +import multiprocessing +import threading +import tensorflow as tf +import numpy as np +import gym +import os +import shutil +import matplotlib.pyplot as plt + + +GAME = 'LunarLander-v2' +OUTPUT_GRAPH = False +LOG_DIR = './log' +N_WORKERS = multiprocessing.cpu_count() +MAX_GLOBAL_EP = 5000 +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 5 +GAMMA = 0.99 +ENTROPY_BETA = 0.001 # not useful in this case +LR_A = 0.0005 # learning rate for actor +LR_C = 0.001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 + +env = gym.make(GAME) + +N_S = env.observation_space.shape[0] +N_A = env.action_space.n +del env + + +class ACNet(object): + def __init__(self, scope, globalAC=None): + if scope == GLOBAL_NET_SCOPE: # get global network + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self._build_net(N_A) + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + else: # local net, calculate losses + with tf.variable_scope(scope): + self.s = tf.placeholder(tf.float32, [None, N_S], 'S') + self.a_his = tf.placeholder(tf.int32, [None, ], 'A') + self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') + + self.a_prob, self.v = self._build_net(N_A) + + td = tf.subtract(self.v_target, self.v, name='TD_error') + with tf.name_scope('c_loss'): + self.c_loss = tf.reduce_mean(tf.square(td)) + + with tf.name_scope('a_loss'): + log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True) + exp_v = log_prob * td + entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob), axis=1, keep_dims=True) # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + + with tf.name_scope('local_grad'): + self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') + self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') + self.a_grads = tf.gradients(self.a_loss, self.a_params) + self.c_grads = tf.gradients(self.c_loss, self.c_params) + + with tf.name_scope('sync'): + with tf.name_scope('pull'): + self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] + self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] + with tf.name_scope('push'): + self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) + self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) + + def _build_net(self, n_a): + w_init = tf.random_normal_initializer(0., .01) + with tf.variable_scope('critic'): + cell_size = 64 + s = tf.expand_dims(self.s, axis=1, + name='timely_input') # [time_step, feature] => [time_step, batch, feature] + rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size) + self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32) + outputs, self.final_state = tf.nn.dynamic_rnn( + cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True) + cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation + l_c = tf.layers.dense(cell_out, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc') + v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value + with 
tf.variable_scope('actor'): + cell_out = tf.stop_gradient(cell_out, name='c_cell_out') + l_a = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='la') + a_prob = tf.layers.dense(l_a, n_a, tf.nn.softmax, kernel_initializer=w_init, name='ap') + + return a_prob, v + + def update_global(self, feed_dict): # run by a local + SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net + + def pull_global(self): # run by a local + SESS.run([self.pull_a_params_op, self.pull_c_params_op]) + + def choose_action(self, s, cell_state): # run by a local + prob_weights, cell_state = SESS.run([self.a_prob, self.final_state], feed_dict={self.s: s[np.newaxis, :], + self.init_state: cell_state}) + action = np.random.choice(range(prob_weights.shape[1]), + p=prob_weights.ravel()) # select action w.r.t the actions prob + return action, cell_state + + +class Worker(object): + def __init__(self, name, globalAC): + self.env = gym.make(GAME) + self.name = name + self.AC = ACNet(name, globalAC) + + def work(self): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + r_scale = 100 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + ep_t = 0 + rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning + keep_state = rnn_state.copy() # keep rnn state for updating global net + while True: + # if self.name == 'W_0' and total_step % 10 == 0: + # self.env.render() + a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state + s_, r, done, info = self.env.step(a) + if r == -100: r = -10 + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r/r_scale) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + if done: + v_s_ = 0 # terminal + else: + v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0,0] + buffer_v_target = [] + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target) + feed_dict = { + self.AC.s: buffer_s, + self.AC.a_his: buffer_a, + self.AC.v_target: buffer_v_target, + self.AC.init_state: keep_state, + } + + self.AC.update_global(feed_dict) + + buffer_s, buffer_a, buffer_r = [], [], [] + self.AC.pull_global() + keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_ + + s = s_ + total_step += 1 + rnn_state = rnn_state_ # renew rnn state + ep_t += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r) + if not self.env.unwrapped.lander.awake: solve = '| Landed' + else: solve = '| ------' + print( + self.name, + "Ep:", GLOBAL_EP, + solve, + "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], + ) + GLOBAL_EP += 1 + break + +if __name__ == "__main__": + SESS = tf.Session() + + with tf.device("/cpu:0"): + OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') + OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'W_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + SESS.run(tf.global_variables_initializer()) + + 
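+# The v_target computation in Worker.work above builds bootstrapped n-step returns by
+# scanning the reward buffer backwards: starting from v_s_ = V(s_) at the cut point (or 0 at
+# a terminal state), each step does v_s_ = r + GAMMA * v_s_. For example (illustrative
+# numbers only), with buffer_r = [1, 0, 2], GAMMA = 0.99 and bootstrap value 5, the backward
+# scan produces 2 + 0.99*5, then 0 + 0.99*(2 + 0.99*5), then 1 + 0.99*(0 + 0.99*(2 + 0.99*5)),
+# and the final reverse() lines buffer_v_target up with buffer_s and buffer_a again.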
if OUTPUT_GRAPH: + if os.path.exists(LOG_DIR): + shutil.rmtree(LOG_DIR) + tf.summary.FileWriter(LOG_DIR, SESS.graph) + + worker_threads = [] + for worker in workers: + job = lambda: worker.work() + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('step') + plt.ylabel('Total moving reward') + plt.show() diff --git a/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py b/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py new file mode 100644 index 0000000..3d6ed1b --- /dev/null +++ b/experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py @@ -0,0 +1,307 @@ +""" +The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) + +View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ + +Using: +Tensorflow: 1.0 +""" + +import numpy as np +import tensorflow as tf + +np.random.seed(1) +tf.set_random_seed(1) + + +class SumTree(object): + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/SumTree.py + + Story the data with it priority in tree and data frameworks. + """ + data_pointer = 0 + + def __init__(self, capacity): + self.capacity = capacity # for all priority values + self.tree = np.zeros(2 * capacity - 1) + # [--------------Parent nodes-------------][-------leaves to recode priority-------] + # size: capacity - 1 size: capacity + self.data = np.zeros(capacity, dtype=object) # for all transitions + # [--------------data frame-------------] + # size: capacity + + def add_new_priority(self, p, data): + leaf_idx = self.data_pointer + self.capacity - 1 + + self.data[self.data_pointer] = data # update data_frame + self.update(leaf_idx, p) # update tree_frame + self.data_pointer += 1 + if self.data_pointer >= self.capacity: # replace when exceed the capacity + self.data_pointer = 0 + + def update(self, tree_idx, p): + change = p - self.tree[tree_idx] + + self.tree[tree_idx] = p + self._propagate_change(tree_idx, change) + + def _propagate_change(self, tree_idx, change): + """change the sum of priority value in all parent nodes""" + parent_idx = (tree_idx - 1) // 2 + self.tree[parent_idx] += change + if parent_idx != 0: + self._propagate_change(parent_idx, change) + + def get_leaf(self, lower_bound): + leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound + data_idx = leaf_idx - self.capacity + 1 + return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]] + + def _retrieve(self, lower_bound, parent_idx=0): + """ + Tree structure and array storage: + + Tree index: + 0 -> storing priority sum + / \ + 1 2 + / \ / \ + 3 4 5 6 -> storing priority for transitions + + Array type for storing: + [0,1,2,3,4,5,6] + """ + left_child_idx = 2 * parent_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= len(self.tree): # end search when no more child + return parent_idx + + if self.tree[left_child_idx] == self.tree[right_child_idx]: + return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx])) + if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node + return self._retrieve(lower_bound, left_child_idx) + else: + return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx) + + @property + def root_priority(self): + return self.tree[0] # the root + + +class Memory(object): # stored as ( s, a, r, s_ ) 
in SumTree + """ + This SumTree code is modified version and the original code is from: + https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py + """ + epsilon = 0.001 # small amount to avoid zero priority + alpha = 0.6 # [0~1] convert the importance of TD error to priority + beta = 0.4 # importance-sampling, from initial value increasing to 1 + beta_increment_per_sampling = 1e-4 # annealing the bias + abs_err_upper = 1 # for stability refer to paper + + def __init__(self, capacity): + self.tree = SumTree(capacity) + + def store(self, error, transition): + p = self._get_priority(error) + self.tree.add_new_priority(p, transition) + + def sample(self, n): + batch_idx, batch_memory, ISWeights = [], [], [] + segment = self.tree.root_priority / n + self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1 + + min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority + maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights + for i in range(n): + a = segment * i + b = segment * (i + 1) + lower_bound = np.random.uniform(a, b) + idx, p, data = self.tree.get_leaf(lower_bound) + prob = p / self.tree.root_priority + ISWeights.append(self.tree.capacity * prob) + batch_idx.append(idx) + batch_memory.append(data) + + ISWeights = np.vstack(ISWeights) + ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize + return batch_idx, np.vstack(batch_memory), ISWeights + + def update(self, idx, error): + p = self._get_priority(error) + self.tree.update(idx, p) + + def _get_priority(self, error): + error += self.epsilon # avoid 0 + clipped_error = np.clip(error, 0, self.abs_err_upper) + return np.power(clipped_error, self.alpha) + + +class DuelingDQNPrioritizedReplay: + def __init__( + self, + n_actions, + n_features, + learning_rate=0.005, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=500, + memory_size=10000, + batch_size=32, + e_greedy_increment=None, + hidden=[100, 50], + output_graph=False, + sess=None, + ): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.hidden = hidden + self.epsilon_increment = e_greedy_increment + self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max + + self.learn_step_counter = 0 + self._build_net() + self.memory = Memory(capacity=memory_size) + + if sess is None: + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + else: + self.sess = sess + + if output_graph: + tf.summary.FileWriter("logs/", self.sess.graph) + + self.cost_his = [] + + def _build_net(self): + def build_layers(s, c_names, w_initializer, b_initializer): + for i, h in enumerate(self.hidden): + if i == 0: + in_units, out_units, inputs = self.n_features, self.hidden[i], s + else: + in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l + with tf.variable_scope('l%i' % i): + w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names) + b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names) + l = tf.nn.relu(tf.matmul(inputs, w) + b) + + with tf.variable_scope('Value'): + w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names) + b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names) + self.V = tf.matmul(l, 
+class DuelingDQNPrioritizedReplay:
+    def __init__(
+            self,
+            n_actions,
+            n_features,
+            learning_rate=0.005,
+            reward_decay=0.9,
+            e_greedy=0.9,
+            replace_target_iter=500,
+            memory_size=10000,
+            batch_size=32,
+            e_greedy_increment=None,
+            hidden=[100, 50],
+            output_graph=False,
+            sess=None,
+    ):
+        self.n_actions = n_actions
+        self.n_features = n_features
+        self.lr = learning_rate
+        self.gamma = reward_decay
+        self.epsilon_max = e_greedy
+        self.replace_target_iter = replace_target_iter
+        self.memory_size = memory_size
+        self.batch_size = batch_size
+        self.hidden = hidden
+        self.epsilon_increment = e_greedy_increment
+        self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max
+
+        self.learn_step_counter = 0
+        self._build_net()
+        self.memory = Memory(capacity=memory_size)
+
+        if sess is None:
+            self.sess = tf.Session()
+            self.sess.run(tf.global_variables_initializer())
+        else:
+            self.sess = sess
+
+        if output_graph:
+            tf.summary.FileWriter("logs/", self.sess.graph)
+
+        self.cost_his = []
+
+    def _build_net(self):
+        def build_layers(s, c_names, w_initializer, b_initializer):
+            for i, h in enumerate(self.hidden):
+                if i == 0:
+                    in_units, out_units, inputs = self.n_features, self.hidden[i], s
+                else:
+                    in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l
+                with tf.variable_scope('l%i' % i):
+                    w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names)
+                    b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names)
+                    l = tf.nn.relu(tf.matmul(inputs, w) + b)
+
+            with tf.variable_scope('Value'):
+                w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names)
+                b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names)
+                self.V = tf.matmul(l, w) + b
+
+            with tf.variable_scope('Advantage'):
+                w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
+                b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+                self.A = tf.matmul(l, w) + b
+
+            with tf.variable_scope('Q'):
+                out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))  # Q = V(s) + (A(s,a) - mean_a A(s,a))
+
+            # single-stream (non-dueling) head, kept for reference:
+            # with tf.variable_scope('out'):
+            #     w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
+            #     b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+            #     out = tf.matmul(l, w) + b
+            return out
+
+        # ------------------ build evaluate_net ------------------
+        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
+        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
+        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
+        with tf.variable_scope('eval_net'):
+            c_names, w_initializer, b_initializer = \
+                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], \
+                tf.random_normal_initializer(0., 0.01), tf.constant_initializer(0.01)  # config of layers
+
+            self.q_eval = build_layers(self.s, c_names, w_initializer, b_initializer)
+
+        with tf.variable_scope('loss'):
+            self.abs_errors = tf.abs(tf.reduce_sum(self.q_target - self.q_eval, axis=1))  # per-sample TD error, used to update the SumTree priorities
+            self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
+
+        with tf.variable_scope('train'):
+            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
+
+        # ------------------ build target_net ------------------
+        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
+        with tf.variable_scope('target_net'):
+            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+            self.q_next = build_layers(self.s_, c_names, w_initializer, b_initializer)
+
+    def store_transition(self, s, a, r, s_):
+        transition = np.hstack((s, [a, r], s_))
+        max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])  # give new transitions the current max priority
+        self.memory.store(max_p, transition)
+
+    def choose_action(self, observation):
+        observation = observation[np.newaxis, :]
+        if np.random.uniform() < self.epsilon:
+            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+            action = np.argmax(actions_value)
+        else:
+            action = np.random.randint(0, self.n_actions)
+        return action
+
+    def _replace_target_params(self):
+        t_params = tf.get_collection('target_net_params')
+        e_params = tf.get_collection('eval_net_params')
+        self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+    def learn(self):
+        if self.learn_step_counter % self.replace_target_iter == 0:
+            self._replace_target_params()
+
+        tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
+
+        # double DQN
+        q_next, q_eval4next = self.sess.run(
+            [self.q_next, self.q_eval],
+            feed_dict={self.s_: batch_memory[:, -self.n_features:],   # target net on next observation
+                       self.s: batch_memory[:, -self.n_features:]})   # eval net on next observation (selects Double DQN actions)
+        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
+
+        q_target = q_eval.copy()
+
+        batch_index = np.arange(self.batch_size, dtype=np.int32)
+        eval_act_index = batch_memory[:, self.n_features].astype(int)
+        reward = batch_memory[:, self.n_features + 1]
+        max_act4next = np.argmax(q_eval4next, axis=1)  # greedy actions w.r.t. the eval (online) net
+        selected_q_next = q_next[batch_index, max_act4next]  # Double DQN: target net evaluates the eval net's greedy actions
+
+        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
+
+        # natural DQN target (kept for reference):
+        # q_next, q_eval = self.sess.run(
+        #     [self.q_next, self.q_eval],
+        #     feed_dict={self.s_: batch_memory[:, -self.n_features:],
+        #                self.s: batch_memory[:, :self.n_features]})
+        #
+        # q_target = q_eval.copy()
+        # batch_index = np.arange(self.batch_size, dtype=np.int32)
+        # eval_act_index = batch_memory[:, self.n_features].astype(int)
+        # reward = batch_memory[:, self.n_features + 1]
+        #
+        # q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
+        _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
+                                                 feed_dict={self.s: batch_memory[:, :self.n_features],
+                                                            self.q_target: q_target,
+                                                            self.ISWeights: ISWeights})
+        for i in range(len(tree_idx)):  # update priorities with the new absolute TD errors
+            idx = tree_idx[i]
+            self.memory.update(idx, abs_errors[i])
+
+        self.cost_his.append(self.cost)
+
+        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+        self.learn_step_counter += 1
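# ---------------------------------------------------------------------------
# Editor's sketch (not part of either file): the Double DQN target used in
# learn() above, computed on a tiny made-up batch with plain NumPy.  The
# online (eval) net picks the next action, the target net evaluates it, which
# is what decouples action selection from value estimation.
import numpy as np

gamma = 0.99
q_eval_next = np.array([[1.0, 3.0, 2.0],      # online net on s'  -> picks action 1
                        [0.5, 0.2, 0.9]])     #                   -> picks action 2
q_target_next = np.array([[1.5, 2.0, 4.0],    # target net on s'  -> evaluates those picks
                          [0.3, 0.1, 0.7]])
rewards = np.array([1.0, -1.0])
taken_actions = np.array([0, 2])              # actions stored in the replay memory

batch_index = np.arange(2)
a_max = q_eval_next.argmax(axis=1)                        # [1, 2]
selected_q_next = q_target_next[batch_index, a_max]       # [2.0, 0.7]
td_target = rewards + gamma * selected_q_next             # [2.98, -0.307]

# Only the Q-value of the action actually taken is pushed toward td_target;
# the other entries of q_target stay equal to q_eval, so their error is 0.
q_eval = np.array([[0.9, 1.1, 0.2],
                   [0.4, 0.6, 0.8]])
q_target = q_eval.copy()
q_target[batch_index, taken_actions] = td_target
print(q_target.round(3))
# ---------------------------------------------------------------------------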
diff --git a/experiments/Solve_LunarLander/run_LunarLander.py b/experiments/Solve_LunarLander/run_LunarLander.py
new file mode 100644
index 0000000..b286109
--- /dev/null
+++ b/experiments/Solve_LunarLander/run_LunarLander.py
@@ -0,0 +1,68 @@
+"""
+Deep Q network,
+
+LunarLander-v2 example
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
+"""
+
+
+import gym
+from gym import wrappers
+from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay
+
+env = gym.make('LunarLander-v2')
+# env = env.unwrapped
+env.seed(1)
+
+N_A = env.action_space.n
+N_S = env.observation_space.shape[0]
+MEMORY_CAPACITY = 50000
+TARGET_REP_ITER = 2000
+MAX_EPISODES = 900
+E_GREEDY = 0.95
+E_INCREMENT = 0.00001
+GAMMA = 0.99
+LR = 0.0001
+BATCH_SIZE = 32
+HIDDEN = [400, 400]
+RENDER = True
+
+RL = DuelingDQNPrioritizedReplay(
+    n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA,
+    hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER,
+    memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,)
+
+
+total_steps = 0
+running_r = 0
+r_scale = 100
+for i_episode in range(MAX_EPISODES):
+    s = env.reset()  # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
+    ep_r = 0
+    while True:
+        if total_steps > MEMORY_CAPACITY: env.render()
+        a = RL.choose_action(s)
+        s_, r, done, _ = env.step(a)
+        if r == -100: r = -30
+        r /= r_scale
+
+        ep_r += r
+        RL.store_transition(s, a, r, s_)
+        if total_steps > MEMORY_CAPACITY:
+            RL.learn()
+        if done:
+            land = '| Landed' if r == 100/r_scale else '| ------'
+            running_r = 0.99 * running_r + 0.01 * ep_r
+            print('Epi: ', i_episode,
+                  land,
+                  '| Epi_R: ', round(ep_r, 2),
+                  '| Running_R: ', round(running_r, 2),
+                  '| Epsilon: ', round(RL.epsilon, 3))
+            break
+
+        s = s_
+        total_steps += 1
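# ---------------------------------------------------------------------------
# Editor's sketch (not part of run_LunarLander.py): the same driver pattern on
# CartPole-v0 with a random policy standing in for the agent, to isolate the
# loop structure -- fill the replay memory first, only then start learning and
# rendering, and keep an exponential running reward for logging.  The
# environment name, WARMUP size and the random "agent" are illustrative
# choices only, not part of the original experiment.
import gym
import numpy as np

env = gym.make('CartPole-v0')
WARMUP = 500                                  # stands in for MEMORY_CAPACITY above
total_steps, running_r = 0, 0.0

for i_episode in range(20):
    s = env.reset()
    ep_r = 0.0
    while True:
        a = env.action_space.sample()         # RL.choose_action(s) in the real script
        s_, r, done, _ = env.step(a)
        ep_r += r
        # RL.store_transition(s, a, r, s_) would go here
        if total_steps > WARMUP:
            pass                              # RL.learn() would go here
        if done:
            running_r = 0.99 * running_r + 0.01 * ep_r
            print('Epi:', i_episode, '| Epi_R:', round(ep_r, 2), '| Running_R:', round(running_r, 2))
            break
        s = s_
        total_steps += 1
# ---------------------------------------------------------------------------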