From 71223d590a517f3d9596bfe13a6bfb6cb7d8b9ff Mon Sep 17 00:00:00 2001 From: Kismuz Date: Sat, 8 Jul 2017 16:53:15 +0300 Subject: [PATCH 01/56] DQN copy_model_parameters memory leak fixed, tensorboard summaries updated with cpu/mem usage --- DQN/Deep Q Learning Solution.ipynb | 120 +++++++++++++++++------------ 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/DQN/Deep Q Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index 7dd832212..7cf615137 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -17,6 +17,7 @@ "import os\n", "import random\n", "import sys\n", + "import psutil\n", "import tensorflow as tf\n", "\n", "if \"../\" not in sys.path:\n", @@ -29,9 +30,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -40,9 +39,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -86,9 +83,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -198,9 +193,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -234,30 +227,39 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ - "def copy_model_parameters(sess, estimator1, estimator2):\n", + "class ModelParametersCopier():\n", " \"\"\"\n", - " Copies the model parameters of one estimator to another.\n", - "\n", - " Args:\n", - " sess: Tensorflow session instance\n", - " estimator1: Estimator to copy the paramters from\n", - " estimator2: Estimator to copy the parameters to\n", + " Copy model parameters of one estimator to another.\n", " \"\"\"\n", - " e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]\n", - " e1_params = sorted(e1_params, key=lambda v: v.name)\n", - " e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]\n", - " e2_params = sorted(e2_params, key=lambda v: v.name)\n", - "\n", - " update_ops = []\n", - " for e1_v, e2_v in zip(e1_params, e2_params):\n", - " op = e2_v.assign(e1_v)\n", - " update_ops.append(op)\n", - "\n", - " sess.run(update_ops)" + " \n", + " def __init__(self, estimator1, estimator2):\n", + " \"\"\"\n", + " Defines copy-work operation graph. 
\n", + " Args:\n", + " estimator1: Estimator to copy the paramters from\n", + " estimator2: Estimator to copy the parameters to\n", + " \"\"\"\n", + " e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]\n", + " e1_params = sorted(e1_params, key=lambda v: v.name)\n", + " e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]\n", + " e2_params = sorted(e2_params, key=lambda v: v.name)\n", + "\n", + " self.update_ops = []\n", + " for e1_v, e2_v in zip(e1_params, e2_params):\n", + " op = e2_v.assign(e1_v)\n", + " self.update_ops.append(op)\n", + " \n", + " def make(self, sess):\n", + " \"\"\"\n", + " Makes copy.\n", + " Args:\n", + " sess: Tensorflow session instance\n", + " \"\"\"\n", + " sess.run(self.update_ops)" ] }, { @@ -293,9 +295,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -347,11 +347,17 @@ "\n", " # The replay memory\n", " replay_memory = []\n", + " \n", + " # Make model copier object\n", + " estimator_copy = ModelParametersCopier(q_estimator, target_estimator)\n", "\n", " # Keeps track of useful statistics\n", " stats = plotting.EpisodeStats(\n", " episode_lengths=np.zeros(num_episodes),\n", " episode_rewards=np.zeros(num_episodes))\n", + " \n", + " # For 'system/' summaries, usefull to check if currrent process looks healthy\n", + " current_process = psutil.Process()\n", "\n", " # Create directories for checkpoints and summaries\n", " checkpoint_dir = os.path.join(experiment_dir, \"checkpoints\")\n", @@ -422,14 +428,9 @@ " # Epsilon for this time step\n", " epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]\n", "\n", - " # Add epsilon to Tensorboard\n", - " episode_summary = tf.Summary()\n", - " episode_summary.value.add(simple_value=epsilon, tag=\"epsilon\")\n", - " q_estimator.summary_writer.add_summary(episode_summary, total_t)\n", - "\n", " # Maybe update the target estimator\n", " if total_t % update_target_estimator_every == 0:\n", - " copy_model_parameters(sess, q_estimator, target_estimator)\n", + " estimator_copy.make(sess)\n", " print(\"\\nCopied model parameters to target network.\")\n", "\n", " # Print out which step we're on, useful for debugging.\n", @@ -475,11 +476,14 @@ "\n", " # Add summaries to tensorboard\n", " episode_summary = tf.Summary()\n", - " episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name=\"episode_reward\", tag=\"episode_reward\")\n", - " episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name=\"episode_length\", tag=\"episode_length\")\n", - " q_estimator.summary_writer.add_summary(episode_summary, total_t)\n", + " episode_summary.value.add(simple_value=epsilon, tag=\"episode/epsilon\")\n", + " episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag=\"episode/reward\")\n", + " episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag=\"episode/length\")\n", + " episode_summary.value.add(simple_value=current_process.cpu_percent(), tag=\"system/cpu_usage_percent\")\n", + " episode_summary.value.add(simple_value=current_process.memory_percent(memtype=\"vms\"), tag=\"system/v_memeory_usage_percent\")\n", + " q_estimator.summary_writer.add_summary(episode_summary, i_episode)\n", " q_estimator.summary_writer.flush()\n", - "\n", + " \n", " yield total_t, plotting.EpisodeStats(\n", " episode_lengths=stats.episode_lengths[:i_episode+1],\n", " 
episode_rewards=stats.episode_rewards[:i_episode+1])\n", @@ -490,9 +494,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -504,7 +506,7 @@ "global_step = tf.Variable(0, name='global_step', trainable=False)\n", " \n", "# Create estimators\n", - "q_estimator = Estimator(scope=\"q\", summaries_dir=experiment_dir)\n", + "q_estimator = Estimator(scope=\"q_estimator\", summaries_dir=experiment_dir)\n", "target_estimator = Estimator(scope=\"target_q\")\n", "\n", "# State processor\n", @@ -531,6 +533,24 @@ "\n", " print(\"\\nEpisode Reward: {}\".format(stats.episode_rewards[-1]))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { @@ -549,9 +569,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 2b576bd992014977238bb344a3a07a1145eda31f Mon Sep 17 00:00:00 2001 From: sstarzycki Date: Fri, 21 Jul 2017 07:59:04 +0200 Subject: [PATCH 02/56] Update description of env.P[s][a] env.P[s][a] is not a tuple but rather a list of tuples (as probability in that tuple wouldn't make sense otherwise). --- DP/Policy Evaluation.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index e401c7759..9c3e0cdd3 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -41,7 +41,7 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", " discount_factor: gamma discount factor.\n", " \n", From 1f04c1d7606c8e607c7d9213745f515a9fc2baf4 Mon Sep 17 00:00:00 2001 From: himanshusahni Date: Wed, 4 Oct 2017 18:58:13 -0400 Subject: [PATCH 03/56] bind worker within lambda to avoid running worker twice --- PolicyGradient/a3c/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/a3c/train.py b/PolicyGradient/a3c/train.py index 28064e506..286ca7a9d 100755 --- a/PolicyGradient/a3c/train.py +++ b/PolicyGradient/a3c/train.py @@ -125,7 +125,7 @@ def make_env(wrap=True): # Start worker threads worker_threads = [] for worker in workers: - worker_fn = lambda: worker.run(sess, coord, FLAGS.t_max) + worker_fn = lambda worker=worker: worker.run(sess, coord, FLAGS.t_max) t = threading.Thread(target=worker_fn) t.start() worker_threads.append(t) From bc7ee056e33518aea8d685ea43c12b8426993a96 Mon Sep 17 00:00:00 2001 From: himanshusahni Date: Tue, 10 Oct 2017 01:39:40 -0400 Subject: [PATCH 04/56] worker name scope should have trailing backslash otherwise any worker 10-19 will clash in scope with worker 1, and so on. 
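
A minimal illustration of the clash described above — not part of the patch, and the
variable names are hypothetical. The scope filtering used here behaves like prefix
matching on variable names, which is why the trailing '/' added in the diff below is
needed:

    # Hypothetical variable names; without the trailing '/' the prefix "worker_1"
    # also picks up variables belonging to worker_10..worker_19.
    names = ["worker_1/W:0", "worker_10/W:0", "worker_19/W:0"]
    print([n for n in names if n.startswith("worker_1")])    # all three names match
    print([n for n in names if n.startswith("worker_1/")])   # only worker_1's variable
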
--- PolicyGradient/a3c/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/a3c/worker.py b/PolicyGradient/a3c/worker.py index 5f310ac3c..6371558f2 100644 --- a/PolicyGradient/a3c/worker.py +++ b/PolicyGradient/a3c/worker.py @@ -85,7 +85,7 @@ def __init__(self, name, env, policy_net, value_net, global_counter, discount_fa # Op to copy params from global policy/valuenets self.copy_params_op = make_copy_params_op( tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES), - tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES)) + tf.contrib.slim.get_variables(scope=self.name+'/', collection=tf.GraphKeys.TRAINABLE_VARIABLES)) self.vnet_train_op = make_train_op(self.value_net, self.global_value_net) self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net) From 3611ec96b7429058a49c2478082506e90729ad3e Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Tue, 31 Oct 2017 23:53:06 -0400 Subject: [PATCH 05/56] Fixed some of the issues with the DQN script as pointed out in #117 --- DQN/dqn.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/DQN/dqn.py b/DQN/dqn.py index 7b459240d..81d4aa58d 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -1,4 +1,5 @@ import gym +form gym.wrappers import Monitor import itertools import numpy as np import os @@ -28,7 +29,7 @@ def __init__(self): self.output = tf.image.rgb_to_grayscale(self.input_state) self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) self.output = tf.image.resize_images( - self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) self.output = tf.squeeze(self.output) def process(self, sess, state): @@ -292,9 +293,11 @@ def deep_q_learning(sess, state = next_state # Record videos - env.monitor.start(monitor_path, - resume=True, - video_callable=lambda count: count % record_video_every == 0) + # Use the gym env Monitor wrapper + env = Monitor(env, + directory=monitor_path, + resume=True, + video_callable=lambda count: count % record_video_every ==0) for i_episode in range(num_episodes): From e9068bfe7a5d7b1f32e710a95a99df5c441b039d Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Tue, 31 Oct 2017 23:59:14 -0400 Subject: [PATCH 06/56] Updated to support recent versions of TF. Removed deprecated functions. 
Pointed out in #117 --- DQN/dqn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/DQN/dqn.py b/DQN/dqn.py index 81d4aa58d..80466556c 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -60,7 +60,7 @@ def __init__(self, scope="estimator", summaries_dir=None): summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope)) if not os.path.exists(summary_dir): os.makedirs(summary_dir) - self.summary_writer = tf.train.SummaryWriter(summary_dir) + self.summary_writer = tf.summary.FileWriter(summary_dir) def _build_model(self): """ @@ -104,11 +104,11 @@ def _build_model(self): self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) # Summaries for Tensorboard - self.summaries = tf.merge_summary([ - tf.scalar_summary("loss", self.loss), - tf.histogram_summary("loss_hist", self.losses), - tf.histogram_summary("q_values_hist", self.predictions), - tf.scalar_summary("max_q_value", tf.reduce_max(self.predictions)) + self.summaries = tf.summary.merge([ + tf.summary.scalar("loss", self.loss), + tf.summary.histogram("loss_hist", self.losses), + tf.summary.histogram("q_values_hist", self.predictions), + tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions)) ]) @@ -401,7 +401,7 @@ def deep_q_learning(sess, state_processor = StateProcessor() with tf.Session() as sess: - sess.run(tf.initialize_all_variables()) + sess.run(tf.global_variables_initializer()) for t, stats in deep_q_learning(sess, env, q_estimator=q_estimator, From 0b2ae4144b817c871153283188b270484aeeb7c2 Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Wed, 1 Nov 2017 00:13:25 -0400 Subject: [PATCH 07/56] Fixed issues with the DQN in the exercise notebook --- DQN/Deep Q Learning.ipynb | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index d2a295cf1..c3210d2ad 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -11,6 +11,7 @@ "%matplotlib inline\n", "\n", "import gym\n", + "from gym.wrappers import Monitor\n", "import itertools\n", "import numpy as np\n", "import os\n", @@ -67,7 +68,7 @@ " self.output = tf.image.rgb_to_grayscale(self.input_state)\n", " self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)\n", " self.output = tf.image.resize_images(\n", - " self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", + " self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", " self.output = tf.squeeze(self.output)\n", "\n", " def process(self, sess, state):\n", @@ -107,7 +108,7 @@ " summary_dir = os.path.join(summaries_dir, \"summaries_{}\".format(scope))\n", " if not os.path.exists(summary_dir):\n", " os.makedirs(summary_dir)\n", - " self.summary_writer = tf.train.SummaryWriter(summary_dir)\n", + " self.summary_writer = tf.summary.FileWriter(summary_dir)\n", "\n", " def _build_model(self):\n", " \"\"\"\n", @@ -151,11 +152,11 @@ " self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())\n", "\n", " # Summaries for Tensorboard\n", - " self.summaries = tf.merge_summary([\n", - " tf.scalar_summary(\"loss\", self.loss),\n", - " tf.histogram_summary(\"loss_hist\", self.losses),\n", - " tf.histogram_summary(\"q_values_hist\", self.predictions),\n", - " tf.scalar_summary(\"max_q_value\", tf.reduce_max(self.predictions))\n", + " self.summaries = tf.summary.merge([\n", + " tf.summary.scalar(\"loss\", self.loss),\n", + " 
tf.summary.histogram(\"loss_hist\", self.losses),\n", + " tf.summary.histogram(\"q_values_hist\", self.predictions),\n", + " tf.summary.scalar(\"max_q_value\", tf.reduce_max(self.predictions))\n", " ])\n", "\n", "\n", @@ -212,7 +213,7 @@ "sp = StateProcessor()\n", "\n", "with tf.Session() as sess:\n", - " sess.run(tf.initialize_all_variables())\n", + " sess.run(tf.global_variables_initializer())\n", " \n", " # Example observation batch\n", " observation = env.reset()\n", @@ -391,9 +392,10 @@ " pass\n", "\n", " # Record videos\n", - " env.monitor.start(monitor_path,\n", - " resume=True,\n", - " video_callable=lambda count: count % record_video_every == 0)\n", + " env= Monitor(env,\n", + " directory=monitor_path,\n", + " resume=True,\n", + " video_callable=lambda count: count % record_video_every == 0)\n", "\n", " for i_episode in range(num_episodes):\n", "\n", @@ -526,7 +528,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, From 10ce5dc7eb242ab4953b6a07d342f8aae9267214 Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Wed, 1 Nov 2017 00:13:48 -0400 Subject: [PATCH 08/56] Fixed typo --- DQN/dqn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/dqn.py b/DQN/dqn.py index 80466556c..d54d4d1bf 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -1,5 +1,5 @@ import gym -form gym.wrappers import Monitor +from gym.wrappers import Monitor import itertools import numpy as np import os From 60013e507080222c8fa473d77e7808795ea3f4eb Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 16 Nov 2017 13:54:30 +0900 Subject: [PATCH 09/56] Sync function descriptions. Lambda -> gamma (discount factor). Added description of env.nS and env.nA --- DP/Policy Evaluation Solution.ipynb | 34 ++++++++++------------- DP/Policy Evaluation.ipynb | 30 +++++++++------------ DP/Policy Iteration Solution.ipynb | 34 +++++++++++------------ DP/Policy Iteration.ipynb | 38 +++++++++++--------------- DP/Value Iteration Solution.ipynb | 38 +++++++++++--------------- DP/Value Iteration.ipynb | 42 ++++++++++++----------------- 6 files changed, 90 insertions(+), 126 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index a8b949367..9b9b11b49 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -43,9 +41,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. 
\n", + " env.nA is a number of available actions.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -75,9 +75,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", @@ -87,9 +85,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -122,9 +118,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Test: Make sure the evaluated policy is what we expected\n", @@ -144,23 +138,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index 9c3e0cdd3..160ac6dd9 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -29,9 +27,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", @@ -42,6 +38,8 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. 
\n", + " env.nA is a number of available actions.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", " discount_factor: gamma discount factor.\n", " \n", @@ -59,9 +57,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", @@ -71,9 +67,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -107,23 +101,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index cfb68a2e5..8cf4faf78 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -45,9 +43,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", - " theta: We stop evaluation one our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. \n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -91,7 +91,7 @@ " env: The OpenAI envrionment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V). 
\n", @@ -136,9 +136,7 @@ { "cell_type": "code", "execution_count": 64, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -203,9 +201,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Test the value function\n", @@ -225,23 +221,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Iteration.ipynb b/DP/Policy Iteration.ipynb index d67b22505..afd417593 100644 --- a/DP/Policy Iteration.ipynb +++ b/DP/Policy Iteration.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -45,9 +43,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", - " theta: We stop evaluation one our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. \n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -77,9 +77,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", @@ -91,7 +89,7 @@ " env: The OpenAI envrionment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V). 
\n", @@ -113,9 +111,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -180,9 +176,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -216,23 +210,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index 7bc985d15..fb98665f8 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -19,9 +17,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -31,9 +27,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -41,10 +35,12 @@ " Value Iteration Algorithm.\n", " \n", " Args:\n", - " env: OpenAI environment. env.P represents the transition probabilities of the environment.\n", - " theta: Stopping threshold. If the value of all states changes less than theta\n", - " in one iteration we are done.\n", - " discount_factor: lambda time discount factor.\n", + " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. 
\n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function.\n", @@ -99,9 +95,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -176,23 +170,23 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [Root]", + "display_name": "Python 2", "language": "python", - "name": "Python [Root]" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Value Iteration.ipynb b/DP/Value Iteration.ipynb index 6329d12f7..66c902113 100644 --- a/DP/Value Iteration.ipynb +++ b/DP/Value Iteration.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -19,9 +17,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -31,9 +27,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -41,10 +35,12 @@ " Value Iteration Algorithm.\n", " \n", " Args:\n", - " env: OpenAI environment. env.P represents the transition probabilities of the environment.\n", - " theta: Stopping threshold. If the value of all states changes less than theta\n", - " in one iteration we are done.\n", - " discount_factor: lambda time discount factor.\n", + " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. \n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function. 
\n", @@ -61,9 +57,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -128,9 +122,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -155,23 +147,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 430766766f34681460986ae044ecc7d408ddb691 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 21 Nov 2017 15:15:28 +0900 Subject: [PATCH 10/56] Updates function description in DP. Fixed typos in MC. Changed Lambda to Gamma as in the book. --- DP/Policy Evaluation Solution.ipynb | 18 ++++++---- DP/Policy Evaluation.ipynb | 18 ++++++---- DP/Policy Iteration Solution.ipynb | 14 +++++--- DP/Policy Iteration.ipynb | 14 +++++--- DP/Value Iteration Solution.ipynb | 18 ++++++---- DP/Value Iteration.ipynb | 18 ++++++---- ...ith Epsilon-Greedy Policies Solution.ipynb | 30 +++++++--------- ...Control with Epsilon-Greedy Policies.ipynb | 26 +++++--------- MC/MC Prediction Solution.ipynb | 33 +++++++---------- MC/MC Prediction.ipynb | 25 +++++-------- ...eighted Importance Sampling Solution.ipynb | 36 +++++++------------ ...ol with Weighted Importance Sampling.ipynb | 36 +++++++------------ 12 files changed, 133 insertions(+), 153 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index 9b9b11b49..8db76d578 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 53, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -42,10 +44,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -75,7 +77,9 @@ { "cell_type": "code", "execution_count": 56, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", @@ -118,7 +122,9 @@ { "cell_type": "code", "execution_count": 51, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Test: Make sure the evaluated policy is what we expected\n", diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index 160ac6dd9..e4f5f3673 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -27,7 +29,9 @@ { "cell_type": "code", "execution_count": 25, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", @@ -38,10 +42,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -57,7 +61,9 @@ { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index 8cf4faf78..bf6fa631a 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -44,10 +46,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -201,7 +203,9 @@ { "cell_type": "code", "execution_count": 59, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Test the value function\n", diff --git a/DP/Policy Iteration.ipynb b/DP/Policy Iteration.ipynb index afd417593..bdff9deea 100644 --- a/DP/Policy Iteration.ipynb +++ b/DP/Policy Iteration.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -44,10 +46,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -77,7 +79,9 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index fb98665f8..ebd1b5d49 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -17,7 +19,9 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -27,7 +31,9 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -37,10 +43,10 @@ " Args:\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function.\n", diff --git a/DP/Value Iteration.ipynb b/DP/Value Iteration.ipynb index 66c902113..f947fd761 100644 --- a/DP/Value Iteration.ipynb +++ b/DP/Value Iteration.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -17,7 +19,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -27,7 +31,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -37,10 +43,10 @@ " Args:\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function. \n", diff --git a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb index 4484eb2f6..c41cf3b1a 100644 --- a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb @@ -27,9 +27,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -81,14 +79,14 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", " Q is a dictionary mapping state -> action values.\n", - " policy is a function taht takes an observation as an argument and returns\n", + " policy is a function that takes an observation as an argument and returns\n", " action probabilities\n", " \"\"\"\n", " \n", @@ -147,9 +145,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +162,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -213,23 +207,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Control with Epsilon-Greedy Policies.ipynb b/MC/MC Control with Epsilon-Greedy Policies.ipynb index dab7af2ac..7963c8d18 100644 --- a/MC/MC Control with Epsilon-Greedy Policies.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies.ipynb @@ -27,9 +27,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -38,9 +36,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", @@ -79,14 +75,14 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", " Q is a dictionary mapping state -> action values.\n", - " policy is a function taht takes an observation as an argument and returns\n", + " policy is a function that takes an observation as an argument and returns\n", " action probabilities\n", " \"\"\"\n", " \n", @@ -111,9 +107,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)" @@ -122,9 +116,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -162,9 +154,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Prediction Solution.ipynb b/MC/MC Prediction Solution.ipynb index b6cc24f95..7459f6048 100644 --- a/MC/MC Prediction Solution.ipynb +++ b/MC/MC Prediction Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -28,9 +26,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -39,9 +35,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -52,8 +46,8 @@ " Args:\n", " policy: A function that maps an observation to action probabilities.\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A dictionary that maps from state -> value.\n", @@ -106,9 +100,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sample_policy(observation):\n", @@ -123,7 +115,6 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [ @@ -202,23 +193,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Prediction.ipynb b/MC/MC Prediction.ipynb index 17c8cf64f..aff53e747 100644 --- a/MC/MC Prediction.ipynb +++ b/MC/MC Prediction.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -28,9 +26,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" 
@@ -39,9 +35,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -52,8 +46,8 @@ " Args:\n", " policy: A function that maps an observation to action probabilities.\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A dictionary that maps from state -> value.\n", @@ -77,9 +71,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sample_policy(observation):\n", @@ -94,7 +86,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -132,9 +123,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb index 2baf04377..32c7cdaef 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -27,9 +25,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -91,9 +87,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -103,10 +97,10 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", + " num_episodes: Number of episodes to sample.\n", " behavior_policy: The behavior to follow while generating episodes.\n", " A function that given an observation returns a vector of probabilities for each action.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", @@ -171,9 +165,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -191,9 +183,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -238,23 +228,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.0" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb 
b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb index 5cd2b408c..ff3d43a86 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb @@ -27,9 +27,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -63,9 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def create_greedy_policy(Q):\n", @@ -89,9 +85,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -101,10 +95,10 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", + " num_episodes: Number of episodes to sample.\n", " behavior_policy: The behavior to follow while generating episodes.\n", " A function that given an observation returns a vector of probabilities for each action.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", @@ -128,9 +122,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "random_policy = create_random_policy(env.action_space.n)\n", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -166,23 +156,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.0" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 7017f9edc2eda903e75ed7387ffec9f4c20fed30 Mon Sep 17 00:00:00 2001 From: jonahweissman Date: Wed, 22 Nov 2017 11:50:30 -0500 Subject: [PATCH 11/56] Fix links in all the `README.md`s Markdown doesn't allow spaces in links to files, so I replaced the spaces with "%20". Now the links correctly display on GitHub. 
--- DP/README.md | 12 ++++++------ DQN/README.md | 8 ++++---- FA/README.md | 4 ++-- MC/README.md | 14 +++++++------- PolicyGradient/README.md | 6 +++--- README.md | 28 ++++++++++++++-------------- TD/README.md | 8 ++++---- 7 files changed, 40 insertions(+), 40 deletions(-) diff --git a/DP/README.md b/DP/README.md index bdb6fd086..7a7d9389a 100644 --- a/DP/README.md +++ b/DP/README.md @@ -34,13 +34,13 @@ ### Exercises - Implement Policy Evaluation in Python (Gridworld) - - [Exercise](Policy Evaluation.ipynb) - - [Solution](Policy Evaluation Solution.ipynb) + - [Exercise](Policy%20Evaluation.ipynb) + - [Solution](Policy%20Evaluation%20Solution.ipynb) - Implement Policy Iteration in Python (Gridworld) - - [Exercise](Policy Iteration.ipynb) - - [Solution](Policy Iteration Solution.ipynb) + - [Exercise](Policy%20Iteration.ipynb) + - [Solution](Policy%20Iteration%20Solution.ipynb) - Implement Value Iteration in Python (Gridworld) - - [Exercise](Value Iteration.ipynb) - - [Solution](Value Iteration Solution.ipynb) + - [Exercise](Value%20Iteration.ipynb) + - [Solution](Value%20Iteration%20Solution.ipynb) diff --git a/DQN/README.md b/DQN/README.md index 1528b3d0a..eedbbd894 100644 --- a/DQN/README.md +++ b/DQN/README.md @@ -39,11 +39,11 @@ ### Exercises -- [OpenAI Gym Atari Environment Playground](Breakout Playground.ipynb) +- [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) - Deep-Q Learning for Atari Games - - [Exercise](Deep Q Learning.ipynb) - - [Solution](Deep Q Learning Solution.ipynb) + - [Exercise](Deep%20Q%20Learning.ipynb) + - [Solution](Deep%20Q%20Learning%20Solution.ipynb) - Double-Q Learning - This is a minimal change to Q-Learning so use the same exercise as above - - [Solution](Double DQN Solution.ipynb) + - [Solution](Double%20DQN%20Solution.ipynb) - Prioritized Experience Replay (WIP) diff --git a/FA/README.md b/FA/README.md index 9eb97101f..fb6dd111a 100644 --- a/FA/README.md +++ b/FA/README.md @@ -36,5 +36,5 @@ ### Exercises - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation - - [Exercise](Q-Learning with Value Function Approximation.ipynb) - - [Solution](Q-Learning with Value Function Approximation Solution.ipynb) + - [Exercise](Q-Learning%20with%20Value%20Function%20Approximation.ipynb) + - [Solution](Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) diff --git a/MC/README.md b/MC/README.md index 5ed660915..835789227 100644 --- a/MC/README.md +++ b/MC/README.md @@ -37,13 +37,13 @@ ### Exercises -- [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack Playground.ipynb) +- [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) - Implement the Monte Carlo Prediction to estimate state-action values - - [Exercise](MC Prediction.ipynb) - - [Solution](MC Prediction Solution.ipynb) + - [Exercise](MC%20Prediction.ipynb) + - [Solution](MC%20Prediction%20Solution.ipynb) - Implement the on-policy first-visit Monte Carlo Control algorithm - - [Exercise](MC Control with Epsilon-Greedy Policies.ipynb) - - [Solution](MC Control with Epsilon-Greedy Policies Solution.ipynb) + - [Exercise](MC%20Control%20with%20Epsilon-Greedy%20Policies.ipynb) + - [Solution](MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) - Implement the off-policy every-visit Monte Carlo Control using Weighted Important Sampling algorithm - - [Exercise](Off-Policy MC Control with Weighted Importance Sampling.ipynb) - - [Solution](Off-Policy MC Control with Weighted Importance Sampling 
Solution.ipynb) \ No newline at end of file + - [Exercise](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling.ipynb) + - [Solution](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 4921e0cd6..3094fb332 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -50,13 +50,13 @@ - REINFORCE with Baseline - Exercise - - [Solution](CliffWalk REINFORCE with Baseline Solution.ipynb) + - [Solution](CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) - Actor-Critic with Baseline - Exercise - - [Solution](CliffWalk Actor-Critic Solution.ipynb) + - [Solution](CliffWalk%20Actor-Critic%20Solution.ipynb) - Actor-Critic with Baseline for Continuous Action Spaces - Exercise - - [Solution](Continuous MountainCar Actor-Critic Solution.ipynb) + - [Solution](Continuous%20MountainCar%20Actor-Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (WIP) - Asynchronous Advantage Actor-Critic (A3C) diff --git a/README.md b/README.md index fd2e42323..ad2abe1d3 100644 --- a/README.md +++ b/README.md @@ -26,21 +26,21 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https ### List of Implemented Algorithms -- [Dynamic Programming Policy Evaluation](DP/Policy Evaluation Solution.ipynb) -- [Dynamic Programming Policy Iteration](DP/Policy Iteration Solution.ipynb) -- [Dynamic Programming Value Iteration](DP/Value Iteration Solution.ipynb) -- [Monte Carlo Prediction](MC/MC Prediction Solution.ipynb) -- [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC Control with Epsilon-Greedy Policies Solution.ipynb) -- [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb) -- [SARSA (On Policy TD Learning)](TD/SARSA Solution.ipynb) -- [Q-Learning (Off Policy TD Learning)](TD/Q-Learning Solution.ipynb) -- [Q-Learning with Linear Function Approximation](FA/Q-Learning with Value Function Approximation Solution.ipynb) -- [Deep Q-Learning for Atari Games](DQN/Deep Q Learning Solution.ipynb) -- [Double Deep-Q Learning for Atari Games](DQN/Double DQN Solution.ipynb) +- [Dynamic Programming Policy Evaluation](DP/Policy%20Evaluation%20Solution.ipynb) +- [Dynamic Programming Policy Iteration](DP/Policy%20Iteration%20Solution.ipynb) +- [Dynamic Programming Value Iteration](DP/Value%20Iteration%20Solution.ipynb) +- [Monte Carlo Prediction](MC/MC%20Prediction%20Solution.ipynb) +- [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) +- [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) +- [SARSA (On Policy TD Learning)](TD/SARSA%20Solution.ipynb) +- [Q-Learning (Off Policy TD Learning)](TD/Q-Learning%20Solution.ipynb) +- [Q-Learning with Linear Function Approximation](FA/Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) +- [Deep Q-Learning for Atari Games](DQN/Deep%20Q%20Learning%20Solution.ipynb) +- [Double Deep-Q Learning for Atari Games](DQN/Double%20DQN%20Solution.ipynb) - Deep Q-Learning with Prioritized Experience Replay (WIP) -- [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb) -- [Policy Gradient: Actor Critic with Baseline](PolicyGradient/CliffWalk Actor Critic 
Solution.ipynb) -- [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb) +- [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) +- [Policy Gradient: Actor Critic with Baseline](PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb) +- [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (DDPG) (WIP) - [Asynchronous Advantage Actor Critic (A3C)](PolicyGradient/a3c) diff --git a/TD/README.md b/TD/README.md index 9044704d1..f0b26aa50 100644 --- a/TD/README.md +++ b/TD/README.md @@ -40,11 +40,11 @@ ### Exercises -- [Windy Gridworld Playground](Windy Gridworld Playground.ipynb) +- [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) - Implement SARSA - [Exercise](SARSA.ipynb) - - [Solution](SARSA Solution.ipynb) -- [Cliff Environment Playground](Cliff Environment Playground.ipynb) + - [Solution](SARSA%20Solution.ipynb) +- [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) - Implement Q-Learning in Python - [Exercise](Q-Learning.ipynb) - - [Solution](Q-Learning Solution.ipynb) \ No newline at end of file + - [Solution](Q-Learning%20Solution.ipynb) From da612e5eddc00468bb1894d119d22a8c37566241 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 23 Nov 2017 09:23:51 +0900 Subject: [PATCH 12/56] Change kernel to python3 --- DP/Policy Evaluation Solution.ipynb | 10 +++---- DP/Policy Evaluation.ipynb | 10 +++---- DP/Policy Iteration Solution.ipynb | 10 +++---- DP/Policy Iteration.ipynb | 10 +++---- DP/Value Iteration Solution.ipynb | 10 +++---- DP/Value Iteration.ipynb | 10 +++---- MC/Blackjack Playground.ipynb | 12 +++----- ...ith Epsilon-Greedy Policies Solution.ipynb | 14 +++++---- ...Control with Epsilon-Greedy Policies.ipynb | 26 ++++++++++------ MC/MC Prediction Solution.ipynb | 26 ++++++++++------ MC/MC Prediction.ipynb | 27 +++++++++++------ ...eighted Importance Sampling Solution.ipynb | 22 +++++++++----- ...ol with Weighted Importance Sampling.ipynb | 30 ++++++++++++------- 13 files changed, 128 insertions(+), 89 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index 8db76d578..703e020fb 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -144,21 +144,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index e4f5f3673..381a58260 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -107,21 +107,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", 
"name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index bf6fa631a..be7d3710e 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -225,21 +225,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Policy Iteration.ipynb b/DP/Policy Iteration.ipynb index bdff9deea..fc87f291b 100644 --- a/DP/Policy Iteration.ipynb +++ b/DP/Policy Iteration.ipynb @@ -214,21 +214,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index ebd1b5d49..cd0da629f 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -176,21 +176,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Value Iteration.ipynb b/DP/Value Iteration.ipynb index f947fd761..ff4bf15dd 100644 --- a/DP/Value Iteration.ipynb +++ b/DP/Value Iteration.ipynb @@ -153,21 +153,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/Blackjack Playground.ipynb b/MC/Blackjack Playground.ipynb index cbb6c40c8..28dfc1867 100644 --- a/MC/Blackjack Playground.ipynb +++ b/MC/Blackjack Playground.ipynb @@ -18,9 +18,7 @@ { "cell_type": "code", "execution_count": 420, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -29,9 +27,7 @@ { "cell_type": "code", "execution_count": 422, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -219,9 +215,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb index c41cf3b1a..0f10d783e 100644 --- a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb @@ -27,7 +27,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -207,21 +209,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/MC Control with Epsilon-Greedy Policies.ipynb b/MC/MC Control with Epsilon-Greedy Policies.ipynb index 7963c8d18..257a84b44 100644 --- a/MC/MC Control with Epsilon-Greedy Policies.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies.ipynb @@ -27,7 +27,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -36,7 +38,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", @@ -107,7 +111,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)" @@ -116,7 +122,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -140,21 +148,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/MC Prediction Solution.ipynb b/MC/MC Prediction Solution.ipynb index 7459f6048..25da5f3ca 100644 --- a/MC/MC Prediction Solution.ipynb +++ b/MC/MC Prediction Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "%matplotlib inline\n", @@ -26,7 +28,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -35,7 +39,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -100,7 +106,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": true + }, 
"outputs": [], "source": [ "def sample_policy(observation):\n", @@ -193,21 +201,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/MC Prediction.ipynb b/MC/MC Prediction.ipynb index aff53e747..472f9ef35 100644 --- a/MC/MC Prediction.ipynb +++ b/MC/MC Prediction.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "%matplotlib inline\n", @@ -26,7 +28,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -35,7 +39,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -71,7 +77,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def sample_policy(observation):\n", @@ -86,6 +94,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": false }, "outputs": [], @@ -109,21 +118,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb index 32c7cdaef..41dad0fe6 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "%matplotlib inline\n", @@ -25,7 +27,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -87,7 +91,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -228,21 +234,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, 
"nbformat": 4, diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb index ff3d43a86..b93408711 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb @@ -27,7 +27,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -61,7 +63,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def create_greedy_policy(Q):\n", @@ -85,7 +89,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -122,7 +128,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "random_policy = create_random_policy(env.action_space.n)\n", @@ -132,7 +140,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -156,21 +166,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, From 79cadc0ddb885787852bc2ade62641f8ae9ee909 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 24 Nov 2017 16:47:26 +0900 Subject: [PATCH 13/56] Lambda to Gamma. Updated Readme. 
--- MC/README.md | 2 +- TD/Q-Learning Solution.ipynb | 26 ++++++++------------------ TD/Q-Learning.ipynb | 26 ++++++++------------------ TD/README.md | 4 ++-- TD/SARSA Solution.ipynb | 30 +++++++++--------------------- TD/SARSA.ipynb | 30 +++++++++--------------------- 6 files changed, 37 insertions(+), 81 deletions(-) diff --git a/MC/README.md b/MC/README.md index 835789227..2c1a512d7 100644 --- a/MC/README.md +++ b/MC/README.md @@ -37,7 +37,7 @@ ### Exercises -- [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) +- Get familiar with the [Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) - Implement the Monte Carlo Prediction to estimate state-action values - [Exercise](MC%20Prediction.ipynb) - [Solution](MC%20Prediction%20Solution.ipynb) diff --git a/TD/Q-Learning Solution.ipynb b/TD/Q-Learning Solution.ipynb index 5794e20de..4c1c5be2c 100644 --- a/TD/Q-Learning Solution.ipynb +++ b/TD/Q-Learning Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -31,9 +29,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -73,9 +69,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -86,7 +80,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", " \n", @@ -147,9 +141,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +158,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -231,9 +221,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/Q-Learning.ipynb b/TD/Q-Learning.ipynb index 724d682ad..4e1396cf6 100644 --- a/TD/Q-Learning.ipynb +++ b/TD/Q-Learning.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -72,9 +68,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -85,7 +79,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", @@ -121,9 +115,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -205,9 +195,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/README.md b/TD/README.md index f0b26aa50..ac2488167 100644 --- a/TD/README.md +++ b/TD/README.md @@ -40,11 +40,11 @@ ### Exercises -- [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) +- Get familiar with the [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) - Implement SARSA - [Exercise](SARSA.ipynb) - [Solution](SARSA%20Solution.ipynb) -- [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) +- Get familiar with the [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) - Implement Q-Learning in Python - [Exercise](Q-Learning.ipynb) - [Solution](Q-Learning%20Solution.ipynb) diff --git a/TD/SARSA Solution.ipynb b/TD/SARSA Solution.ipynb index feab3db02..df647f193 100644 --- a/TD/SARSA Solution.ipynb +++ b/TD/SARSA Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -39,9 +37,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = WindyGridworldEnv()" @@ -81,9 +77,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -93,7 +87,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", @@ -156,9 +150,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -175,9 +167,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -217,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [] } @@ -240,9 +228,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/SARSA.ipynb b/TD/SARSA.ipynb index 799915352..8a0344410 100644 --- a/TD/SARSA.ipynb +++ b/TD/SARSA.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = WindyGridworldEnv()" @@ -72,9 +68,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -84,7 +78,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", " \n", @@ -121,9 +115,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -182,9 +172,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [] } @@ -205,9 +193,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 3fce6b57c93ae505fd7990bad63c57cee4f9a6c1 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 1 Dec 2017 15:38:01 +0900 Subject: [PATCH 14/56] Updated Readme. 
Changed Lambda to Gamma --- ...alue Function Approximation Solution.ipynb | 40 ++++++------------- ...ng with Value Function Approximation.ipynb | 30 +++++--------- FA/README.md | 2 + 3 files changed, 25 insertions(+), 47 deletions(-) diff --git a/FA/Q-Learning with Value Function Approximation Solution.ipynb b/FA/Q-Learning with Value Function Approximation Solution.ipynb index a271d6a63..49c62ca37 100644 --- a/FA/Q-Learning with Value Function Approximation Solution.ipynb +++ b/FA/Q-Learning with Value Function Approximation Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -31,9 +29,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -50,9 +46,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -74,7 +68,7 @@ "scaler = sklearn.preprocessing.StandardScaler()\n", "scaler.fit(observation_examples)\n", "\n", - "# Used to converte a state to a featurizes represenation.\n", + "# Used to convert a state to a featurizes represenation.\n", "# We use RBF kernels with different variances to cover different parts of the space\n", "featurizer = sklearn.pipeline.FeatureUnion([\n", " (\"rbf1\", RBFSampler(gamma=5.0, n_components=100)),\n", @@ -88,9 +82,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -151,9 +143,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def make_epsilon_greedy_policy(estimator, epsilon, nA):\n", @@ -182,9 +172,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):\n", @@ -196,7 +184,7 @@ " env: OpenAI environment.\n", " estimator: Action-Value function estimator\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " epsilon_decay: Each episode, epsilon is decayed by this factor\n", " \n", @@ -283,9 +271,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -305,9 +291,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -384,9 +368,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/FA/Q-Learning with Value Function Approximation.ipynb b/FA/Q-Learning with Value Function Approximation.ipynb index e83b6bbb0..442605562 100644 --- a/FA/Q-Learning with Value Function Approximation.ipynb +++ b/FA/Q-Learning with Value Function Approximation.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -50,9 +48,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -89,7 +85,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -149,7 +145,7 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -180,7 +176,7 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -193,7 +189,7 @@ " env: OpenAI environment.\n", " estimator: Action-Value function estimator\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " epsilon_decay: Each episode, epsilon is decayed by this factor\n", " \n", @@ -237,9 +233,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -259,9 +253,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -326,9 +318,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/FA/README.md b/FA/README.md index fb6dd111a..579498c85 100644 --- a/FA/README.md +++ b/FA/README.md @@ -35,6 +35,8 @@ ### Exercises +- Get familiar with the [Mountain Car Playground](MountainCar%20Playground.ipynb) + - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation - [Exercise](Q-Learning%20with%20Value%20Function%20Approximation.ipynb) - [Solution](Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) From 152dbc414cfd70d67aff46241c3fc69887256c8b Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 6 Dec 2017 17:15:13 +0900 Subject: [PATCH 15/56] Updated link to Sutton's book --- DP/README.md | 2 +- FA/README.md | 4 ++-- Introduction/README.md | 2 +- MC/README.md | 2 +- MDP/README.md | 2 +- PolicyGradient/README.md | 2 +- README.md | 2 +- TD/README.md | 6 +++--- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/DP/README.md b/DP/README.md index 7a7d9389a..1c2bb768b 100644 --- a/DP/README.md +++ b/DP/README.md @@ -28,7 +28,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 4: Dynamic Programming +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 4: Dynamic Programming ### Exercises diff --git a/FA/README.md b/FA/README.md index 579498c85..f50f56cef 100644 --- a/FA/README.md +++ b/FA/README.md @@ -25,8 +25,8 @@ **Required:** - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 9: On-policy Prediction with Approximation -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 10: On-policy Control with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 9: On-policy Prediction with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 10: On-policy Control with Approximation **Optional:** diff --git a/Introduction/README.md b/Introduction/README.md index f476fabb9..9e5b383ac 100644 --- a/Introduction/README.md +++ b/Introduction/README.md @@ -17,7 +17,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 1: The Reinforcement Learning Problem +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 1: The Reinforcement Learning Problem - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), 
[slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) - [OpenAI Gym Tutorial](https://gym.openai.com/docs) diff --git a/MC/README.md b/MC/README.md index 2c1a512d7..9d23968c2 100644 --- a/MC/README.md +++ b/MC/README.md @@ -26,7 +26,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 5: Monte Carlo Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 5: Monte Carlo Methods **Optional:** diff --git a/MDP/README.md b/MDP/README.md index 404cb141b..539799a09 100644 --- a/MDP/README.md +++ b/MDP/README.md @@ -25,7 +25,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 3: Finite Markov Decision Processes +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 3: Finite Markov Decision Processes - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 3094fb332..1e7a1c68d 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 11: Policy Gradient Methods (Under Construction) +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 11: Policy Gradient Methods (Under Construction) - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) diff --git a/README.md b/README.md index ad2abe1d3..60974e0dd 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https Textbooks: -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) Classes: diff --git a/TD/README.md b/TD/README.md index ac2488167..b54bfead8 100644 --- a/TD/README.md +++ b/TD/README.md @@ -28,14 +28,14 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 6: Temporal-Difference Learning +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 6: Temporal-Difference Learning - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 7: Multi-Step Bootstrapping -- [Reinforcement Learning: An 
Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 12: Eligibility Traces +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 7: Multi-Step Bootstrapping +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 12: Eligibility Traces ### Exercises From 9ee6cdd8494ff529df270d6d07658abbec0d62aa Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 6 Dec 2017 17:16:45 +0900 Subject: [PATCH 16/56] Updated link to Sutton's book --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60974e0dd..43a7be82a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. From dee1e01b6e4ed7cbd90ed603a0bf6ccb396fdcc4 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 7 Dec 2017 15:23:25 +0900 Subject: [PATCH 17/56] DQN: Fixed typos. Changed labmda to gamma. Updated Readme --- DQN/Deep Q Learning Solution.ipynb | 32 +++++++++++++++++-------- DQN/Deep Q Learning.ipynb | 38 ++++++++++-------------------- DQN/Double DQN Solution.ipynb | 38 ++++++++++-------------------- DQN/README.md | 2 +- DQN/dqn.py | 6 ++--- 5 files changed, 50 insertions(+), 66 deletions(-) diff --git a/DQN/Deep Q Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index 7cf615137..1477005ef 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -30,7 +30,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -39,7 +41,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -56,7 +60,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -83,7 +87,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "class Estimator():\n", @@ -193,7 +199,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For Testing....\n", @@ -295,7 +303,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -315,7 +325,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -331,7 +341,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -494,7 +504,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -569,7 +581,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index c3210d2ad..29631ce0a 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -29,9 +29,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -40,9 +38,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -59,7 +55,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -86,9 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -199,9 +193,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -234,9 +226,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def copy_model_parameters(sess, estimator1, estimator2):\n", @@ -294,9 +284,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -316,7 +304,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -332,7 +320,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -469,9 +457,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -528,9 +514,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 22bb9ebc9..7d8411fdd 100644 --- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -28,9 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -39,9 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -58,7 +54,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -85,9 +81,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -175,9 +169,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -210,9 +202,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def copy_model_parameters(sess, estimator1, estimator2):\n", @@ -270,9 +260,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -292,7 +280,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -308,7 +296,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -472,9 +460,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -531,9 +517,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/README.md b/DQN/README.md index eedbbd894..7d0464727 100644 --- a/DQN/README.md +++ b/DQN/README.md @@ -39,7 +39,7 @@ ### Exercises -- [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) +- Get familiar with the [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) - Deep-Q Learning for Atari Games - [Exercise](Deep%20Q%20Learning.ipynb) - [Solution](Deep%20Q%20Learning%20Solution.ipynb) diff --git a/DQN/dqn.py b/DQN/dqn.py index d54d4d1bf..be43ec08b 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -20,7 +20,7 @@ class StateProcessor(): """ - Processes a raw Atari iamges. Resizes it and converts it to grayscale. + Processes a raw Atari images. Resizes it and converts it to grayscale. """ def __init__(self): # Build the Tensorflow graph @@ -208,7 +208,7 @@ def deep_q_learning(sess, batch_size=32, record_video_every=50): """ - Q-Learning algorithm for fff-policy TD control using Function Approximation. + Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: @@ -224,7 +224,7 @@ def deep_q_learning(sess, the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps - discount_factor: Lambda time discount factor + discount_factor: Gamma discount factor epsilon_start: Chance to sample a random action when taking an action. 
Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done From 85565ec067d2856f7d2ac033badd943f035adbf3 Mon Sep 17 00:00:00 2001 From: Alex Bailo Date: Wed, 27 Dec 2017 15:13:47 +0900 Subject: [PATCH 18/56] "Policy Gradient Methods" is chapter 13 now --- PolicyGradient/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 1e7a1c68d..dc534c914 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 11: Policy Gradient Methods (Under Construction) +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods (Under Construction) - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) From f637c42976275d5acd27f4a03779b2e1ddcf8a1a Mon Sep 17 00:00:00 2001 From: Alex Bailo Date: Wed, 27 Dec 2017 15:41:46 +0900 Subject: [PATCH 19/56] "Policy Gradient Methods" chapter is completed. Updated OpenAI Gym link with cached version. --- PolicyGradient/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index dc534c914..373bcee95 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,13 +36,13 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods (Under Construction) +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) - [Deep Deterministic Policy Gradients in TensorFlow](http://pemami4911.github.io/blog_posts/2016/08/21/ddpg-rl.html) - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) -- [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](https://gym.openai.com/docs/rl#policy-gradients) +- [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](http://web.archive.org/web/20161029135055/https://gym.openai.com/docs/rl#id16) From 1f2e2eb50a36655c9a17da8dad7b533d72333c5e Mon Sep 17 00:00:00 2001 From: Alex Bailo Date: Wed, 27 Dec 2017 17:18:09 +0900 Subject: [PATCH 20/56] Fixed broken links to Solutions in PolicyGradient --- PolicyGradient/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 373bcee95..8d77199fb 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -53,10 +53,10 @@ - [Solution](CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) - Actor-Critic with Baseline - Exercise - - [Solution](CliffWalk%20Actor-Critic%20Solution.ipynb) + - [Solution](CliffWalk%20Actor%20Critic%20Solution.ipynb) - Actor-Critic 
with Baseline for Continuous Action Spaces - Exercise - - [Solution](Continuous%20MountainCar%20Actor-Critic%20Solution.ipynb) + - [Solution](Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (WIP) - Asynchronous Advantage Actor-Critic (A3C) From 783c2c39a3a94df40b5a62ab768e1570f95c776b Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 28 Dec 2017 10:16:48 +0900 Subject: [PATCH 21/56] Mod. estimator_value comment in actor-critic --- .../CliffWalk Actor Critic Solution.ipynb | 26 ++++++------------- ...us MountainCar Actor Critic Solution.ipynb | 8 +++--- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb index 0e952a07c..0a8fb509e 100644 --- a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb +++ b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -29,9 +27,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -88,9 +84,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class ValueEstimator():\n", @@ -145,7 +139,7 @@ " Args:\n", " env: OpenAI environment.\n", " estimator_policy: Policy Function to be optimized \n", - " estimator_value: Value function approximator, used as a baseline\n", + " estimator_value: Value function approximator, used as a critic\n", " num_episodes: Number of episodes to run for\n", " discount_factor: Time-discount factor\n", " \n", @@ -209,9 +203,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -238,9 +230,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -306,9 +296,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.0" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb index 4cbc43d27..6b34a0b62 100644 --- a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb +++ b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb @@ -233,7 +233,7 @@ " Args:\n", " env: OpenAI environment.\n", " estimator_policy: Policy Function to be optimized \n", - " estimator_value: Value function approximator, used as a baseline\n", + " estimator_value: Value function approximator, used as a critic\n", " num_episodes: Number of episodes to run for\n", " discount_factor: Time-discount factor\n", " \n", @@ -343,7 +343,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "plotting.plot_episode_stats(stats, smoothing_window=10)" @@ -384,7 +386,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.5.2" } }, "nbformat": 4, From d8136b4c575001a6ae4ff67f4362ced9c6cf8846 Mon Sep 17 00:00:00 2001 From: 
Alex Date: Wed, 3 Jan 2018 14:58:39 +0900 Subject: [PATCH 22/56] Updated links to new version of Sutton's book --- DP/README.md | 2 +- FA/README.md | 4 ++-- Introduction/README.md | 2 +- MC/README.md | 2 +- MDP/README.md | 2 +- PolicyGradient/README.md | 2 +- README.md | 4 ++-- TD/README.md | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/DP/README.md b/DP/README.md index 1c2bb768b..cf2cbf51f 100644 --- a/DP/README.md +++ b/DP/README.md @@ -28,7 +28,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 4: Dynamic Programming +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 4: Dynamic Programming ### Exercises diff --git a/FA/README.md b/FA/README.md index f50f56cef..247c41e4e 100644 --- a/FA/README.md +++ b/FA/README.md @@ -25,8 +25,8 @@ **Required:** - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 9: On-policy Prediction with Approximation -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 10: On-policy Control with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 9: On-policy Prediction with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 10: On-policy Control with Approximation **Optional:** diff --git a/Introduction/README.md b/Introduction/README.md index 9e5b383ac..cd27a4e12 100644 --- a/Introduction/README.md +++ b/Introduction/README.md @@ -17,7 +17,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 1: The Reinforcement Learning Problem +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 1: The Reinforcement Learning Problem - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) - [OpenAI Gym Tutorial](https://gym.openai.com/docs) diff --git a/MC/README.md b/MC/README.md index 9d23968c2..7b889ed6f 100644 --- a/MC/README.md +++ b/MC/README.md @@ -26,7 +26,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 5: Monte Carlo Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 5: Monte Carlo Methods **Optional:** diff --git a/MDP/README.md b/MDP/README.md index 539799a09..de9bcce35 100644 --- a/MDP/README.md +++ b/MDP/README.md @@ -25,7 +25,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 3: Finite Markov Decision Processes +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 3: Finite Markov Decision Processes - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), 
[slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 8d77199fb..a7dffdeef 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 13: Policy Gradient Methods - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) diff --git a/README.md b/README.md index 43a7be82a..72a11e5a9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. @@ -50,7 +50,7 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https Textbooks: -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) Classes: diff --git a/TD/README.md b/TD/README.md index b54bfead8..a4c35a0e9 100644 --- a/TD/README.md +++ b/TD/README.md @@ -28,14 +28,14 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 6: Temporal-Difference Learning +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 6: Temporal-Difference Learning - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 7: Multi-Step Bootstrapping -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 12: Eligibility Traces +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 7: Multi-Step Bootstrapping +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 12: Eligibility Traces ### Exercises From 30326df0cf66d649c1619ce2e3134fc2839dcde9 Mon Sep 17 00:00:00 
2001 From: Keith Gould Date: Wed, 24 Jan 2018 10:03:44 -0500 Subject: [PATCH 23/56] update value estimator only after calculating advantage --- .../CliffWalk REINFORCE with Baseline Solution.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb index 4291d5551..cad46261d 100644 --- a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb +++ b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb @@ -196,11 +196,11 @@ " for t, transition in enumerate(episode):\n", " # The return after this timestep\n", " total_return = sum(discount_factor**i * t.reward for i, t in enumerate(episode[t:]))\n", - " # Update our value estimator\n", - " estimator_value.update(transition.state, total_return)\n", " # Calculate baseline/advantage\n", " baseline_value = estimator_value.predict(transition.state) \n", " advantage = total_return - baseline_value\n", + " # Update our value estimator\n", + " estimator_value.update(transition.state, total_return)\n", " # Update our policy estimator\n", " estimator_policy.update(transition.state, advantage, transition.action)\n", " \n", From 9454010f60a87a6e66a517fdb038365b9988146c Mon Sep 17 00:00:00 2001 From: Byzantine Date: Sun, 28 Jan 2018 12:30:06 -0800 Subject: [PATCH 24/56] Minor fix: sync sample policy with the solution --- MC/MC Prediction.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MC/MC Prediction.ipynb b/MC/MC Prediction.ipynb index 472f9ef35..13b3da809 100644 --- a/MC/MC Prediction.ipynb +++ b/MC/MC Prediction.ipynb @@ -87,7 +87,7 @@ " A policy that sticks if the player score is > 20 and hits otherwise.\n", " \"\"\"\n", " score, dealer_score, usable_ace = observation\n", - " return np.array([1.0, 0.0]) if score >= 20 else np.array([0.0, 1.0])" + " return 0 if score >= 20 else 1" ] }, { From 6211e2df03162f367c7a8c05728897385397ae35 Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 10:45:30 -0500 Subject: [PATCH 25/56] Add one step lookahead function for easy comparison with Value Iteration --- DP/Policy Evaluation Solution.ipynb | 42 ++++++-------- DP/Policy Iteration Solution.ipynb | 89 +++++++++++++++-------------- 2 files changed, 62 insertions(+), 69 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index 703e020fb..d69fe2546 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -2,12 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ + "from IPython.core.debugger import set_trace\n", "import numpy as np\n", "import pprint\n", "import sys\n", @@ -18,10 +17,8 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +27,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", @@ -76,10 +71,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "random_policy = 
np.ones([env.nS, env.nA]) / env.nA\n", @@ -88,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -98,7 +91,8 @@ "Value Function:\n", "[ 0. -13.99993529 -19.99990698 -21.99989761 -13.99993529\n", " -17.9999206 -19.99991379 -19.99991477 -19.99990698 -19.99991379\n", - " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569 0. ]\n", + " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569\n", + " 0. ]\n", "\n", "Reshaped Grid Value Function:\n", "[[ 0. -13.99993529 -19.99990698 -21.99989761]\n", @@ -121,10 +115,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "# Test: Make sure the evaluated policy is what we expected\n", @@ -135,9 +127,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -158,7 +148,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index be7d3710e..dc121c8c5 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -19,9 +17,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +26,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "# Taken from Policy Evaluation Exercise!\n", @@ -78,10 +72,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", @@ -102,6 +94,24 @@ " V is the value function for the optimal policy.\n", " \n", " \"\"\"\n", + "\n", + " def one_step_lookahead(state, V):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " state: The state to consider (int)\n", + " V: The value to use as an estimator, Vector of length env.nS\n", + " \n", + " Returns:\n", + " A vector of length env.nA containing the expected value of each action.\n", + " \"\"\"\n", + " A = np.zeros(env.nA)\n", + " for a in range(env.nA):\n", + " for prob, next_state, reward, done in env.P[state][a]:\n", + " A[a] += prob * (reward + discount_factor * V[next_state])\n", + " return A\n", + " \n", " # Start with a random policy\n", " policy = np.ones([env.nS, env.nA]) / env.nA\n", " \n", @@ -119,10 +129,7 @@ " \n", " # Find the best action by one-step lookahead\n", " # Ties are resolved arbitarily\n", - " action_values = np.zeros(env.nA)\n", - " for a in range(env.nA):\n", - " for prob, next_state, reward, done in env.P[s][a]:\n", - " action_values[a] += prob * (reward + discount_factor * V[next_state])\n", + " action_values = one_step_lookahead(s, V)\n", " best_a = np.argmax(action_values)\n", " \n", " # Greedily update the policy\n", @@ -137,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 64, + 
"execution_count": 5, "metadata": {}, "outputs": [ { @@ -145,22 +152,22 @@ "output_type": "stream", "text": [ "Policy Probability Distribution:\n", - "[[ 1. 0. 0. 0.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 1. 0. 0. 0.]]\n", + "[[1. 0. 0. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [1. 0. 0. 0.]]\n", "\n", "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", "[[0 3 3 2]\n", @@ -202,10 +209,8 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "# Test the value function\n", @@ -216,9 +221,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -239,7 +242,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, From e030ecfe1e980189106d6d58bb69967819b78a7d Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 10:45:54 -0500 Subject: [PATCH 26/56] Add value check assertion --- DP/Value Iteration Solution.ipynb | 69 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index cd0da629f..c7134dff3 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -18,10 +16,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +26,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -100,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -108,22 +102,22 @@ "output_type": "stream", "text": [ "Policy Probability Distribution:\n", - "[[ 1. 0. 0. 0.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 1. 0. 0. 0.]]\n", + "[[1. 0. 0. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 
0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [1. 0. 0. 0.]]\n", "\n", "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", "[[0 3 3 2]\n", @@ -163,12 +157,21 @@ "print(\"\")" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Test the value function\n", + "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", + "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -190,7 +193,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, From edcba6b8790dbfda151c319378d48ee349a0f4ac Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 14:20:31 -0500 Subject: [PATCH 27/56] Fix step and reset NotImplementedError --- MC/Blackjack Playground.ipynb | 171 ++++++---------------------------- lib/envs/blackjack.py | 8 +- 2 files changed, 34 insertions(+), 145 deletions(-) diff --git a/MC/Blackjack Playground.ipynb b/MC/Blackjack Playground.ipynb index 28dfc1867..f4f6ffe84 100644 --- a/MC/Blackjack Playground.ipynb +++ b/MC/Blackjack Playground.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 419, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -17,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 420, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -26,151 +24,29 @@ }, { "cell_type": "code", - "execution_count": 422, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Player Score: 17 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 18 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 6 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 26 (Usable Ace: False), Dealer Score: 9\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 6\n", - "Taking action: Hit\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", - "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 17 (Usable Ace: True), Dealer Score: 8\n", - "Taking action: Hit\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", - "Taking action: Hit\n", - "Player Score: 22 (Usable Ace: False), Dealer Score: 8\n", - "Game end. Reward: -1.0\n", - "\n", "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", - "Taking action: Hit\n", - "Player Score: 27 (Usable Ace: False), Dealer Score: 8\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", - "Game end. 
Reward: -1.0\n", - "\n", - "Player Score: 13 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 7\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Hit\n", - "Player Score: 25 (Usable Ace: False), Dealer Score: 5\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 12 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", - "Game end. Reward: 0.0\n", - "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", - "Taking action: Hit\n", - "Player Score: 22 (Usable Ace: False), Dealer Score: 4\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: 0.0\n", - "\n", - "Player Score: 4 (Usable Ace: False), Dealer Score: 3\n", - "Taking action: Hit\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 3\n", - "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 3\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 9 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Hit\n", - "Player Score: 15 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Hit\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", - "Game end. 
Reward: 1.0\n", - "\n", - "Player Score: 11 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 13 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 29 (Usable Ace: False), Dealer Score: 9\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 29 (Usable Ace: False), Dealer Score: 7\n", - "Game end. Reward: -1.0\n", - "\n" + "Taking action: Hit\n" + ] + }, + { + "ename": "RecursionError", + "evalue": "maximum recursion depth exceeded", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRecursionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstrategy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Taking action: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"Stick\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Hit\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mobservation\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mprint_observation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "... 
last 1 frames repeated, from the frame below ...\n", + "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded" ] } ], @@ -197,6 +73,13 @@ " print(\"Game end. Reward: {}\\n\".format(float(reward)))\n", " break" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -215,7 +98,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, diff --git a/lib/envs/blackjack.py b/lib/envs/blackjack.py index 158c49709..9052b4677 100644 --- a/lib/envs/blackjack.py +++ b/lib/envs/blackjack.py @@ -79,6 +79,12 @@ def __init__(self, natural=False): self._reset() # Number of self.nA = 2 + def reset(self): + return self._reset() + + def step(self, action): + return self._step(action) + def _seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] @@ -113,4 +119,4 @@ def _reset(self): while sum_hand(self.player) < 12: self.player.append(draw_card(self.np_random)) - return self._get_obs() \ No newline at end of file + return self._get_obs() From ba12f971f316078adc20ae898ff8de65491c1925 Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 14:27:28 -0500 Subject: [PATCH 28/56] Update playground output --- MC/Blackjack Playground.ipynb | 144 ++++++++++++++++++++++++++++++---- 1 file changed, 128 insertions(+), 16 deletions(-) diff --git a/MC/Blackjack Playground.ipynb b/MC/Blackjack Playground.ipynb index f4f6ffe84..412322175 100644 --- a/MC/Blackjack Playground.ipynb +++ b/MC/Blackjack Playground.ipynb @@ -31,22 +31,134 @@ "name": "stdout", "output_type": "stream", "text": [ - "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", - "Taking action: Hit\n" - ] - }, - { - "ename": "RecursionError", - "evalue": "maximum recursion depth exceeded", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRecursionError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstrategy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Taking action: 
{}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"Stick\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Hit\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mobservation\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mprint_observation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "... last 1 frames repeated, from the frame below ...\n", - "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded" + "Player Score: 19 (Usable Ace: False), Dealer Score: 5\n", + "Taking action: Hit\n", + "Player Score: 27 (Usable Ace: False), Dealer Score: 5\n", + "Game end. 
Reward: -1.0\n", + "\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Game end. Reward: 0.0\n", + "\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 14 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 19 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 6\n", + "Taking action: Hit\n", + "Player Score: 27 (Usable Ace: False), Dealer Score: 6\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 3\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 3\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 3\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Hit\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 14 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 16 (Usable Ace: True), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: True), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", + "Game end. 
Reward: 1.0\n", + "\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 12 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Hit\n", + "Player Score: 24 (Usable Ace: False), Dealer Score: 4\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 7\n", + "Taking action: Hit\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 8\n", + "Game end. Reward: -1.0\n", + "\n" ] } ], From 8da669c1496a617de8cbdf8c62ef075a4b9d8f3f Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Tue, 20 Feb 2018 16:42:36 -0500 Subject: [PATCH 29/56] Fix missing render() --- TD/Cliff Environment Playground.ipynb | 19 +++++++++++-------- TD/Windy Gridworld Playground.ipynb | 23 +++++++++++++---------- lib/envs/cliff_walking.py | 5 ++++- lib/envs/windy_gridworld.py | 5 ++++- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/TD/Cliff Environment Playground.ipynb b/TD/Cliff Environment Playground.ipynb index d50da42b6..414cf811d 100644 --- a/TD/Cliff Environment Playground.ipynb +++ b/TD/Cliff Environment Playground.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import gym\n", @@ -21,9 +19,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -80,6 +76,13 @@ "print(env.step(2))\n", "env.render()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -98,9 +101,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/Windy Gridworld Playground.ipynb b/TD/Windy Gridworld Playground.ipynb index 7c37d7857..0572c0d86 100644 --- a/TD/Windy Gridworld Playground.ipynb +++ b/TD/Windy Gridworld Playground.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import gym\n", @@ -20,10 +18,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 2, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -119,6 +115,13 @@ "print(env.step(1))\n", "env.render()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] } ], "metadata": { @@ -137,9 +140,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/lib/envs/cliff_walking.py b/lib/envs/cliff_walking.py index 37516ad1b..30b2ff7bb 100644 --- a/lib/envs/cliff_walking.py +++ b/lib/envs/cliff_walking.py @@ -53,6 +53,9 @@ def __init__(self): super(CliffWalkingEnv, self).__init__(nS, nA, P, isd) + def render(self, mode='human', close=False): + self._render(mode, close) + def _render(self, mode='human', close=False): if close: return @@ -78,4 +81,4 @@ def _render(self, mode='human', close=False): output += "\n" outfile.write(output) - outfile.write("\n") \ No newline at end of file + outfile.write("\n") diff --git a/lib/envs/windy_gridworld.py b/lib/envs/windy_gridworld.py index 7524dbd58..720c5974b 100644 --- a/lib/envs/windy_gridworld.py +++ b/lib/envs/windy_gridworld.py @@ -53,6 +53,9 @@ def __init__(self): super(WindyGridworldEnv, self).__init__(nS, nA, P, isd) + def render(self, mode='human', close=False): + self._render(mode, close) + def _render(self, mode='human', close=False): if close: return @@ -76,4 +79,4 @@ def _render(self, mode='human', close=False): output += "\n" outfile.write(output) - outfile.write("\n") \ No newline at end of file + outfile.write("\n") From 542cbf04e553b9bbac7c4dc7e0dfd69dacb458f5 Mon Sep 17 00:00:00 2001 From: jonahweissman <19804455+jonahweissman@users.noreply.github.com> Date: Wed, 7 Mar 2018 18:13:01 -0500 Subject: [PATCH 30/56] Fix typo in MC Control dictionar -> dictionary --- MC/MC Control with Epsilon-Greedy Policies Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb index 0f10d783e..40af11f40 100644 --- a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb @@ -139,7 +139,7 @@ " returns_count[sa_pair] += 1.0\n", " Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]\n", " \n", - " # The policy is improved implicitly by changing the Q dictionar\n", + " # The policy is improved implicitly by changing the Q dictionary\n", " \n", " return Q, policy" ] From c90ebaf06ab507d4a2cb7eaa9bf382bb0f94d2ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ayberk=20Ayd=C4=B1n?= Date: Fri, 13 Apr 2018 18:20:08 +0300 Subject: [PATCH 31/56] correction for state processor output shape --- DQN/Deep Q Learning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index 29631ce0a..fcd7191a8 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -74,7 +74,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] From 56f893c059be47d3d36cf5fbdf9a5bb1270ef182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ayberk=20Ayd=C4=B1n?= Date: Sat, 14 Apr 2018 14:16:46 +0300 Subject: [PATCH 32/56] typo fix and correction for state processor output shape --- DQN/Deep Q Learning Solution.ipynb | 4 ++-- DQN/Deep Q Learning.ipynb | 2 +- DQN/Double DQN Solution.ipynb | 2 +- DQN/dqn.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/DQN/Deep Q 
Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index 1477005ef..fc88b90ae 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -79,7 +79,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] @@ -144,7 +144,7 @@ " gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl\n", " self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)\n", "\n", - " # Calcualte the loss\n", + " # Calculate the loss\n", " self.losses = tf.squared_difference(self.y_pl, self.action_predictions)\n", " self.loss = tf.reduce_mean(self.losses)\n", "\n", diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index fcd7191a8..d3a51697f 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -137,7 +137,7 @@ " gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl\n", " self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)\n", "\n", - " # Calcualte the loss\n", + " # Calculate the loss\n", " self.losses = tf.squared_difference(self.y_pl, self.action_predictions)\n", " self.loss = tf.reduce_mean(self.losses)\n", "\n", diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 7d8411fdd..3fc45722b 100644 --- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -73,7 +73,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] diff --git a/DQN/dqn.py b/DQN/dqn.py index be43ec08b..9d6532a8a 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -39,7 +39,7 @@ def process(self, sess, state): state: A [210, 160, 3] Atari RGB State Returns: - A processed [84, 84, 1] state representing grayscale values. + A processed [84, 84] state representing grayscale values. """ return sess.run(self.output, { self.input_state: state }) @@ -95,7 +95,7 @@ def _build_model(self): gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices) - # Calcualte the loss + # Calculate the loss self.losses = tf.squared_difference(self.y_pl, self.action_predictions) self.loss = tf.reduce_mean(self.losses) From 07dd722024306da428923bd2d7a64beb689ef6be Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sat, 26 May 2018 15:26:45 -0700 Subject: [PATCH 33/56] added the equation reference --- DP/Policy Evaluation Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index d69fe2546..0b06f87e7 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -58,7 +58,7 @@ " for a, action_prob in enumerate(policy[s]):\n", " # For each action, look at the possible next states...\n", " for prob, next_state, reward, done in env.P[s][a]:\n", - " # Calculate the expected value\n", + " # Calculate the expected value. Ref: Sutton book eq. 
4.6.\n", " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", " # How much our value function changed (across any states)\n", " delta = max(delta, np.abs(v - V[s]))\n", From 377c87595ae903e12df44886dba50ee40091a934 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sat, 26 May 2018 15:38:42 -0700 Subject: [PATCH 34/56] added Sutton book's equation --- DP/Value Iteration Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index c7134dff3..90ec96a17 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -74,7 +74,7 @@ " best_action_value = np.max(A)\n", " # Calculate delta across all states seen so far\n", " delta = max(delta, np.abs(best_action_value - V[s]))\n", - " # Update the value function\n", + " # Update the value function. Ref: Sutton book eq. 4.10. \n", " V[s] = best_action_value \n", " # Check if we can stop \n", " if delta < theta:\n", From 1b5c06f5b00bfa16a8138644387b013e15fbec29 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sun, 27 May 2018 17:13:36 -0700 Subject: [PATCH 35/56] Gambler's problem (ex.4.3) added. --- DP/Gamblers Problem Solution.ipynb | 289 +++++++++++++++++++++++++++++ DP/Gamblers Problem.ipynb | 154 +++++++++++++++ DP/README.md | 4 + 3 files changed, 447 insertions(+) create mode 100644 DP/Gamblers Problem Solution.ipynb create mode 100644 DP/Gamblers Problem.ipynb diff --git a/DP/Gamblers Problem Solution.ipynb b/DP/Gamblers Problem Solution.ipynb new file mode 100644 index 000000000..d3880ef80 --- /dev/null +++ b/DP/Gamblers Problem Solution.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", + "\n", + "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", + "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", + "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", + "or loses by running out of money. \n", + "\n", + "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", + "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", + "\n", + "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", + "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "if \"../\" not in sys.path:\n", + " sys.path.append(\"../\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "\n", + "### Exercise 4.9 (programming)\n", + "\n", + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", + " \"\"\"\n", + " Args:\n", + " p_h: Probability of the coin coming up heads\n", + " \"\"\"\n", + " # The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + " rewards = np.zeros(101)\n", + " rewards[100] = 1 \n", + " \n", + " # We introduce two dummy states corresponding to termination with capital of 0 and 100\n", + " V = np.zeros(101)\n", + " \n", + " def one_step_lookahead(s, V, rewards):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " s: The gambler’s capital. Integer.\n", + " V: The vector that contains values at each state. \n", + " rewards: The reward vector.\n", + " \n", + " Returns:\n", + " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " \"\"\"\n", + " A = np.zeros(101)\n", + " stakes = range(1, min(s, 100-s)+1) # Your minimum bet is 1, maximum bet is min(s, 100-s).\n", + " for a in stakes:\n", + " # rewards[s+a], rewards[s-a] are immediate rewards.\n", + " # V[s+a], V[s-a] are values of the next states.\n", + " # This is the core of the Bellman equation: \n", + " # The expected value of your action is the sum of immediate rewards and the value of the next state.\n", + " A[a] = p_h * (rewards[s+a] + V[s+a]*discount_factor) + (1-p_h) * (rewards[s-a] + V[s-a]*discount_factor)\n", + " return A\n", + " \n", + " while True:\n", + " # Stopping condition\n", + " delta = 0\n", + " # Update each state...\n", + " for s in range(1, 100):\n", + " # Do a one-step lookahead to find the best action\n", + " A = one_step_lookahead(s, V, rewards)\n", + " # print(s,A,V) # if you want to debug.\n", + " best_action_value = np.max(A)\n", + " # Calculate delta across all states seen so far\n", + " delta = max(delta, np.abs(best_action_value - V[s]))\n", + " # Update the value function. Ref: Sutton book eq. 4.10. \n", + " V[s] = best_action_value \n", + " # Check if we can stop \n", + " if delta < theta:\n", + " break\n", + " \n", + " # Create a deterministic policy using the optimal value function\n", + " policy = np.zeros(100)\n", + " for s in range(1, 100):\n", + " # One step lookahead to find the best action for this state\n", + " A = one_step_lookahead(s, V, rewards)\n", + " best_action = np.argmax(A)\n", + " # Always take the best action\n", + " policy[s] = best_action\n", + " \n", + " return policy, V" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimized Policy:\n", + "[ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 12. 11. 15. 16. 17.\n", + " 18. 6. 20. 21. 3. 23. 24. 25. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.\n", + " 11. 12. 38. 11. 10. 9. 42. 7. 44. 5. 46. 47. 48. 49. 50. 1. 2. 3.\n", + " 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 11. 10. 9. 17. 7. 19. 5. 21.\n", + " 22. 23. 24. 25. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 12. 11.\n", + " 10. 9. 8. 7. 6. 5. 4. 3. 2. 
1.]\n", + "\n", + "Optimized Value Function:\n", + "[0.00000000e+00 7.24792480e-05 2.89916992e-04 6.95257448e-04\n", + " 1.16010383e-03 1.76906586e-03 2.78102979e-03 4.03504074e-03\n", + " 4.66214120e-03 5.59997559e-03 7.08471239e-03 9.03964043e-03\n", + " 1.11241192e-02 1.56793594e-02 1.61464431e-02 1.69517994e-02\n", + " 1.86512806e-02 1.98249817e-02 2.24047303e-02 2.73845196e-02\n", + " 2.83388495e-02 3.04937363e-02 3.61633897e-02 3.84953022e-02\n", + " 4.44964767e-02 6.25000000e-02 6.27174377e-02 6.33700779e-02\n", + " 6.45857723e-02 6.59966059e-02 6.78135343e-02 7.08430894e-02\n", + " 7.46098323e-02 7.64884604e-02 7.93035477e-02 8.37541372e-02\n", + " 8.96225423e-02 9.58723575e-02 1.09538078e-01 1.10939329e-01\n", + " 1.13360151e-01 1.18457374e-01 1.21977661e-01 1.29716907e-01\n", + " 1.44653559e-01 1.47520113e-01 1.53983246e-01 1.70990169e-01\n", + " 1.77987434e-01 1.95990576e-01 2.50000000e-01 2.50217438e-01\n", + " 2.50870078e-01 2.52085772e-01 2.53496606e-01 2.55313534e-01\n", + " 2.58343089e-01 2.62109832e-01 2.63988460e-01 2.66803548e-01\n", + " 2.71254137e-01 2.77122542e-01 2.83372357e-01 2.97038078e-01\n", + " 2.98439329e-01 3.00860151e-01 3.05957374e-01 3.09477661e-01\n", + " 3.17216907e-01 3.32153559e-01 3.35020113e-01 3.41483246e-01\n", + " 3.58490169e-01 3.65487434e-01 3.83490576e-01 4.37500000e-01\n", + " 4.38152558e-01 4.40122454e-01 4.43757317e-01 4.47991345e-01\n", + " 4.53440603e-01 4.62529268e-01 4.73829497e-01 4.79468031e-01\n", + " 4.87912680e-01 5.01265085e-01 5.18867627e-01 5.37617932e-01\n", + " 5.78614419e-01 5.82817988e-01 5.90080452e-01 6.05372123e-01\n", + " 6.15934510e-01 6.39150720e-01 6.83960814e-01 6.92560339e-01\n", + " 7.11950883e-01 7.62970611e-01 7.83963162e-01 8.37972371e-01\n", + " 0.00000000e+00]\n", + "\n" + ] + } + ], + "source": [ + "policy, v = value_iteration_for_gamblers(0.25)\n", + "\n", + "print(\"Optimized Policy:\")\n", + "print(policy)\n", + "print(\"\")\n", + "\n", + "print(\"Optimized Value Function:\")\n", + "print(v)\n", + "print(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Show your results graphically, as in Figure 4.3.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3Xd8HNW5//HPI8mqlmRky7jjbmMb\nQhGmJKGH0OEmJIFAQkng5hJCCKSQhCSENNJuknshxaH3UPIjhksghNCbLReMC26Si1xlSbZlyerP\n748ZKWtZZW1rtCrf9+u1L+3Mnp15zs5qnznnTDF3R0REBCAp0QGIiEjPoaQgIiItlBRERKSFkoKI\niLRQUhARkRZKCiIi0kJJIUJmNsbMdplZchcs6z4z+3FXxNVquW5mE8PnfzSz70Wwjv80s99GsNxL\nzewfXb3crrS/283MDjezt6KIqaczszPM7OluWtd3zOyuA3j/GjM7PXx+vZnd3nXRJYaSQhcIvxi7\nwwTQ/Bjh7uvcfaC7N0a8/ivMrDFc704zW2hm5+7rctz9S+7+oy6OLRW4BfjlAS5nbJjAUprnufvD\n7n7Ggca4j3G0/AhEyd0XAdvN7Lwo12NmqWb2azMrCb8/xWb2m5jX96m+XbTz8lOg5cfVAteb2WIz\nqwpjfcLMDjvA9eDuP3X3L4br2es7to9mAZeZ2dADjSuRlBS6znlhAmh+bOzm9b/t7gOBQcDdwONm\nltfNMbTlAuADd9+Q6EB6oYeB/4x4Hd8GCoCZQDZwCrAg4nW2y8yOAXLd/Z2Y2b8DvgpcD+QBk4Gn\ngXO6P8L2uXsN8Hfg84mO5UAoKUSo9Z6Hmb1iZj8yszfNrNLM/mFmQ2LKP2Fmm81sh5m9ZmbT93Wd\n7t4E3ANkAOPD5V5tZqvMrNzMZpvZiHbi3WMvz8wuCFsdO81stZmdaWafMrN5rd53UwfN/bOAV1uV\nb7eeZpYR7rmuDV9/w8wygNfCItvDPdrjwxbSGzHvPcHM5obvm2tmJ8S81uFn3yq+IWb2rJltDz+z\n180sycweBMYAz4QxfLOz+rRabraZvWxm/xPu/aaZ2a/MbJ2ZbbGg+y4j5i2vAKeZWVoby7rYzApb\nzfuamc0On59tZkvDum4ws6+3FRNwDPD/3H2jB9a4+wPhMvapvmZ2DXAp8M2w/DPh/BFm9pSZlVrQ\nErm+nVig1ffFzCYBXwYucfd/uXutu1eHrcTbwzLnmNmC8Hu63sxujXl/8//gNWa20cw2mdlNMa/f\namYPhZNtfccmmNm/zKzMzLaZ2cNmNqiD+F+hhyWrfebuehzgA1gDnN7G/LGAAynh9CvAaoI9nYxw\n+vaY8lcR7K2lAb8FFsa8dh/w43bWfwXwRvg8hWCvqhLIBU4FtgFHhcv9X+C1mPc6MLH1Ogj2HHcA\nHyPYeRgJTA2XUQ4cGrOMBcAn24ltLvCpVvM6qued4ecyEkgGTgjL7fFZtlHvPKAC+Fz4GVwSTg+O\n57NvFd/PgD8CA8LHRwFrb1vHs92AwcCc2G0Ylp0dxp4NPAP8rNWydwKHtxFjZriNJ7X6rC8On28C\nPho+Pwg4qp263gKsA64FDmuuZ0ff7XjqGzOdBMwDvg+kEuyoFAEfbyeeJ4BvxEx/CVjbyf/fyWHs\nScDhwBbgwlb/g48CWWG50uY6AbcCD7X1/xrOm0jwP5AG5BMkjt+29/kQ/J+VJ/L36EAfCQ+gLzzC\nL8YuYHv4eDqcv8eXjOCH6JaY910LPN/OMgeF780Np/f4Z2tV9gqgIVz3NuCdmC/93cAvYsoOBOqB\nseF0e0nhT8Bv2lnfH4CfhM+nE/z4prVTdiVwZgefXUs9w3/q3cCH2ijX1j/sFfw7KXwOmNPqPW8D\nV+zHZ38b8Lfmz6WNbb3XDkAn2+0eYDF7/tgZUAVMiJl3PFDcankbgBPbWddDwPfD55MIkkRmOL2O\noOspp5PvbjLBnvibQC2wEbj8AOsbmxSOBda1es+3gXvbWd6LwJdipr8LvLOP/4+/bf7uxnxvpsa8\n/gvg7vD5rXSQFNpY9oXAgvY+n3A7NO5LvD3toe6jrnOhuw8KHxd2UG5zzPNqgh9pzCzZzG4Pu2l2\nEnzZANrs4mjDO+G6h7j7ce7+z3D+CGBtcyF33wWUEeyJd2Q0wZ51W+4HPmtmRvBj/Li717ZTtoJg\nrxLotJ5DgPQO1tuRPeoZWsue9Wzzs2/DL4FVwD/MrMjMbm5vpXFut3MIWid/jJmXT7C3Py/sptoO\nPB/Oj5VNkOzb8ghBiwjgswQ7I9Xh9CeBs4G1ZvaqmR3f1gLcvdHd73T3DxP8wP8EuMfMDj2A+sY6\nBBjRXMewnt8BDm6n/B7fF4Lv6vB2yjbHdGzYLVdqZjsIWhet41kf83wtwfelU2Y21MweC7vgdhIk\n4o7+J7MJWti9lpJCz/FZgkHZ0wn2mseG8+0Al7uR4B8zWJhZFkFXRmcDv+uBCW294MEgYB1Bt8pn\ngQc7WM4igi6bZh3VcxtQ0856O7uc7x71DI2h83ruvSL3Sne/yd3HA+cBN5rZae3EEc92+zPBD/5z\n4ecPQV13A9NjdiZyPThYIFhAMPaTCixvJ9R/AEPM7AiC5PBITB3muvsFwFCCQdnH46j3bne/k+CH\nedp+1rd1+fUErZ9BMY9sdz+7nTBaf19eAkaZWUEHoT9C0A032t1zCZJv6/+b0THPxxB8X1pr6zv2\ns3D+4e6eA1zWxrJjHQq818HrPZ6SQs+RTdB8LyPYg/xpFy33EeBKMzsiHLD8KfCuu6/p5H13h+87\nLRxkHWlmU2NefwC4A2hw9zfaXgQAzwEnxUy3W0//9yD5f4eDk8nhYF8aQT9wE+HgeTvrmWxmnzWz\nFDP7DMEP27Od1HMvZnaumU0MW0I7gcbwAUF/dWwM8W636wh+3J81s4ywrn8GfmPhIYzhZ/zxmPec\nDPyrvVaYuzcATxK0bPIIul6aDzO91Mxy3b0+pg5t1fUGMzvZggH+FDO7PKxT8xFI+1rf1uXnADvN\n7FvhOpLNbIYFRxm1ZY/vi7uvBH4PPBrGmWpm6RYMtDe34LIJ+vFrzGwmQeJq7XtmlhkOil8J/KWN\nMm19x7IJu4bNbCTwjXbibnYSwRFIvZaSQs/xAEGzdgOwlGBc4IC5+0vA94CnCAYfJwAXx/G+OQT/\nPL8haA6/yp574g8CM+i4lQDB4OlU+/cRT53V8+vA+wSDpuXAz4GksFvkJ8CbYTfEca3iLQPOBW4i\n+MH6JnCuu2/rrK5tmAT8k+DH4G3g9+7+Svjaz4Bbwhi+Hkd9muNz4BqCPee/mVk68C2Cbqp3wq6J\nfwJTYt52KXt2ObXlEYK99ifCJNHsc8CacLlfItjDbctu4NcEXWvbCMYXPunuRftZ37uBaWH5pz04\nR+c84AigOFzHXQStjL24+3xgh5kdGzP7eoIdkDsJutJW
A/9B8N2CYHzoNjOrJBjQbqtV9CrBZ/0S\n8Ct33+ukx3a+Yz8kGDzeAfwf8Ne24gYIt+nZBN2rvVbzERUi+8SCQye3EhzVsrKTstcA09z9hm4J\nrg+w4MSsWe7e5lhAX2ZmZwDXdjI2F++yxhIkowGtkmaXM7OvEHRhfTPK9URNSUH2i5ndSLAnfmqi\nYxFpT3cmhb5if0/nln7MzNYQDLYd8J6ciPQsaimIiEgLDTSLiEiLXtd9NGTIEB87dmyiwxAR6VXm\nzZu3zd1bnxy5l16XFMaOHUthYWHnBUVEpIWZtT7jv03qPhIRkRZKCiIi0kJJQUREWigpiIhICyUF\nERFpoaQgIiItlBRERKSFkoKISA/X1OT89LllLCpp7yZ8XUdJQUSkh1uxtZJZrxWxcsuuyNelpCAi\n0sPNKS4HYOa4vMjXpaQgItLDvVtczojcdEYdlBH5upQURER6MHdnTnE5M8flEdw2PFpKCiIiPdja\nsmpKK2s5phu6jkBJQUSkR2seTzi2LyQFMzvTzJab2Sozu7mN18eY2ctmtsDMFpnZ2VHGIyLS27xb\nXE5eVioT8gd2y/oiSwpmlgzcCZwFTAMuMbNprYrdAjzu7kcCFwO/jyoeEZHeaM6aMmaO7Z7xBIi2\npTATWOXuRe5eBzwGXNCqjAM54fNcYGOE8YiI9CqbduxmffnubjkUtVmUSWEksD5muiScF+tW4DIz\nKwGeA77S1oLM7BozKzSzwtLS0ihiFRHpcbrz/IRmUSaFtto63mr6EuA+dx8FnA08aGZ7xeTus9y9\nwN0L8vM7vcWoiEifMKe4nIFpKRw6PKfzwl0kyqRQAoyOmR7F3t1DXwAeB3D3t4F0YEiEMYmI9Bpz\nisspGHsQyUndM54A0SaFucAkMxtnZqkEA8mzW5VZB5wGYGaHEiQF9Q+JSL+3dWcNK7fu6tauI4gw\nKbh7A3Ad8AKwjOAooyVmdpuZnR8Wuwm42szeAx4FrnD31l1MIiL9zs+fX05KkvHx6cO6db0pUS7c\n3Z8jGECOnff9mOdLgQ9HGYOISG/z9uoynppfwrUnT+i28xOa6YxmEZEepLahke8+/T6j8zL4yqmT\nun39kbYURERk3/zp1SKKSqu478pjyEhN7vb1q6UgItJDbNy+mzteXsU5hw3n5ClDExKDkoKISA/x\n0rIt1DU0cdMZkxMWg5KCiEgP8XZRGSNy0xk3JCthMSgpiIj0AE1NzturyzhuwuBuu/hdW5QURER6\ngOVbKqmorueECYm9qIOSgohID/D26jIAjp8wOKFxKCmIiPQAb60u45DBmYwclJHQOJQUREQSrLHJ\nebe4jOPHJ7aVAEoKIiIJt2TjDiprGhLedQRKCiIiCdcynqCWgoiIvLW6jAn5WQzNSU90KEoKIiKJ\nVN/YxNw15Qk/FLWZkoKISALNW1tBdV1jjxhPACUFEZGEqWto4rZnljJkYBofmdQzWgq6dLaISILc\n+fIqlm7ayazPHU1O+oBEhwOopSAikhCLN+zgzpdX8R9HjuSMbr7lZkeUFEREulltQyNff+I98rJS\n+cF50xIdzh7UfSQi0o1Wba3kG08u4oPNldx9eQGDMlMTHdIelBRERLpBQ2MTs14v4rf/XElmajK/\nu/gITjv04ESHtRclBRGRbnDfW2v4xfPLOWvGMG67YAb52WmJDqlNSgoiIt3g+cWbmTEyhz9cdnSi\nQ+mQBppFRCJWUVXH/HUVnDq153UXtaakICISsddWltLkcMqU/ESH0iklBRGRiL38wVYGZ6XyoVGD\nEh1Kp5QUREQi1NjkvLqilJMm55OUZIkOp1NKCiIiEVq4fjsV1fWcMnVookOJi5KCiEiEXlm+leQk\n48RJPX88AZQUREQi9a8PtnL0mIPIzewZF7zrjJKCiEhEtuysYcnGnZw8tXe0EkBJQUQkMq8s3wrA\nqb1kPAGUFEREIlHf2MSs14oYn5/FlIOzEx1O3JQUREQi8NicdawureLmM6di1vMPRW2mpCAi0sV2\n1tTzm3+u5LjxeXxsWs+/tEUsJQURkS5258urqKiu45ZzpvWqVgIoKYiIdKn15dXc+8YaPnHkKGaM\nzE10OPtMSUFEpIts3L6bLz00j6Qk+MbHpyQ6nP0SaVIwszPNbLmZrTKzm9sp82kzW2pmS8zskSjj\nERGJytw15Zx/xxusLavm95cexbDc9ESHtF8iu8mOmSUDdwIfA0qAuWY2292XxpSZBHwb+LC7V5hZ\n7zmYV0Qk9Nf5JXzrqUWMOiiTx645molDe88hqK1Feee1mcAqdy8CMLPHgAuApTFlrgbudPcKAHff\nGmE8IiJd7tUVpXzjyUUcOy6PP1x2NLkZveNyFu2JsvtoJLA+ZroknBdrMjDZzN40s3fM7My2FmRm\n15hZoZkVlpaWRhSuiMi+Wb65kusens+koQOZ9fmCXp8QINqk0NZxWN5qOgWYBJwMXALcZWZ73YXC\n3We5e4G7F+Tn955riIhI31VaWctV980lPTWZe644hoFpfeOW91EmhRJgdMz0KGBjG2X+5u717l4M\nLCdIEiIiPZK78/zizVz0x7coq6rl7ssLGDEoI9FhdZkok8JcYJKZjTOzVOBiYHarMk8DpwCY2RCC\n7qSiCGMSEdkvTU3Ou0VlfGbWO3zpoXkMSE7ivitncngvuMXmvtin9o6ZHQSMdvdFnZV19wYzuw54\nAUgG7nH3JWZ2G1Do7rPD184ws6VAI/ANdy/b51qIiERk6cadPDZ3HS8s2cyWnbUMzkrlxxfO4OJj\nRpOS3PdO9TL31t38rQqYvQKcT5BAFgKlwKvufmPk0bWhoKDACwsLE7FqEelnauobmfmTf1LX2MTJ\nk4dy5oxhnD7t4F45fmBm89y9oLNy8dQs1913mtkXgXvd/Qdm1mlLQUSkt3tr9TZ21jRw75XHcMqU\n/nEaVTxtnxQzGw58Gng24nhERHqMFxZvITsthRMmDE50KN0mnqRwG0Hf/2p3n2tm44GV0YYlIpJY\nDY1NvLhsC6dMHUpaSnKiw+k2nXYfufsTwBMx00XAJ6MMSkQk0QrXVlBeVcfHpw9LdCjdqtOWgplN\nNrOXzGxxOH24md0SfWgiIonz/OLNpKYkcfKU/nXCbDzdR38muGhdPUB4OOrFUQYlIpJI7s4/lmzm\nxElDyOqFRxodiHiSQqa7z2k1ryGKYEREeoL3N+xg444azuhnXUcQX1LYZmYTCK9bZGYXAZsijUpE\nJIFeWLKZ5CTj9EN71/2Vu0I87aIvA7OAqWa2ASgGLo00KhGRBGlqcv6+eDMzx+aRl5Wa6HC6XTxJ\nwd39dDPLApLcvdLMxkUdmIhIItz31hqKSqv4yqkTEx1KQsTTffQUgLtXuXtlOO/J6EISEUmMlVsq\nuf35Dzht6lAuPKL17V/6h3ZbCmY2FZgO5JrZJ2JeygF6581HRUTaUdfQxNceX8jAtBRu/+ThmLV1\nS5i+r6Puoyn
AucAg4LyY+ZUEt9EUEekz/uellSzesJM/fe5o8rPTEh1OwrSbFNz9b8DfzOx4d3+7\nG2MSEelW7xSV8ftXVnHR0aP63RnMrcUz0LzAzL5M0JXU0m3k7ldFFpWISDcp21XLVx9bwCGDs7j1\n/OmJDifh4hlofhAYBnwceJXgtpqVHb5DRKQXaGpybnriPSqq67njs0f2yvskdLV4ksJEd/8eUOXu\n9wPnAIdFG5aISPTueqOIV5aX8r1zDmX6iNxEh9MjxJMU6sO/281sBpALjI0sIhGRbvCvD7bwi+eX\nc9aMYVx23CGJDqfHiKetNCu8N/P3gNnAQOD7kUYlIhKhV1eU8qUH5zNtRA4/v6j/Hn7alnjup3BX\n+PRVYHy04YiIROutVdu45oFCJg4dyANXzSQnfUCiQ+pROk0KZjYI+DxBl1FLeXe/PrqwRES6VlOT\n8+jcdfz42WWMHZzFQ188lkGZ/e/aRp2Jp/voOeAd4H2gKdpwRES6XvG2Km5+ahHvFpdzwoTB/O7i\nI/vlxe7iEU9SSHf3GyOPRESki63aWsm9b67hyXklpKYk8fNPHsanC0ZrDKED8SSFB83sauBZoLZ5\npruXRxaViMgBWF26i1tnL+H1ldtITUniwiNGcNMZUzg4R5dt60w8SaEO+CXwXcIb7YR/NegsIj3O\nu0VlXPPgPJIMvn7GZC6ZOYbBA/vvtYz2VTxJ4UaCE9i2RR2MiMiB+NvCDXzjiUWMysvgvitmMmZw\nZqJD6nXiSQpLgOqoAxER2V+1DY38+h8rmPVaEceOy+NPnztaRxbtp3iSQiOw0MxeZs8xBR2SKiIJ\n98Hmndzw2EI+2FzJpceO4fvnTSMtJTnRYfVa8SSFp8OHiEiPsWZbFQ++s5YH315LTsYA7rmigFOn\nHpzosHq9eM5ovr87AhER6UxVbQMvL9/KE4UlvLqilJQk47wPjeCWcw7VYHIX6eh2nI+7+6fN7H3+\nfdRRC3c/PNLIRESALTtreHVFKS8u3cJrK0qpbWji4Jw0vnb6ZC6ZOZqhOsy0S3XUUvhq+Pfc7ghE\nRPqnxiansqaeXbUNVNU2snHHbopKqygq3cW8tRV8sDm4fcvw3HQumTmGs2YMo2BsHslJOgEtCh3d\njnNT+PRad/9W7Gtm9nPgW3u/S0Rk31z0x7dYsG77XvNz0lOYMTKXb581lRMn5zN1WLbORO4G8Qw0\nf4y9E8BZbcwTEdknWytrWLBuO+cePpwTJ+WTlZbC0Jw0xg/JIi8rVUkgAToaU/gv4Fpggpktinkp\nG3gz6sBEpO+bW1wBwBc/Op4jRg9KcDQCHbcUHgH+DvwMuDlmfqWueyQiXWFOcRmZqclMH5GT6FAk\n1O7tON19h7uvAW4BNrv7WmAccFl4jwURkQPybnE5Rx9yEAOS47kzsHSHeLbEU0CjmU0E7iZIDI9E\nGpWI9Hk7qutZvqWSY8bmJToUiRFPUmhy9wbgE8Bv3f1rwPB4Fm5mZ5rZcjNbZWY3d1DuIjNzMyuI\nL2wR6e0K15bjDjPHKSn0JPEkhXozu4TglpzPhvM6vampmSUDdxIcqTQNuMTMprVRLhu4Hng33qBF\npPebU1xOanKSBph7mHiSwpXA8cBP3L3YzMYBD8XxvpnAKncvcvc64DHggjbK/Qj4BVATZ8wi0ge8\nW1zOh0bnkj5AF6/rSdpNCmaWA+DuS939end/NJwuJr4xhZHA+pjpknBe7DqOBEa7+7OISL9RVdvA\n4g07NJ7QA3XUUnil+YmZvdTqtXiumtrWWSct11AysyTgN8BNnS7I7BozKzSzwtLS0jhWLSI92YJ1\n22loco0n9EAdJYXYH/XWWy6e0wxLgNEx06OAjTHT2cAM4BUzWwMcB8xua7DZ3We5e4G7F+Tn58ex\nahHpyeYUl5FkcPQhByU6FGmlo6Tg7Txva7otc4FJZjbOzFKBi4HZLQsIzoMY4u5j3X0s8A5wvrsX\nxhe6iPRWc9aUM31ELtnpnR6zIt2sozOah5rZjQStgubnhNOd7q67e4OZXQe8ACQD97j7EjO7DSh0\n99kdL0FE+qLFG3ZQuKaCqz4yLtGhSBs6Sgp/Jujiaf0c4K54Fu7uzwHPtZr3/XbKnhzPMkWk96qq\nbeD6RxcweGAqXzppQqLDkTZ0dOnsH3ZnICLS9/3wmSUUl1Xx8BePJS8rNdHhSBt0wRER6RbPvLeR\nxwtLuPbkCZwwYUiiw5F2xHM/BRGR/bartoG7Xi/iT68WccToQdxw+uREhyQdUFIQkUi4Ow+9u47f\nvriCsqo6zj5sGD84b7quiNrDdZoUzOxg4KfACHc/K7x+0fHufnfk0YlIr/XQu+v43tOLOW58Hnef\ndaiucdRLxJOy7yM4rHREOL0CuCGqgESk93tv/XZ+9MxSTp06lEe+eJwSQi8ST1IY4u6PA00QnH8A\nNEYalYj0WhVVdVz78Hzys9P4709/iKQk3We5N4lnTKHKzAYTnsVsZscBOyKNSkR6pd11jdz4+EJK\nK2t58r+OZ1CmDjvtbeJJCjcSXJ5igpm9SXA280WRRiUivUp1XQMPv7OOP71WxLZdtfz4whkcPkpd\nRr1Rp0nB3eeb2UnAFIJLXCx39/rIIxORHsvdea9kB4VrylmwfjtvrdpGRXU9H5k4hK+efpQuid2L\nxXP00edbzTrKzHD3ByKKSUR6uF//YwV3vLwKgJGDMvjopHwuP+EQjj5EyaC3i6f76JiY5+nAacB8\nQElBpB96dM467nh5FZ86ehTf+PgUhuakJzok6ULxdB99JXbazHKBByOLSER6rJeXb+WWpxdz0uR8\nfvaJw0jRiWh9zv6c0VwNTOrqQESkZ3v5g61c98h8phyczZ2XHqWE0EfFM6bwDP++qU4SMA14PMqg\nRKTnWFdWzW3PLuWfy7YwIT+Le688hoFpukJOXxXPlv1VzPMGYK27l0QUj4j0ABu27+aNlaW8tnIb\nLy7dQkqScfNZU7nqw+NITVELoS+LZ0zh1e4IREQSq7Kmnmfe28Rf5q7jvZLg/NSh2Wl88qiRXH/a\nJIbnZiQ4QukO7SYFM6uk7XsxG+DunhNZVCISGXdne3U9m3fWUFRaxbJNO1m6aSdvry5jd30jUw7O\n5jtnT+XkKUOZNHQgZrpMRX/S0Z3Xstt7TUR6n7qGJm564j3+sWQztQ1NLfOTk4yJ+QO58MiRfLpg\nFEeMHqRE0I/FPVpkZkMJzlMAwN3XRRKRiHS5hsYmvvrYAv6+eDOXzBzDxKEDGZ6bzpi8TCYOHUj6\ngOREhyg9RDxHH50P/Jrg0tlbgUOAZcD0aEMTka7Q1OR888lF/H3xZr537jS+8JFxiQ5JerB4DiP4\nEXAcsMLdxxGc0fxmpFGJSJfYsH03N/xlIX9dsIGbPjZZCUE6FU/3Ub27l5lZkpklufvLZvbzyCMT\nkf1WvK2KP7yyir/O3wDADadP4rpTJyY4KukN4kkK281sIPAa8LCZbSU4
X0FEehB3p3BtBX9+rYgX\nl20hNTmJS48dwzUnTWDkIB1OKvGJJylcANQAXwMuBXKB26IMSkTit3lHDf/3/ib+tnADi0p2kJsx\ngGtPnsDlJ4xlaLYuVif7pqPzFO4AHnH3t2Jm3x99SCLSHndn884a3lu/nQXrt1O4poL56ypwh2nD\nc7jtgulcdPQoMlN1GQrZPx19c1YCvzaz4cBfgEfdfWH3hCUiABu37+aV5aW8uXobRaVVrC2rorou\nuEX6gGRj2ohcbjhtMud+aDgT8gcmOFrpCzo6ee13wO/M7BDgYuBeM0sHHgUec/cV3RSjSL/Q3Aoo\nXFNB4Zpy3i0u54PNlQCMyE1n6vAcjh8/mLFDMjlsZC7TRuSQlqLzC6RrmXtbV7Jop7DZkcA9wOHu\nnpBvY0FBgRcWFiZi1SL7zd2pqmuktLKW0spatlbWsHlH8Ni4Yzdry6pZW1bNrtrgGI7M1GSOGnMQ\nJ03O55Sp+UzI1+Um5MCY2Tx3L+isXDwnrw0AziRoLZwGvAr88IAjFOmDZr22mr/O30BDk9PY5NTU\nN1JV20BVXSONTXvvgKUPSGJEbgZjBmdyzNg8xg3J4qgxB3Ho8Gzdr0ASoqOB5o8BlwDnAHOAx4Br\n3L2qm2IT6VXcnbteLyZ9QDIzRuaQkpREakoSA9NSyEpLJid9APnZaS2P4TkZ5GSkqAUgPUpHLYXv\nAI8AX3f38m6KR6TXWltWzdb8d16TAAAQVklEQVTKWn584QwuO+6QRIcjsl86Gmg+pTsDEent5qwJ\n9p2OHZeX4EhE9p86LUW6yJzicg7KHMDEoTo0VHovJQWRLjKnuJxjxuZpjEB6NSUFkS6wacdu1pVX\nM1NdR9LLKSmIdIE5xc3jCYMTHInIgVFSEOkCc9eUk5WazKHDdRdb6d2UFES6wJzico4em6cTzqTX\ni/QbbGZnmtlyM1tlZje38fqNZrbUzBaZ2UvhdZZEepXyqjpWbNmlQ1GlT4gsKZhZMnAncBYwDbjE\nzKa1KrYAKHD3w4EngV9EFY9IVOaG5ydokFn6gihbCjOBVe5e5O51BJfJuCC2gLu/7O7V4eQ7wKgI\n4xGJxNziclJTkjh8VG6iQxE5YFEmhZHA+pjpknBee74A/L2tF8zsGjMrNLPC0tLSLgxR5MDU1Dfy\n4rItHDF6kC5jLX1ClEmhrTN42rxOt5ldBhQAv2zrdXef5e4F7l6Qn5/fhSGKHJif/N8y1pZV8+VT\nJiY6FJEuEeU9+0qA0THTo4CNrQuZ2enAd4GT3L02wnhEutQLSzbz4Dtr+eJHxnHSZO2sSN8QZUth\nLjDJzMaZWSrB/RhmxxYIb9rzJ+B8d98aYSwiXWrTjt1866lFTB+RwzfOnJLocES6TGRJwd0bgOuA\nF4BlwOPuvsTMbjOz88NivwQGAk+Y2UIzm93O4kR6jPdLdnD1A4XUNTTxv5ccqbEE6VOi7D7C3Z8D\nnms17/sxz0+Pcv0iXWltWRW/+scKnnlvIwdlDuC/P30E4/N1RVTpWyJNCiJ9RVHpLs6/400am5yv\nnDqRq08cT076gESHJdLllBREOrG7rpH/emg+qSlJ/O3LH2Z0XmaiQxKJjJKCSAfcne8+/T4rtlby\nwFUzlRCkz9PVu0Q68Je56/nr/A3ccNpkPjpJh51K36eWgkgbVm2t5H//tYpn3tvIiZPz+cqpOjlN\n+gclBRGgsclZsaWS+esqeGPlNp5fspmMAclcfeJ4rjtlIklJusWm9A9KCtLvLVhXwRfuL6S8qg6A\nwVmpfOmkCVz90fHkZaUmODqR7qWkIP3a2rIqvnB/IQPTUvj+udM4cswgxuRlYqaWgfRPSgrSb1VU\n1XHFvXNpcue+K4/RiWgiKClIP7Vjdz1XP1DIhu27eeSLxyohiISUFKRfaWpynppfws+f/4Dyqjr+\n55IjKRirO6aJNFNSkH6hpKKaN1Zu4y+F61mwbjtHjRnEfVfOZMZI3S1NJJaSgvRZ68ureWJeCc++\nt5GibVUAjMhN51ef+hCfOHKkDjMVaYOSgvQZ5VV1LNu0k6Ubd/LaylLeWLUNgA9PGMKlxx3CRycN\nYdLQgTqySKQDSgrSa/39/U08Onc9m3fsZtOOGiprGlpeG3VQBtefOolPHzOakYMyEhilSO+ipCC9\n0rOLNnL9owsYk5fJlGHZHD9+MKMOyuTQ4TkcOjybwQPTEh2iSK+kpCC9zj+XbuGGxxZScEge9181\nk4xU3flMpKsoKUiv4e68sGQz1z+2kGkjcrj7igIlBJEupqQgPV5Tk/Pisi3c+fIqFpXsYOqwbB64\naibZuvOZSJdTUpAeq6a+kacXbOCuN4pZtXUXY/Iy+dknDuMTR40kLUUtBJEoKClIj7NqayWzF27k\nkTnr2LarjmnDc/jdxUdwzmHDSUnWfaFEoqSkIAm3o7qehSXbmb+2gheWbOaDzZWYwUmT87n6o+M5\nYcJgnVsg0k2UFKRbrS+v5vWV21ixpZI1ZVUUb6tibVk1AGZw1JiDuPW8aZx92HCG5qQnOFqR/kdJ\nQSJVtquWwrUVzCku59UVpazauguAzNRkDhmcxfQROXy6YDRHjB7E4aNyNXgskmBKCnJAGpuc8qo6\nNu+oYfPOGjZu383asmrWllWxunQXa8JWQGpKEseOy+OzM8dw8pR8xg3JUpeQSA+kpCDtWlSynScK\nS2hoaqK+0alraKKqtoGqugZ27m6gdFct5VV1NDb5Hu9LH5DE2MFZTB2Ww8Uzx3DM2IOYMTJXRwyJ\n9AJKCtKmrTtruOLeueyuayQ7PYWUJGNAShJZqSkMTEtheG46h4/KJT87jfzsNA7OSWd4bjrDctPJ\nH5imVoBIL6WkIHtpanJufPw9qusaePYrH2Hi0OxEhyQi3UQHfcteZr1exBurtvGD86YrIYj0M0oK\nsofCNeX86oXlnDVjGBcfMzrR4YhIN1P3kQAwb20Ff3p1NS8u28KI3Axu/8ThGhcQ6YeUFPohd6dw\nbQXvFpWxbHMlyzbupGhbFbkZA/jKKRO5/ISx5GbqfAGR/khJoR+pqW9k9nsbuffNNSzbtBOA0XkZ\nTB2Ww+ePP4RPFYwmK01fCZH+TL8AfUxTkzNvXQUlFdVUVNWzvbqO4rJqVm6ppGhbFXUNTUw5OJvb\nP3EY5xw+XGcQi8gelBT6AHdnbVk1f51fwlPzN7Bh++6W18xg5KAMJg0dyImT8zl5cj7H6wJzItIO\nJYVeorHJ2bKzhpKK3ZRW1rJtVy2bd9awdONOFm/YQVlVHWbw0Un53HzWVKaPyGFQZiq5GQNITlIC\nEJH4KCn0ADX1jbyyvJQF6yrYXd/I7rpGdtc3srOmgcqaeiqq6tiwfTf1jXteTiI5yZg0dCCnTB3K\nYSNz+di0gxkxKCNBtRCRvkBJIQGqahtYXbqL1aW7eHt1GX9fvJnKmgZSk5PISksmfUAyGQOSyc4Y\nQE56CiMHZXDmjOGMyctk1EE
Z5GenMWRgGnlZqWoFiEiXijQpmNmZwO+AZOAud7+91etpwAPA0UAZ\n8Bl3XxNlTFFzd3bVNrC9up6tlTVs3F7Dph27WVdeTfG2KopKq9i0o6al/MC0FM6YfjAXHDGSD08Y\nrDuLiUhCRZYUzCwZuBP4GFACzDWz2e6+NKbYF4AKd59oZhcDPwc+E1VM8XB3ahuaqKlvpKa+iaq6\nBqprG9lV28CO3XWUV9VTUV1HaWUtWytrKK2sZefu4MqhVbUNVNY00NDqqqEAOekpjM8fyPHjBzM+\nP4uJQwcycehADhmcxQAlAhHpIaJsKcwEVrl7EYCZPQZcAMQmhQuAW8PnTwJ3mJm5+96/qgfo8bnr\nmfV6EU3u4NDkTkOT09DoNDQ1UdsQPOoamuJaXnZaCvk5aQzNTmPskEyyUlPISE0mN2MAgzIHMCgz\nlfyBaQwflM6IQRnk6NBPEekFokwKI4H1MdMlwLHtlXH3BjPbAQwGtsUWMrNrgGsAxowZs1/BDMoc\nwJSDs8EgyQwDUpKNlCQjJTmJtJQk0lKSSUtJIn1AMukDgr+ZqclkpaaQmZbMoIxU8rJSGZQ5gPQB\nujeAiPQ9USaFtkZAW7cA4imDu88CZgEUFBTsVyvijOnDOGP6sP15q4hIvxFlZ3YJEHuZzVHAxvbK\nmFkKkAuURxiTiIh0IMqkMBeYZGbjzCwVuBiY3arMbODy8PlFwL+iGE8QEZH4RNZ9FI4RXAe8QHBI\n6j3uvsTMbgMK3X02cDfwoJmtImghXBxVPCIi0rlIz1Nw9+eA51rN+37M8xrgU1HGICIi8dMB8iIi\n0kJJQUREWigpiIhICyUFERFpYb3tCFAzKwXW7ufbh9DqbOl+oj/Wuz/WGfpnvftjnWHf632Iu+d3\nVqjXJYUDYWaF7l6Q6Di6W3+sd3+sM/TPevfHOkN09Vb3kYiItFBSEBGRFv0tKcxKdAAJ0h/r3R/r\nDP2z3v2xzhBRvfvVmIKIiHSsv7UURESkA0oKIiLSot8kBTM708yWm9kqM7s50fFEwcxGm9nLZrbM\nzJaY2VfD+Xlm9qKZrQz/HpToWLuamSWb2QIzezacHmdm74Z1/kt4+fY+xcwGmdmTZvZBuM2P7yfb\n+mvh93uxmT1qZul9bXub2T1mttXMFsfMa3PbWuB/wt+2RWZ21IGsu18kBTNLBu4EzgKmAZeY2bTE\nRhWJBuAmdz8UOA74cljPm4GX3H0S8FI43dd8FVgWM/1z4DdhnSuALyQkqmj9Dnje3acCHyKof5/e\n1mY2ErgeKHD3GQSX5b+Yvre97wPObDWvvW17FjApfFwD/OFAVtwvkgIwE1jl7kXuXgc8BlyQ4Ji6\nnLtvcvf54fNKgh+JkQR1vT8sdj9wYWIijIaZjQLOAe4Kpw04FXgyLNIX65wDnEhwTxLcvc7dt9PH\nt3UoBcgI79aYCWyij21vd3+Nve9C2d62vQB4wAPvAIPMbPj+rru/JIWRwPqY6ZJwXp9lZmOBI4F3\ngYPdfRMEiQMYmrjIIvFb4JtAUzg9GNju7g3hdF/c3uOBUuDesNvsLjPLoo9va3ffAPwKWEeQDHYA\n8+j72xva37Zd+vvWX5KCtTGvzx6La2YDgaeAG9x9Z6LjiZKZnQtsdfd5sbPbKNrXtncKcBTwB3c/\nEqiij3UVtSXsR78AGAeMALIIuk9a62vbuyNd+n3vL0mhBBgdMz0K2JigWCJlZgMIEsLD7v7XcPaW\n5uZk+HdrouKLwIeB881sDUG34KkELYdBYfcC9M3tXQKUuPu74fSTBEmiL29rgNOBYncvdfd64K/A\nCfT97Q3tb9su/X3rL0lhLjApPEIhlWBganaCY+pyYV/63cAyd//vmJdmA5eHzy8H/tbdsUXF3b/t\n7qPcfSzBdv2Xu18KvAxcFBbrU3UGcPfNwHozmxLOOg1YSh/e1qF1wHFmlhl+35vr3ae3d6i9bTsb\n+Hx4FNJxwI7mbqb90W/OaDazswn2IJOBe9z9JwkOqcuZ2UeA14H3+Xf/+ncIxhUeB8YQ/FN9yt1b\nD2L1emZ2MvB1dz/XzMYTtBzygAXAZe5em8j4upqZHUEwuJ4KFAFXEuzo9eltbWY/BD5DcLTdAuCL\nBH3ofWZ7m9mjwMkEl8feAvwAeJo2tm2YHO8gOFqpGrjS3Qv3e939JSmIiEjn+kv3kYiIxEFJQURE\nWigpiIhICyUFERFpoaQgIiItlBREQmY2zMweM7PVZrbUzJ4zs8n7sZy7mi+4aGbfifM9a8xsyL6u\nS6Sr6ZBUEVpO/HsLuN/d/xjOOwLIdvfXD2C5u9x9YBzl1hBc+XPb/q5LpCuopSASOAWob04IAO6+\nEFhgZi+Z2Xwze9/MLoDggoPhfQzuD69h/6SZZYavvWJmBWZ2O8HVPBea2cPha0+b2bzwfgDXJKCe\nIh1SUhAJzCC42mZrNcB/uPtRBInj12GrAmAKMMvdDwd2AtfGvtHdbwZ2u/sR4aU3AK5y96OBAuB6\nMxscQV1E9puSgkjHDPipmS0C/klwOYWDw9fWu/ub4fOHgI/Esbzrzew94B2Ci5hN6uJ4RQ5ISudF\nRPqFJfz7gmqxLgXygaPdvT7s+08PX2s9INfhAF14babTgePdvdrMXolZlkiPoJaCSOBfQJqZXd08\nw8yOAQ4huF9DvZmdEk43G2Nmx4fPLwHeaGO59eHlzAFygYowIUwluGWqSI+ipCACeHAY3n8AHwsP\nSV0C3Ao8BxSYWSFBq+GDmLctAy4Pu5byaPveuLOAReFA8/NASlj+RwRdSCI9ig5JFdkP4e1Onw1v\nHi/SZ6ilICIiLdRSEBGRFmopiIhICyUFERFpoaQgIiItlBRERKSFkoKIiLT4/4EmbUnRp+/0AAAA\nAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting Final Policy (action stake) vs State (Capital)\n", + "\n", + "# x axis values\n", + "x = range(100)\n", + "# corresponding y axis values\n", + "y = v[:100]\n", + " \n", + "# plotting the points \n", + "plt.plot(x, y)\n", + " \n", + "# naming the x axis\n", + "plt.xlabel('Capital')\n", + "# naming the y axis\n", + "plt.ylabel('Value Estimates')\n", + " \n", + "# giving a title to the graph\n", + "plt.title('Final Policy (action stake) vs State (Capital)')\n", + " \n", + "# 
function to show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAGoxJREFUeJzt3Xu8HGV9x/HP13AXQghJMJDEgA0X\naw2XIwWxlIu0SCmJBSkUMW3B9KJyEbWovFpQq9AqUK9tCmKK3CQg1xZJYyKlhUAihIsgCYgQE5MA\nCQEBTeDXP+Y5sBzO7pk9Z2f27M73/Xrta3dm5/KbmWR/53memedRRGBmZtX1pnYHYGZm7eVEYGZW\ncU4EZmYV50RgZlZxTgRmZhXnRGBmVnFOBDZsSTpB0q1DWH+BpJNbGVMT+x5S7H229bik97ZiWzXb\nPEjS8prpByUd1Mp9WOdwIrAhk/RnkhZJel7SSkn/Jek9Q91uRFwWEX9Qs5+Q9FtD3W6rSJqcYnq+\n5rUE3hh7gTF8R9Jv0r6fkTRX0u7NbicifjsiFhQQonUAJwIbEkkfBy4EvgjsAEwCvglMa2dcJRsV\nEVun19Q27P+fImJrYAKwGvhOG2KwDuZEYIMmaVvgc8BHIuLaiPhVRGyIiBsj4pNpmX0l3SFpXSot\nfF3SZjXbCEmnSHpM0lOS/lnSm9J3fy7p9vT5trTKkvTX759K2k7STZLWSFqbPk/IEfeOkl6UNLpm\n3l5p/5tK+i1JP5L0bJp31SDOzaux1xznX0tammL9hiSl794m6YeSnk77u0zSqGb3GREvAJcD70jb\n3VzShZJWpNeFkjavE++r1U+SRkj6jKRHJT0nabGkiSnmr/RZ70ZJpzUbqw0vTgQ2FPsDWwDfb7DM\ny8DpwJi0/KHA3/ZZ5v1AD7A3WUniL/tuJCIOTB+npr+8ryL793sJ8FayksiLwNcHCjoiVgB3AEfX\nzP4zYE5EbAA+D9wKbEf2V/bXBtpmTkcC7wKmAscCf5jmC/gSsCOwBzAROLvZjUvaGjgBuCfN+iyw\nH7Bn2ue+wFk5NvVx4HjgCGAk2fV4AZgNHF+TqMeQXc8rmo3VhhcnAhuK7YGnImJjvQUiYnFE3BkR\nGyPiceDfgN/vs9h5EfFMRDxBVs10fJ6dR8TTEXFNRLwQEc8B/9jPtuu5vHc/6S/z49I8gA1kyWXH\niHgpIm7vfxOveiqVeNZJ+kSD5c6NiHXpOOeT/UATEcsiYm5E/Doi1gDnN3EcAJ+QtA5YBmwN/Hma\nfwLwuYhYnbZ7DnBiju2dDJwVET+NzJJ0ru8CniX78YfsnC2IiFVNxGrDkBOBDcXTwBhJm9RbQNKu\nqcrml5LWk7UljOmz2JM1n39O9pfxgCRtJenfJP08bfs2YJSkETlWnwPsL2lH4EAggP9J332K7K/0\nu9LdNG8oofQxJiJGpdeXGyz3y5rPL5D9aCNpnKQrJf0iHcd3eeM5auTLad9viYijIuLRNH9HsvPZ\nK++5nQg8Wue72cAH0+cPApc2EacNU04ENhR3AC8B0xss8y3gYWBKRIwEPkP2I1trYs3nScCKnPs/\nA9gN+N207d7qo77bf4OIWEdW/XMsWbXQFZG64o2IX0bEhyNiR+CvgG8WfLfSl8gS0TvTcXyQHMeQ\nwwqykk2vvOf2SeBtdb77LjBN0lSyaqzrhhShDQtOBDZoEfEs8PfANyRNT3+hbyrpfZL+KS22DbAe\neD7d1vg3/Wzqk6nhdyJwKlCvcXYVsEvN9DZk7QLrUsPvPzR5CJcDHyJrK+itFkLSB2oandeS/Ui/\n3OS2m7EN8DzZcewEfLJF270COEvS2FSf//dkP+QDuQj4vKQpyrxT0vYAEbEcuJusJHBNRLzYolit\njZwIbEgi4nyyxsWzgDVkf01+lNf+UvwE2V/czwH/Tv8/8tcDi4F7gZuBi+vs7mxgdqqLP5asPWFL\n4CngTuCWJsO/AZgCrIqIJTXz3wUslPR8WubUiPhZk9tuxjlkDeXPkh3/tS3a7heARcB9wP3Aj9O8\ngZwPfI+sxLSe7HpsWfP9bOB3cLVQ15AHprF2khRk1UbL2h2L5SPpQLKSxeSIeKXd8djQuURgZrlJ\n2pSs+u4iJ4Hu4URgZrlI2gNYB4wnq5azLuGqITOzinOJwMys4uo+CDScjBkzJiZPntzuMMzMOsri\nxYufioixAy3XEYlg8uTJLFq0qN1hmJl1FEk/H3gpVw2ZmVWeE4GZWcU5EZiZVZwTgZlZxTkRmJlV\nnBOBmVnFFXr7qKTHyXqdfBnYGBE9qbvgq4DJwOPAsRGxtsg4zMysvjJKBAdHxJ4R0ZOmzwTmRcQU\nYF6aNjOzNmlH1dA0sv7MSe+NRrcyM7OCFZ0IArhV0mJJM9O8HSJiJUB6H9ffipJmSlokadGaNWsK\nDtNs8C6Y+wgXzH2k3WGYDVrRXUwcEBErJI0D5kp6OO+KETELmAXQ09PjLlLNzApSaIkgIlak99XA\n94F9gVWSxgOk99VFxmBmZo0VlggkvVnSNr2fgT8AHiAbA3ZGWmwG2Xi1ZmbWJkVWDe0AfF9S734u\nj4hbJN0NfE/SScATwAcKjMGs5WrbA04/bNc2RmLWGoUlgoh4DJjaz/yngUOL2q+ZmTXHTxabmVWc\nE4GZWcV1xAhlZu3m5wSsm7lEYGZWcU4EZmYV50RgZlZxbiMwq8PtAlYVLhGYmVWcE4GZWcU5EZiZ\nVZzbCMxquF3AqsglAjOzinMiMDOrOCcCM7OKcyIwM6s4JwIzs4pzIjAzqzgnAjOzinMiMDOrOD9Q\nZpXkAejNXuMSgZlZxTkRmJlVnBOBmVnFORGYmVWcE4GZWcU5EZiZVZwTgZlZxfk5Autqfl7AbGAu\nEZiZVZwTgZlZxTkRmJlVnBOBmVnFFZ4IJI2QdI+km9L0zpIWSloq6SpJmxUdg5mZ1VdGieBU4KGa\n6fOACyJiCrAWOKmEGMzMrI5CE4GkCcAfARelaQGHAHPSIrOB6UXGYGZmjRVdIrgQ+BTwSpreHlgX\nERvT9HJgp/5WlDRT0iJJi9asWVNwmGZm1VVYIpB0JLA6IhbXzu5n0ehv/YiYFRE9EdEzduzYQmI0\nM7Ninyw+ADhK0hHAFsBIshLCKEmbpFLBBGBFgTGYmdkACisRRMSnI2JCREwGjgN+GBEnAPOBY9Ji\nM4Dri4rBzMwG1o7nCP4O+LikZWRtBhe3IQYzM0tK6XQuIhYAC9Lnx4B9y9ivmZkNzE8Wm5lVnBOB\nmVnFORFYR7lg7iOvG2PAzIbOicDMrOK
cCMzMKs6JwMys4hrePippC+BI4PeAHYEXgQeAmyPiweLD\nMzOzotVNBJLOBv6Y7P7/hcBqsq4idgXOTUnijIi4r/gwzcysKI1KBHdHxNl1vjtf0jhgUutDMjOz\nMtVNBBFxc+20pDdHxK9qvl9NVkowM7MONmBjsaR3S/oJaZQxSVMlfbPwyMzMrBR57hq6APhD4GmA\niFgCHFhkUGZmVp5ct49GxJN9Zr1cQCxmZtYGeXoffVLSu4GQtBlwCq8fjN7MzDpYnhLBXwMfIRtb\neDmwZ5o2M7MukKdE8EoaWexVknYmtRmYmVlny1MiuFHSyN4JSXsANxYXkpmZlSlPIvgiWTLYWtI+\nwBzgg8WGZWZmZRmwaigibpa0KXArsA0wPSKWFh6ZmZmVolFfQ18DombWSOAx4GOSiIhTig7OzMyK\n16hEsKjP9OIiAzEzs/Zo1NfQ7DIDMTOz9hiwjUDSFOBLwNvJuqEGICJ2KTAuMzMrSZ67hi4BvgVs\nBA4G/gO4tMigzMysPHkSwZYRMQ9QRPw8jVFwSLFhmZlZWfI8WfySpDcBSyV9FPgFMK7YsMzMrCx5\nSgSnAVuRdTa3D9nDZB8qMigzMytPnkQwOSKej4jlEfEXEXE0HqLSzKxr5EkEn845z8zMOlCjJ4vf\nBxwB7CTpqzVfjSS7g8jMzLpAo8biFWRPFx/F658qfg44vcigzMysPI2eLF4CLJF0eURsAJC0HTAx\nItaWFaCZmRUrTxvBXEkjJY0GlgCXSDp/oJUkbSHpLklLJD0o6Zw0f2dJCyUtlXRVGv7SzMzaJE8i\n2DYi1gN/AlwSEfsA782x3q+BQyJiKtnwlodL2g84D7ggIqYAa4GTBhe6mZm1Qp5EsImk8cCxwE15\nNxyZ59PkpukVZE8lz0nzZwPT84drZmatlicRfA74AbAsIu6WtAuQa2AaSSMk3QusBuYCjwLrIqL3\nrqPlwE7Nh21mZq2SZ4Syq4Gra6YfA47Os/GIeBnYU9Io4PvAHv0t1t+6kmYCMwEmTfLza2ZmRalb\nIpB0Vmogrvf9IZKOzLOTiFgHLAD2A0ZJ6k1AE8huU+1vnVkR0RMRPWPHjs2zGzMzG4RGJYL7yQat\nfwn4MbCGbDyCKWSNv/9NNrB9vySNBTZExDpJW5I1MJ8HzAeOAa4EZgDXt+A4zMxskBo9R3A9cH0a\nmOYAYDywHvguMDMiXhxg2+OB2ZJGkJU8vhcRN0n6CXClpC8A9wAXt+A4zMxskPK0ESwlZ+Nwn/Xu\nA/bqZ/5jwL7Nbs/MzIqR564hMzPrYk4EZmYVN2AiaHTnkJmZdb48JYKFkq6WdIQkFR6RmZmVKk8i\n2BWYBZwILJP0RUm7FhuWmZmVZcBEkPoMmhsRxwMnk937f5ekH0nav/AIzcysUAPePippe7IB608E\nVgEfA24ge6jsamDnIgM0M7NiDZgIgDuAS4HpEbG8Zv4iSf9aTFhmZlaWPIlgt4jot2O4iDivxfGY\nmVnJ8jQW35p6DwWy4Sol/aDAmMzMrER5EsHY1HsoAGm84nHFhWRmZmXKkwhelvTqgACS3kqdMQTM\nzKzz5Gkj+Cxwu6QfpekDSQPGmJlZ58vT++gtkvYmG1RGwOkR8VThkZmZWSkajVC2e3rfG5hENpLY\nL4BJaZ6ZmXWBRiWCM4APA1/p57sADikkImurC+Y+8urn0w9zTyLWer3/xvzva/hoNELZh9P7weWF\nY2ZmZaubCCT9SaMVI+La1odjZmZla1Q19McNvgvAicDMrAs0qhr6izIDsfapbRcwK4LbBYa3PCOU\nbSvpfEmL0usrkrYtIzgzMytenieLvw08BxybXuuBS4oMyszMypPnyeK3RcTRNdPnSLq3qIDMzKxc\neUoEL0p6T++EpAOAF4sLyczMypSnRPA3wOzULiDgGbLhKq2DuYHYiuYG4s6Rp6+he4Gpkkam6fWF\nR2VmZqXJc9fQ9pK+CiwA5kv6lzSOsZmZdYE8bQRXAmuAo4Fj0uerigzKzMzKk6eNYHREfL5m+guS\nphcVkJl1LrcLdKY8JYL5ko6T9Kb0Oha4uejAzMysHHkSwV8BlwO/Tq8rgY9Lek6SG47NzDpcnruG\ntikjEDMza488bQTWwTzQjBXN7QKdL0/V0KBImihpvqSHJD0o6dQ0f7SkuZKWpvftiorBzMwGVlgi\nADYCZ0TEHmQD339E0tuBM4F5ETEFmJemzcysTRqNUDa60YoR8cwA368EVqbPz0l6CNgJmAYclBab\nTfag2t/ljtjMzFqqURvBYrKRyNTPdwHskncnkiYDewELgR1SkiAiVkoaV2edmcBMgEmTJuXdleE6\nWzNrTqMRynZuxQ4kbQ1cA5wWEeul/vJKv/ufBcwC6OnpiVbEYmZmb5TrrqHUoDsF2KJ3XkTclmO9\nTcmSwGU1g92vkjQ+lQbGA6ubD9vMzFolT6dzJwO3AT8AzknvZ+dYT8DFwEMRcX7NVzfwWjfWM4Dr\nmwvZzMxaKU+J4FTgXcCdEXGwpN3JEsJADgBOBO6vGdHsM8C5wPcknQQ8AXyg+bDNrEx+HqW75UkE\nL0XES5KQtHlEPCxpt4FWiojb6b+hGeDQpqI0M7PC5EkEyyWNAq4D5kpaC6woNiwzMytLnr6G3p8+\nni1pPrAtcEuhUZmZWWny3jU0AtgB+Fma9Ray+n1rIz8vYGatMGAikPQx4B+AVcAraXYA7ywwLjMz\nK0neu4Z2i4iniw7GzMzKl6fTuSeBZ4sOxMzM2iNPieAxYIGkm8lGKAOgz0NiVhK3C1iR/LxANeVJ\nBE+k12bpZWZmXSTP7aN5niI2M7MO1Wg8ggsj4jRJN5LdJfQ6EXFUoZGZmVkpGpUILk3vXy4jEDMz\na49GiWANQET8qKRYrA43EJtZkRrdPnpd7wdJ15QQi5mZtUGjRFDbc2juYSnNzKyzNEoEUeezmZl1\nkUZtBFMlrScrGWyZPpOmIyJGFh5dhbldwMzK0mjw+hFlBmJmZu2Rp68hMzPrYk4EZmYV50RgZlZx\nTgRmZhXnRGBmVnFOBGZmFZdr8Hoz60weaMbycInAzKzinAjMzCrOicDMrOLcRtAGtf0IuU+hgfkc\nNae2XcAGVu98Venfm0sEZmYV50RgZlZxTgRmZhXnNoKS5Knndl34a3y+mpOnXcDPFLzG5+v1CisR\nSPq2pNWSHqiZN1rSXElL0/t2Re3fzMzyKbJq6DvA4X3mnQnMi4gpwLw0bWZmbVRYIoiI24Bn+sye\nBsxOn2cD04vav5mZ5VN2G8EOEbESICJWShpXb0FJM4GZAJMmTSopvNZyHXZzhnK+qniuW3G+Brt+\nJxrq8xXd/G9s2N41FBGzIqInInrGjh3b7nDMzLpW2YlglaTxAOl9dcn7NzOzPspOBDcAM9LnGcD1\nJe/fzMz6KPL20SuAO4DdJC2XdBJwLnCYpKXAYWnazMzaqLDG4og4vs5Xhxa1z3arYgPcUBXRANfN
\nna75fDWnqGPrtobjYdtYbGZm5XAiMDOrOCcCM7OKc6dzQ9TN9atF6bb61aL5fDWn7P+T3XB9XCIw\nM6s4JwIzs4pzIjAzqzi3EQyC2wWaNxzqUTvpuvl8NWe4xDocrttguERgZlZxTgRmZhXnRGBmVnFu\nI8hpuNRB1jMc6yaHY0zDmc9Xc/x/snVcIjAzqzgnAjOzinMiMDOrOLcRNDDc6yCHo06qF+3Vzuvc\niecL2he3z1cxXCIwM6s4JwIzs4pzIjAzqzi3EfThdoHmeJzm5g33+uLhptvO13A8HpcIzMwqzonA\nzKzinAjMzCrObQRUp12gVcdZlfPVSsOxXng4q8r5Gi7H6RKBmVnFORGYmVWcE4GZWcU5EZiZVVxl\nG4vd4Nkcn6/mDZeGwE5R9fPVzuN3icDMrOKcCMzMKs6JwMys4irVRuB67ub4fDWv6vXczfL56l/Z\n56UtJQJJh0v6qaRlks5sRwxmZpYpPRFIGgF8A3gf8HbgeElvLzsOMzPLtKNEsC+wLCIei4jfAFcC\n09oQh5mZAYqIcncoHQMcHhEnp+kTgd+NiI/2WW4mMDNN7gb8dAi7HQM8NYT1O5GPuRp8zNUw2GN+\na0SMHWihdjQWq595b8hGETELmNWSHUqLIqKnFdvqFD7mavAxV0PRx9yOqqHlwMSa6QnAijbEYWZm\ntCcR3A1MkbSzpM2A44Ab2hCHmZnRhqqhiNgo6aPAD4ARwLcj4sGCd9uSKqYO42OuBh9zNRR6zKU3\nFpuZ2fDiLibMzCrOicDMrOK6PhFUoTsLSRMlzZf0kKQHJZ2a5o+WNFfS0vS+XbtjbSVJIyTdI+mm\nNL2zpIXpeK9KNyN0FUmjJM2R9HC63vtX4Dqfnv5dPyDpCklbdNu1lvRtSaslPVAzr9/rqsxX02/a\nfZL2Hur+uzoRVKg7i43AGRGxB7Af8JF0nGcC8yJiCjAvTXeTU4GHaqbPAy5Ix7sWOKktURXrX4Bb\nImJ3YCrZ8XftdZa0E3AK0BMR7yC7weQ4uu9afwc4vM+8etf1fcCU9JoJfGuoO+/qREBFurOIiJUR\n8eP0+TmyH4edyI51dlpsNjC9PRG2nqQJwB8BF6VpAYcAc9IiXXW8AJJGAgcCFwNExG8iYh1dfJ2T\nTYAtJW0CbAWspMuudUTcBjzTZ3a96zoN+I/I3AmMkjR+KPvv9kSwE/BkzfTyNK9rSZoM7AUsBHaI\niJWQJQtgXPsia7kLgU8Br6Tp7YF1EbExTXfjtd4FWANckqrELpL0Zrr4OkfEL4AvA0+QJYBngcV0\n/7WG+te15b9r3Z4IcnVn0S0kbQ1cA5wWEevbHU9RJB0JrI6IxbWz+1m02671JsDewLciYi/gV3RR\nNVB/Ur34NGBnYEfgzWRVI31127VupOX/1rs9EVSmOwtJm5Ilgcsi4to0e1VvkTG9r25XfC12AHCU\npMfJqvsOISshjErVB9Cd13o5sDwiFqbpOWSJoVuvM8B7gZ9FxJqI2ABcC7yb7r/WUP+6tvx3rdsT\nQSW6s0j14xcDD0XE+TVf3QDMSJ9nANeXHVsRIuLTETEhIiaTXdMfRsQJwHzgmLRY1xxvr4j4JfCk\npN3SrEOBn9Cl1zl5AthP0lbp33nvMXf1tU7qXdcbgA+lu4f2A57trUIatIjo6hdwBPAI8Cjw2XbH\nU9AxvoesaHgfcG96HUFWbz4PWJreR7c71gKO/SDgpvR5F+AuYBlwNbB5u+Mr4Hj3BBala30dsF23\nX2fgHOBh4AHgUmDzbrvWwBVkbSAbyP7iP6nedSWrGvpG+k27n+yOqiHt311MmJlVXLdXDZmZ2QCc\nCMzMKs6JwMys4pwIzMwqzonAzKzinAis0iS9RdKVkh6V9BNJ/ylp10Fs56LeDg0lfSbnOo9LGtPs\nvsxazbePWmWlB5T+D5gdEf+a5u0JbBMR/zOE7T4fEVvnWO5xsnvAnxrsvsxawSUCq7KDgQ29SQAg\nIu4F7pE0T9KPJd0vaRpkHfqlcQBmp37g50jaKn23QFKPpHPJesq8V9Jl6bvrJC1OferPbMNxmjXk\nRGBV9g6yniz7egl4f0TsTZYsvpJKDwC7AbMi4p3AeuBva1eMiDOBFyNiz8i6vQD4y4jYB+gBTpG0\nfQHHYjZoTgRmbyTgi5LuA/6brIvfHdJ3T0bE/6bP3yXr3mMgp0haAtxJ1lnYlBbHazYkmwy8iFnX\nepDXOi6rdQIwFtgnIjakuvwt0nd9G9UaNrJJOoisB839I+IFSQtqtmU2LLhEYFX2Q2BzSR/unSHp\nXcBbycY72CDp4DTda5Kk/dPn44Hb+9nuhtQtOMC2wNqUBHYnG0rUbFhxIrDKiuyWufcDh6XbRx8E\nzgb+E+iRtIisdPBwzWoPATNStdFo+h8vdhZwX2osvgXYJC3/ebLqIbNhxbePmuWUhgG9KbJB1M26\nhksEZmYV5xKBmVnFuURgZlZxTgRmZhXnRGBmVnFOBGZmFedEYGZWcf8PGuEWwOrW2QgAAAAASUVO\nRK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting Capital vs Final Policy\n", + "\n", + "# x axis values\n", + "x = range(100)\n", + "# corresponding y axis values\n", + "y = policy\n", + " \n", + "# plotting the bars\n", + "plt.bar(x, y, align='center', alpha=0.5)\n", + " \n", + "# naming the x axis\n", + "plt.xlabel('Capital')\n", + "# naming the y axis\n", + "plt.ylabel('Final policy (stake)')\n", + " \n", + "# giving a title to the graph\n", + "plt.title('Capital vs Final Policy')\n", + " \n", + "# function to show the plot\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/DP/Gamblers Problem.ipynb b/DP/Gamblers Problem.ipynb new file mode 100644 index 000000000..3479a7b30 --- /dev/null +++ b/DP/Gamblers Problem.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", + "\n", + "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", + "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", + "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", + "or loses by running out of money. \n", + "\n", + "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", + "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", + "\n", + "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", + "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "if \"../\" not in sys.path:\n", + " sys.path.append(\"../\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "\n", + "### Exercise 4.9 (programming)\n", + "\n", + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", + " \"\"\"\n", + " Args:\n", + " p_h: Probability of the coin coming up heads\n", + " \"\"\"\n", + " \n", + " def one_step_lookahead(s, V, rewards):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " s: The gambler’s capital. Integer.\n", + " V: The vector that contains values at each state. \n", + " rewards: The reward vector.\n", + " \n", + " Returns:\n", + " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " \"\"\"\n", + " \n", + " # Implement!\n", + " \n", + " return A\n", + " \n", + " # Implement!\n", + " \n", + " return policy, V" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "policy, v = value_iteration_for_gamblers(0.25)\n", + "\n", + "print(\"Optimized Policy:\")\n", + "print(policy)\n", + "print(\"\")\n", + "\n", + "print(\"Optimized Value Function:\")\n", + "print(v)\n", + "print(\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Plotting Final Policy (action stake) vs State (Capital)\n", + "\n", + "# Implement!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Plotting Capital vs Final Policy\n", + "\n", + "# Implement!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/DP/README.md b/DP/README.md index cf2cbf51f..ae2f6e6c5 100644 --- a/DP/README.md +++ b/DP/README.md @@ -44,3 +44,7 @@ - Implement Value Iteration in Python (Gridworld) - [Exercise](Value%20Iteration.ipynb) - [Solution](Value%20Iteration%20Solution.ipynb) + +- Implement Gambler's Problem + - [Exercise](Gamblers%20Problem.ipynb) + - [Solution](Gamblers%20Problem%20Solution.ipynb) \ No newline at end of file From be7cfe308e9e5b24146ed9dc41d7a68981613c33 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sun, 27 May 2018 22:33:44 -0700 Subject: [PATCH 36/56] just formatting --- DP/Gamblers Problem Solution.ipynb | 14 +++++++++----- DP/Gamblers Problem.ipynb | 12 ++++++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/DP/Gamblers Problem Solution.ipynb b/DP/Gamblers Problem Solution.ipynb index d3880ef80..4e96a4885 100644 --- a/DP/Gamblers Problem Solution.ipynb +++ b/DP/Gamblers Problem Solution.ipynb @@ -18,7 +18,9 @@ "\n", "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", - "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + "\n", + "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" ] }, { @@ -61,7 +63,8 @@ " Args:\n", " p_h: Probability of the coin coming up heads\n", " \"\"\"\n", - " # The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + " # The reward is zero on all transitions except those on which the gambler reaches his goal,\n", + " # when it is +1.\n", " rewards = np.zeros(101)\n", " rewards[100] = 1 \n", " \n", @@ -78,15 +81,16 @@ " rewards: The reward vector.\n", " \n", " Returns:\n", - " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " A vector containing the expected value of each action. 
\n", + " Its length equals to the number of actions.\n", " \"\"\"\n", " A = np.zeros(101)\n", " stakes = range(1, min(s, 100-s)+1) # Your minimum bet is 1, maximum bet is min(s, 100-s).\n", " for a in stakes:\n", " # rewards[s+a], rewards[s-a] are immediate rewards.\n", " # V[s+a], V[s-a] are values of the next states.\n", - " # This is the core of the Bellman equation: \n", - " # The expected value of your action is the sum of immediate rewards and the value of the next state.\n", + " # This is the core of the Bellman equation: The expected value of your action is \n", + " # the sum of immediate rewards and the value of the next state.\n", " A[a] = p_h * (rewards[s+a] + V[s+a]*discount_factor) + (1-p_h) * (rewards[s-a] + V[s-a]*discount_factor)\n", " return A\n", " \n", diff --git a/DP/Gamblers Problem.ipynb b/DP/Gamblers Problem.ipynb index 3479a7b30..0ed86294d 100644 --- a/DP/Gamblers Problem.ipynb +++ b/DP/Gamblers Problem.ipynb @@ -18,7 +18,9 @@ "\n", "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", - "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + "\n", + "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" ] }, { @@ -45,12 +47,13 @@ "\n", "### Exercise 4.9 (programming)\n", "\n", - "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55." + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55.\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -72,7 +75,8 @@ " rewards: The reward vector.\n", " \n", " Returns:\n", - " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " A vector containing the expected value of each action. 
\n", + " Its length equals to the number of actions.\n", " \"\"\"\n", " \n", " # Implement!\n", From 4f0d9428597bc64c68b2d4fc71025a8de48a08d7 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Mon, 28 May 2018 16:46:49 -0700 Subject: [PATCH 37/56] updated the broken link --- DQN/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/README.md b/DQN/README.md index 7d0464727..07c887bbc 100644 --- a/DQN/README.md +++ b/DQN/README.md @@ -23,7 +23,7 @@ **Required:** - [Human-Level Control through Deep Reinforcement Learning](http://www.readcube.com/articles/10.1038/nature14236) -- [Demystifying Deep Reinforcement Learning](https://www.nervanasys.com/demystifying-deep-reinforcement-learning/) +- [Demystifying Deep Reinforcement Learning](https://ai.intel.com/demystifying-deep-reinforcement-learning/) - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) **Optional:** From fe3edfc570aa5d5150f9abfa0a728898b1b503a9 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Mon, 28 May 2018 20:15:01 -0700 Subject: [PATCH 38/56] fix #89 --- DQN/Breakout Playground.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/Breakout Playground.ipynb b/DQN/Breakout Playground.ipynb index 5ff6a9e99..4e1a48ed8 100644 --- a/DQN/Breakout Playground.ipynb +++ b/DQN/Breakout Playground.ipynb @@ -73,7 +73,7 @@ ], "source": [ "print(\"Action space size: {}\".format(env.action_space.n))\n", - "print(env.get_action_meanings())\n", + "print(env.get_action_meanings()) # env.unwrapped.get_action_meanings() for gym 0.8.0 or later\n", "\n", "observation = env.reset()\n", "print(\"Observation space shape: {}\".format(observation.shape))\n", From 49631ce5b0afdcd23170026563ded145982c02f8 Mon Sep 17 00:00:00 2001 From: Sharwon Pius Date: Fri, 21 Sep 2018 04:06:26 +0530 Subject: [PATCH 39/56] Update README.md Added CS885 Reinforcement Learning course from University of Waterloo. One of the most comprehensive RL courses. 
--- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 72a11e5a9..61fe51bc3 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Classes: - [David Silver's Reinforcement Learning Course (UCL, 2015)](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) - [CS294 - Deep Reinforcement Learning (Berkeley, Fall 2015)](http://rll.berkeley.edu/deeprlcourse/) - [CS 8803 - Reinforcement Learning (Georgia Tech)](https://www.udacity.com/course/reinforcement-learning--ud600) +- [CS885 - Reinforcement Learning (UWaterloo), Spring 2018](https://cs.uwaterloo.ca/~ppoupart/teaching/cs885-spring18/) Talks/Tutorials: From b47c9206b6f35ebabeecaa44ac10ec5fa97ff239 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Mon, 24 Dec 2018 10:56:34 -0800 Subject: [PATCH 40/56] updates to README.md * Added UC Berkley class resources * Added OpenAI spinning up resources --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 61fe51bc3..65c0ec623 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ Classes: - [CS294 - Deep Reinforcement Learning (Berkeley, Fall 2015)](http://rll.berkeley.edu/deeprlcourse/) - [CS 8803 - Reinforcement Learning (Georgia Tech)](https://www.udacity.com/course/reinforcement-learning--ud600) - [CS885 - Reinforcement Learning (UWaterloo), Spring 2018](https://cs.uwaterloo.ca/~ppoupart/teaching/cs885-spring18/) +- [CS294-112 - Deep Reinforcement Learning (UC Berkeley)](http://rail.eecs.berkeley.edu/deeprlcourse/) Talks/Tutorials: @@ -67,6 +68,7 @@ Talks/Tutorials: - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) - [John Schulman - Deep Reinforcement Learning (4 Lectures)](https://www.youtube.com/playlist?list=PLjKEIQlKCTZYN3CYBlj8r58SbNorobqcp) - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) +- [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) Other Projects: From 57f71cd4b97df36988dc1cbc868b45a2970d19d6 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:42:14 -0800 Subject: [PATCH 41/56] imported io so that StringIO() would work --- lib/envs/gridworld.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index ea96ddbb0..4611ae495 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -1,3 +1,4 @@ +import io import numpy as np import sys from gym.envs.toy_text import discrete @@ -86,7 +87,7 @@ def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout grid = np.arange(self.nS).reshape(self.shape) it = np.nditer(grid, flags=['multi_index']) @@ -102,7 +103,7 @@ def _render(self, mode='human', close=False): output = " o " if x == 0: - output = output.lstrip() + output = output.lstrip() if x == self.shape[1] - 1: output = output.rstrip() @@ -111,4 +112,4 @@ def _render(self, mode='human', close=False): if x == self.shape[1] - 1: outfile.write("\n") - it.iternext() \ No newline at end of file + it.iternext() From 9ad2689f9e638e645e6b71d8198f9f733d7142b5 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:42:39 -0800 Subject: [PATCH 42/56] added documentation for _render() --- lib/envs/gridworld.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 4611ae495..72bd92eb7 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -84,6 +84,15 @@ def __init__(self, shape=[4,4]): super(GridworldEnv, self).__init__(nS, nA, P, isd) def _render(self, mode='human', close=False): + """ Renders the current gridworld layout + + For example, a 4x4 grid with the mode="human" looks like: + T o o o + o x o o + o o o o + o o o T + where x is your position and T are the two terminal states. + """ if close: return From 0fe550c4ad64337a0ccef5c244bbe2ad5a51a8d1 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:48:02 -0800 Subject: [PATCH 43/56] documented structure for P[s][a] --- lib/envs/gridworld.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 72bd92eb7..22c5ed538 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -31,6 +31,10 @@ class GridworldEnv(discrete.DiscreteEnv): metadata = {'render.modes': ['human', 'ansi']} def __init__(self, shape=[4,4]): + """ + + + """ if not isinstance(shape, (list, tuple)) or not len(shape) == 2: raise ValueError('shape argument must be a list/tuple of length 2') @@ -50,6 +54,7 @@ def __init__(self, shape=[4,4]): s = it.iterindex y, x = it.multi_index + # P[s][a] = (prob, next_state, reward, done) P[s] = {a : [] for a in range(nA)} is_done = lambda s: s == 0 or s == (nS - 1) From 30b230436c47e194f0209fad29f907146c739610 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:50:53 -0800 Subject: [PATCH 44/56] removed extra whitespace --- lib/envs/gridworld.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 22c5ed538..c28882eb5 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -31,10 +31,6 @@ class GridworldEnv(discrete.DiscreteEnv): metadata = {'render.modes': ['human', 'ansi']} def __init__(self, shape=[4,4]): - """ - - - """ if not isinstance(shape, (list, tuple)) or not len(shape) == 2: raise ValueError('shape argument must be a list/tuple of length 2') From 01b8b1379a3c4a79ec14528b52b1c6c7608bbbaf Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:51:57 -0800 Subject: [PATCH 45/56] nit --- lib/envs/gridworld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index c28882eb5..5eede9af0 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -50,7 +50,7 @@ def __init__(self, shape=[4,4]): s = it.iterindex y, x = it.multi_index - # P[s][a] = (prob, next_state, reward, done) + # P[s][a] = (prob, next_state, reward, is_done) P[s] = {a : [] for a in range(nA)} is_done = lambda s: s == 0 or s == (nS - 1) From 120fbcfb640afff960f741ae0af56a148955f9dd Mon Sep 17 00:00:00 2001 From: Stas Olekhnovich Date: Wed, 27 Feb 2019 17:33:02 +0100 Subject: [PATCH 46/56] Add link to Advanced Depp Learning & Reinforcement Learning lectures on youtube. Lectures were recorded at UCL in 2018. 
Lecturer for RL is Hado Vab Hasselt from DeepMind --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 65c0ec623..f9f1abe87 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Talks/Tutorials: - [John Schulman - Deep Reinforcement Learning (4 Lectures)](https://www.youtube.com/playlist?list=PLjKEIQlKCTZYN3CYBlj8r58SbNorobqcp) - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) - [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) +- [Advanced Deep Learning & Reinforcement Learning (UCL 2018, DeepMind)](https://www.youtube.com/playlist?list=PLqYmG7hTraZDNJre23vqCGIVpfZ_K2RZs) Other Projects: From 4a2df43bb111ec319ea11cfd68a1d79c7dcceceb Mon Sep 17 00:00:00 2001 From: Aleks K Date: Fri, 1 Mar 2019 13:31:13 +1100 Subject: [PATCH 47/56] fixed shape descriptions for neural network input layer --- DQN/Deep Q Learning Solution.ipynb | 6 +++--- DQN/Deep Q Learning.ipynb | 6 +++--- DQN/Double DQN Solution.ipynb | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/DQN/Deep Q Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index fc88b90ae..90881ea07 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -117,7 +117,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -166,7 +166,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -180,7 +180,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index d3a51697f..2b77605c8 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -110,7 +110,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -160,7 +160,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -174,7 +174,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 3fc45722b..f53ca59a6 100644 
--- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -109,7 +109,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -136,7 +136,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -150,7 +150,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", From a35df152681152ecd11f88d50dded4d9879f06d2 Mon Sep 17 00:00:00 2001 From: Piero Macaluso Date: Wed, 13 Mar 2019 09:55:50 +0100 Subject: [PATCH 48/56] Updated links to new version of Sutton's book --- DP/README.md | 2 +- FA/README.md | 4 ++-- Introduction/README.md | 2 +- MC/README.md | 2 +- MDP/README.md | 2 +- PolicyGradient/README.md | 2 +- README.md | 4 ++-- TD/README.md | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/DP/README.md b/DP/README.md index ae2f6e6c5..a6dabe88c 100644 --- a/DP/README.md +++ b/DP/README.md @@ -28,7 +28,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 4: Dynamic Programming +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 4: Dynamic Programming ### Exercises diff --git a/FA/README.md b/FA/README.md index 247c41e4e..a8456622d 100644 --- a/FA/README.md +++ b/FA/README.md @@ -25,8 +25,8 @@ **Required:** - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 9: On-policy Prediction with Approximation -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 10: On-policy Control with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 9: On-policy Prediction with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 10: On-policy Control with Approximation **Optional:** diff --git a/Introduction/README.md b/Introduction/README.md index cd27a4e12..ca8897826 100644 --- a/Introduction/README.md +++ b/Introduction/README.md @@ -17,7 +17,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 1: The Reinforcement Learning Problem +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 1: The Reinforcement Learning Problem - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) - [OpenAI Gym 
Tutorial](https://gym.openai.com/docs) diff --git a/MC/README.md b/MC/README.md index 7b889ed6f..8f246c38d 100644 --- a/MC/README.md +++ b/MC/README.md @@ -26,7 +26,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 5: Monte Carlo Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 5: Monte Carlo Methods **Optional:** diff --git a/MDP/README.md b/MDP/README.md index de9bcce35..08e73d072 100644 --- a/MDP/README.md +++ b/MDP/README.md @@ -25,7 +25,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 3: Finite Markov Decision Processes +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 3: Finite Markov Decision Processes - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index a7dffdeef..e8e793b77 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 13: Policy Gradient Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 13: Policy Gradient Methods - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) diff --git a/README.md b/README.md index f9f1abe87..82009e229 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. 
@@ -50,7 +50,7 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https Textbooks: -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) Classes: diff --git a/TD/README.md b/TD/README.md index a4c35a0e9..9b34caecc 100644 --- a/TD/README.md +++ b/TD/README.md @@ -28,14 +28,14 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 6: Temporal-Difference Learning +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 6: Temporal-Difference Learning - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 7: Multi-Step Bootstrapping -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 12: Eligibility Traces +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 7: Multi-Step Bootstrapping +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 12: Eligibility Traces ### Exercises From bb9241dbbbdabb8dd1d6c116f7120e46459a87fd Mon Sep 17 00:00:00 2001 From: Stanislav Olekhnovich Date: Fri, 29 Mar 2019 18:17:22 +0100 Subject: [PATCH 49/56] Fix rendering crash on Win 10 It was crashing on my win10 PC, I found a fix https://github.com/openai/gym/issues/1056 and applied it. --- FA/MountainCar Playground.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FA/MountainCar Playground.ipynb b/FA/MountainCar Playground.ipynb index 9b4fe3a36..914f7a5a0 100644 --- a/FA/MountainCar Playground.ipynb +++ b/FA/MountainCar Playground.ipynb @@ -71,7 +71,7 @@ "plt.figure()\n", "plt.imshow(env.render(mode='rgb_array'))\n", "\n", - "env.render(close=True)" + "env.close()" ] }, { From 1abaae41f6bf751c66d555c04de9b304f8ef8abc Mon Sep 17 00:00:00 2001 From: Michael Anuzis Date: Tue, 2 Apr 2019 17:15:57 -0400 Subject: [PATCH 50/56] Q-Learning docstring improvements. --- TD/Q-Learning Solution.ipynb | 4 ++-- TD/Q-Learning.ipynb | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/TD/Q-Learning Solution.ipynb b/TD/Q-Learning Solution.ipynb index 4c1c5be2c..f2da32351 100644 --- a/TD/Q-Learning Solution.ipynb +++ b/TD/Q-Learning Solution.ipynb @@ -50,7 +50,7 @@ " Args:\n", " Q: A dictionary that maps from state -> action-values.\n", " Each value is a numpy array of length nA (see below)\n", - " epsilon: The probability to select a random action . float between 0 and 1.\n", + " epsilon: The probability to select a random action. Float between 0 and 1.\n", " nA: Number of actions in the environment.\n", " \n", " Returns:\n", @@ -82,7 +82,7 @@ " num_episodes: Number of episodes to run for.\n", " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", - " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", + " epsilon: Chance to sample a random action. Float between 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, episode_lengths).\n", diff --git a/TD/Q-Learning.ipynb b/TD/Q-Learning.ipynb index 4e1396cf6..ddd33c756 100644 --- a/TD/Q-Learning.ipynb +++ b/TD/Q-Learning.ipynb @@ -49,7 +49,7 @@ " Args:\n", " Q: A dictionary that maps from state -> action-values.\n", " Each value is a numpy array of length nA (see below)\n", - " epsilon: The probability to select a random action . float between 0 and 1.\n", + " epsilon: The probability to select a random action. Float between 0 and 1.\n", " nA: Number of actions in the environment.\n", " \n", " Returns:\n", @@ -81,7 +81,7 @@ " num_episodes: Number of episodes to run for.\n", " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", - " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", + " epsilon: Chance to sample a random action. Float between 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, episode_lengths).\n", From b2d179a1fe2fc8ee5b01e9f9b5ecadaf9139ada7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=89=93=E4=B8=8D=E5=8A=A8=EF=BC=81?= <779222056@qq.com> Date: Tue, 11 Jun 2019 15:58:21 +0800 Subject: [PATCH 51/56] Update CliffWalk REINFORCE with Baseline Solution.ipynb --- PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb index cad46261d..fb7707846 100644 --- a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb +++ b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb @@ -161,7 +161,7 @@ " Transition = collections.namedtuple(\"Transition\", [\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n", " \n", " for i_episode in range(num_episodes):\n", - " # Reset the environment and pick the fisrst action\n", + " # Reset the environment and pick the first action\n", " state = env.reset()\n", " \n", " episode = []\n", From 775fd81e82fa900f87d35309f937d1102ed9fc57 Mon Sep 17 00:00:00 2001 From: nsydn Date: Tue, 1 Oct 2019 18:04:03 +0300 Subject: [PATCH 52/56] Update Policy Iteration Solution.ipynb --- DP/Policy Iteration Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index dc121c8c5..076894169 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -82,7 +82,7 @@ " until an optimal policy is found.\n", " \n", " Args:\n", - " env: The OpenAI envrionment.\n", + " env: The OpenAI environment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", " discount_factor: gamma discount factor.\n", From 7d232607e63743c7c1b9ff912f4e0084bbf3e616 Mon Sep 17 00:00:00 2001 From: nsydn Date: Tue, 1 Oct 2019 18:13:40 +0300 Subject: [PATCH 53/56] Update Policy Iteration Solution.ipynb --- DP/Policy Iteration Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index 076894169..73009f000 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -124,7 +124,7 @@ " \n", " # For each state...\n", " for s in range(env.nS):\n", - " # The best action we would take under the currect policy\n", + " # The best action we would take under 
the current policy\n", " chosen_a = np.argmax(policy[s])\n", " \n", " # Find the best action by one-step lookahead\n", From 1298c8ddd60331ab7457bb7fa6fbc42ebecf8f0c Mon Sep 17 00:00:00 2001 From: Roshan Ray Date: Fri, 8 Nov 2019 08:56:53 +0530 Subject: [PATCH 54/56] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 82009e229..8a89bd765 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ Talks/Tutorials: - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) - [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) - [Advanced Deep Learning & Reinforcement Learning (UCL 2018, DeepMind)](https://www.youtube.com/playlist?list=PLqYmG7hTraZDNJre23vqCGIVpfZ_K2RZs) +-[Deep RL Bootcamp](https://sites.google.com/view/deep-rl-bootcamp/lectures) Other Projects: From 40eda6b1ca56cb41d99b1eae70a74774d016fa5f Mon Sep 17 00:00:00 2001 From: "Ariel S. Boiardi" Date: Mon, 19 Sep 2022 19:19:46 +0200 Subject: [PATCH 55/56] Compatible with gym==0.26 --- lib/envs/cliff_walking.py | 9 +++++-- lib/envs/discrete.py | 51 +++++++++++++++++++++++++++++++++++++ lib/envs/gridworld.py | 5 +++- lib/envs/windy_gridworld.py | 8 ++++-- 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 lib/envs/discrete.py diff --git a/lib/envs/cliff_walking.py b/lib/envs/cliff_walking.py index 30b2ff7bb..bbae6c80d 100644 --- a/lib/envs/cliff_walking.py +++ b/lib/envs/cliff_walking.py @@ -1,6 +1,11 @@ +import io import numpy as np import sys -from gym.envs.toy_text import discrete +if "../.." not in sys.path: + sys.path.append("../..") + +from lib.envs import discrete + UP = 0 @@ -60,7 +65,7 @@ def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) diff --git a/lib/envs/discrete.py b/lib/envs/discrete.py new file mode 100644 index 000000000..64455fc00 --- /dev/null +++ b/lib/envs/discrete.py @@ -0,0 +1,51 @@ +import numpy as np + +from gym import Env, spaces +from gym.utils import seeding +from gym.envs.toy_text.utils import categorical_sample + +class DiscreteEnv(Env): + + """ + Has the following members + - nS: number of states + - nA: number of actions + - P: transitions (*) + - isd: initial state distribution (**) + + (*) dictionary of lists, where + P[s][a] == [(probability, nextstate, reward, done), ...] 
+ (**) list or array of length nS + + + """ + + def __init__(self, nS, nA, P, isd): + self.P = P + self.isd = isd + self.lastaction = None # for rendering + self.nS = nS + self.nA = nA + + self.action_space = spaces.Discrete(self.nA) + self.observation_space = spaces.Discrete(self.nS) + + self.seed() + self.s = categorical_sample(self.isd, self.np_random) + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def reset(self): + self.s = categorical_sample(self.isd, self.np_random) + self.lastaction = None + return int(self.s) + + def step(self, a): + transitions = self.P[self.s][a] + i = categorical_sample([t[0] for t in transitions], self.np_random) + p, s, r, d = transitions[i] + self.s = s + self.lastaction = a + return (int(s), r, d, {"prob": p}) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 5eede9af0..6c559f918 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -1,7 +1,10 @@ import io import numpy as np import sys -from gym.envs.toy_text import discrete +if "../.." not in sys.path: + sys.path.append("../..") + +from lib.envs import discrete UP = 0 RIGHT = 1 diff --git a/lib/envs/windy_gridworld.py b/lib/envs/windy_gridworld.py index 720c5974b..4b307decd 100644 --- a/lib/envs/windy_gridworld.py +++ b/lib/envs/windy_gridworld.py @@ -1,7 +1,11 @@ +import io import gym import numpy as np import sys -from gym.envs.toy_text import discrete +if "../.." not in sys.path: + sys.path.append("../..") + +from lib.envs import discrete UP = 0 RIGHT = 1 @@ -60,7 +64,7 @@ def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) From d173521920759490516fe0738955f491fd373b71 Mon Sep 17 00:00:00 2001 From: "Ariel S. Boiardi" Date: Tue, 20 Sep 2022 10:46:18 +0200 Subject: [PATCH 56/56] Corrected import --- lib/envs/cliff_walking.py | 6 +----- lib/envs/gridworld.py | 4 +--- lib/envs/windy_gridworld.py | 4 +--- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/lib/envs/cliff_walking.py b/lib/envs/cliff_walking.py index bbae6c80d..2c677662c 100644 --- a/lib/envs/cliff_walking.py +++ b/lib/envs/cliff_walking.py @@ -1,12 +1,8 @@ import io import numpy as np import sys -if "../.." not in sys.path: - sys.path.append("../..") - -from lib.envs import discrete - +from . import discrete UP = 0 RIGHT = 1 diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 6c559f918..64a5be602 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -1,10 +1,8 @@ import io import numpy as np import sys -if "../.." not in sys.path: - sys.path.append("../..") -from lib.envs import discrete +from . import discrete UP = 0 RIGHT = 1 diff --git a/lib/envs/windy_gridworld.py b/lib/envs/windy_gridworld.py index 4b307decd..6ac49cab3 100644 --- a/lib/envs/windy_gridworld.py +++ b/lib/envs/windy_gridworld.py @@ -2,10 +2,8 @@ import gym import numpy as np import sys -if "../.." not in sys.path: - sys.path.append("../..") -from lib.envs import discrete +from . import discrete UP = 0 RIGHT = 1
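
Note on the gym==0.26 compatibility patches above: the toy-text environments now subclass the vendored DiscreteEnv defined in lib/envs/discrete.py (pulled in via `from . import discrete`) rather than the discrete module that used to ship under gym.envs.toy_text. The sketch below is a minimal, illustrative exercise of that vendored class and is not part of any patch; the TwoStateEnv name and its transition table are invented for the example, and the import assumes the repository root is on sys.path.

    # Illustrative only -- a hypothetical two-state MDP built on the vendored
    # DiscreteEnv, using nothing beyond the interface shown in lib/envs/discrete.py
    # (nS, nA, P, isd; reset(); a 4-tuple step()).
    import numpy as np

    from lib.envs.discrete import DiscreteEnv  # assumes the repo root is on sys.path


    class TwoStateEnv(DiscreteEnv):
        """Toy MDP: from state 0, action 1 reaches terminal state 1 with reward 1."""

        def __init__(self):
            nS, nA = 2, 2
            # P[s][a] == [(probability, next_state, reward, done), ...]
            P = {
                0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, True)]},
                1: {0: [(1.0, 1, 0.0, True)], 1: [(1.0, 1, 0.0, True)]},
            }
            isd = np.array([1.0, 0.0])  # always start in state 0
            super().__init__(nS, nA, P, isd)


    env = TwoStateEnv()
    state = env.reset()                            # returns a plain int, not (obs, info)
    next_state, reward, done, info = env.step(1)   # old-style 4-tuple step
    print(state, next_state, reward, done, info["prob"])

One design note: because the vendored class preserves the pre-0.26 reset()/step() signatures, the notebooks that consume these environments should not need changes beyond the import moves made in the last two patches.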