From 71223d590a517f3d9596bfe13a6bfb6cb7d8b9ff Mon Sep 17 00:00:00 2001 From: Kismuz Date: Sat, 8 Jul 2017 16:53:15 +0300 Subject: [PATCH 01/56] DQN copy_model_parameters memory leak fixed, tensorboard summaries updated with cpu/mem usage --- DQN/Deep Q Learning Solution.ipynb | 120 +++++++++++++++++------------ 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/DQN/Deep Q Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index 7dd832212..7cf615137 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -17,6 +17,7 @@ "import os\n", "import random\n", "import sys\n", + "import psutil\n", "import tensorflow as tf\n", "\n", "if \"../\" not in sys.path:\n", @@ -29,9 +30,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -40,9 +39,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -86,9 +83,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -198,9 +193,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -234,30 +227,39 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ - "def copy_model_parameters(sess, estimator1, estimator2):\n", + "class ModelParametersCopier():\n", " \"\"\"\n", - " Copies the model parameters of one estimator to another.\n", - "\n", - " Args:\n", - " sess: Tensorflow session instance\n", - " estimator1: Estimator to copy the paramters from\n", - " estimator2: Estimator to copy the parameters to\n", + " Copy model parameters of one estimator to another.\n", " \"\"\"\n", - " e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]\n", - " e1_params = sorted(e1_params, key=lambda v: v.name)\n", - " e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]\n", - " e2_params = sorted(e2_params, key=lambda v: v.name)\n", - "\n", - " update_ops = []\n", - " for e1_v, e2_v in zip(e1_params, e2_params):\n", - " op = e2_v.assign(e1_v)\n", - " update_ops.append(op)\n", - "\n", - " sess.run(update_ops)" + " \n", + " def __init__(self, estimator1, estimator2):\n", + " \"\"\"\n", + " Defines copy-work operation graph. 
\n", + " Args:\n", + " estimator1: Estimator to copy the paramters from\n", + " estimator2: Estimator to copy the parameters to\n", + " \"\"\"\n", + " e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]\n", + " e1_params = sorted(e1_params, key=lambda v: v.name)\n", + " e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]\n", + " e2_params = sorted(e2_params, key=lambda v: v.name)\n", + "\n", + " self.update_ops = []\n", + " for e1_v, e2_v in zip(e1_params, e2_params):\n", + " op = e2_v.assign(e1_v)\n", + " self.update_ops.append(op)\n", + " \n", + " def make(self, sess):\n", + " \"\"\"\n", + " Makes copy.\n", + " Args:\n", + " sess: Tensorflow session instance\n", + " \"\"\"\n", + " sess.run(self.update_ops)" ] }, { @@ -293,9 +295,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -347,11 +347,17 @@ "\n", " # The replay memory\n", " replay_memory = []\n", + " \n", + " # Make model copier object\n", + " estimator_copy = ModelParametersCopier(q_estimator, target_estimator)\n", "\n", " # Keeps track of useful statistics\n", " stats = plotting.EpisodeStats(\n", " episode_lengths=np.zeros(num_episodes),\n", " episode_rewards=np.zeros(num_episodes))\n", + " \n", + " # For 'system/' summaries, usefull to check if currrent process looks healthy\n", + " current_process = psutil.Process()\n", "\n", " # Create directories for checkpoints and summaries\n", " checkpoint_dir = os.path.join(experiment_dir, \"checkpoints\")\n", @@ -422,14 +428,9 @@ " # Epsilon for this time step\n", " epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]\n", "\n", - " # Add epsilon to Tensorboard\n", - " episode_summary = tf.Summary()\n", - " episode_summary.value.add(simple_value=epsilon, tag=\"epsilon\")\n", - " q_estimator.summary_writer.add_summary(episode_summary, total_t)\n", - "\n", " # Maybe update the target estimator\n", " if total_t % update_target_estimator_every == 0:\n", - " copy_model_parameters(sess, q_estimator, target_estimator)\n", + " estimator_copy.make(sess)\n", " print(\"\\nCopied model parameters to target network.\")\n", "\n", " # Print out which step we're on, useful for debugging.\n", @@ -475,11 +476,14 @@ "\n", " # Add summaries to tensorboard\n", " episode_summary = tf.Summary()\n", - " episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name=\"episode_reward\", tag=\"episode_reward\")\n", - " episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name=\"episode_length\", tag=\"episode_length\")\n", - " q_estimator.summary_writer.add_summary(episode_summary, total_t)\n", + " episode_summary.value.add(simple_value=epsilon, tag=\"episode/epsilon\")\n", + " episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag=\"episode/reward\")\n", + " episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag=\"episode/length\")\n", + " episode_summary.value.add(simple_value=current_process.cpu_percent(), tag=\"system/cpu_usage_percent\")\n", + " episode_summary.value.add(simple_value=current_process.memory_percent(memtype=\"vms\"), tag=\"system/v_memeory_usage_percent\")\n", + " q_estimator.summary_writer.add_summary(episode_summary, i_episode)\n", " q_estimator.summary_writer.flush()\n", - "\n", + " \n", " yield total_t, plotting.EpisodeStats(\n", " episode_lengths=stats.episode_lengths[:i_episode+1],\n", " 
episode_rewards=stats.episode_rewards[:i_episode+1])\n", @@ -490,9 +494,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -504,7 +506,7 @@ "global_step = tf.Variable(0, name='global_step', trainable=False)\n", " \n", "# Create estimators\n", - "q_estimator = Estimator(scope=\"q\", summaries_dir=experiment_dir)\n", + "q_estimator = Estimator(scope=\"q_estimator\", summaries_dir=experiment_dir)\n", "target_estimator = Estimator(scope=\"target_q\")\n", "\n", "# State processor\n", @@ -531,6 +533,24 @@ "\n", " print(\"\\nEpisode Reward: {}\".format(stats.episode_rewards[-1]))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { @@ -549,9 +569,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.6.0" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 2b576bd992014977238bb344a3a07a1145eda31f Mon Sep 17 00:00:00 2001 From: sstarzycki Date: Fri, 21 Jul 2017 07:59:04 +0200 Subject: [PATCH 02/56] Update description of env.P[s][a] env.P[s][a] is not a tuple but rather a list of tuples (as probability in that tuple wouldn't make sense otherwise). --- DP/Policy Evaluation.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index e401c7759..9c3e0cdd3 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -41,7 +41,7 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", " discount_factor: gamma discount factor.\n", " \n", From 1f04c1d7606c8e607c7d9213745f515a9fc2baf4 Mon Sep 17 00:00:00 2001 From: himanshusahni Date: Wed, 4 Oct 2017 18:58:13 -0400 Subject: [PATCH 03/56] bind worker within lambda to avoid running worker twice --- PolicyGradient/a3c/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/a3c/train.py b/PolicyGradient/a3c/train.py index 28064e506..286ca7a9d 100755 --- a/PolicyGradient/a3c/train.py +++ b/PolicyGradient/a3c/train.py @@ -125,7 +125,7 @@ def make_env(wrap=True): # Start worker threads worker_threads = [] for worker in workers: - worker_fn = lambda: worker.run(sess, coord, FLAGS.t_max) + worker_fn = lambda worker=worker: worker.run(sess, coord, FLAGS.t_max) t = threading.Thread(target=worker_fn) t.start() worker_threads.append(t) From bc7ee056e33518aea8d685ea43c12b8426993a96 Mon Sep 17 00:00:00 2001 From: himanshusahni Date: Tue, 10 Oct 2017 01:39:40 -0400 Subject: [PATCH 04/56] worker name scope should have trailing backslash otherwise any worker 10-19 will clash in scope with worker 1, and so on. 
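
A minimal illustration of the clash described above — not part of the patch, and the
variable names are hypothetical. The scope filtering used here behaves like prefix
matching on variable names, which is why the trailing '/' added in the diff below is
needed:

    # Hypothetical variable names; without the trailing '/' the prefix "worker_1"
    # also picks up variables belonging to worker_10..worker_19.
    names = ["worker_1/W:0", "worker_10/W:0", "worker_19/W:0"]
    print([n for n in names if n.startswith("worker_1")])    # all three names match
    print([n for n in names if n.startswith("worker_1/")])   # only worker_1's variable
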
--- PolicyGradient/a3c/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/a3c/worker.py b/PolicyGradient/a3c/worker.py index 5f310ac3c..6371558f2 100644 --- a/PolicyGradient/a3c/worker.py +++ b/PolicyGradient/a3c/worker.py @@ -85,7 +85,7 @@ def __init__(self, name, env, policy_net, value_net, global_counter, discount_fa # Op to copy params from global policy/valuenets self.copy_params_op = make_copy_params_op( tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES), - tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES)) + tf.contrib.slim.get_variables(scope=self.name+'/', collection=tf.GraphKeys.TRAINABLE_VARIABLES)) self.vnet_train_op = make_train_op(self.value_net, self.global_value_net) self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net) From 3611ec96b7429058a49c2478082506e90729ad3e Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Tue, 31 Oct 2017 23:53:06 -0400 Subject: [PATCH 05/56] Fixed some of the issues with the DQN script as pointed out in #117 --- DQN/dqn.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/DQN/dqn.py b/DQN/dqn.py index 7b459240d..81d4aa58d 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -1,4 +1,5 @@ import gym +form gym.wrappers import Monitor import itertools import numpy as np import os @@ -28,7 +29,7 @@ def __init__(self): self.output = tf.image.rgb_to_grayscale(self.input_state) self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) self.output = tf.image.resize_images( - self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) self.output = tf.squeeze(self.output) def process(self, sess, state): @@ -292,9 +293,11 @@ def deep_q_learning(sess, state = next_state # Record videos - env.monitor.start(monitor_path, - resume=True, - video_callable=lambda count: count % record_video_every == 0) + # Use the gym env Monitor wrapper + env = Monitor(env, + directory=monitor_path, + resume=True, + video_callable=lambda count: count % record_video_every ==0) for i_episode in range(num_episodes): From e9068bfe7a5d7b1f32e710a95a99df5c441b039d Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Tue, 31 Oct 2017 23:59:14 -0400 Subject: [PATCH 06/56] Updated to support recent versions of TF. Removed deprecated functions. 
Pointed out in #117 --- DQN/dqn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/DQN/dqn.py b/DQN/dqn.py index 81d4aa58d..80466556c 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -60,7 +60,7 @@ def __init__(self, scope="estimator", summaries_dir=None): summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope)) if not os.path.exists(summary_dir): os.makedirs(summary_dir) - self.summary_writer = tf.train.SummaryWriter(summary_dir) + self.summary_writer = tf.summary.FileWriter(summary_dir) def _build_model(self): """ @@ -104,11 +104,11 @@ def _build_model(self): self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) # Summaries for Tensorboard - self.summaries = tf.merge_summary([ - tf.scalar_summary("loss", self.loss), - tf.histogram_summary("loss_hist", self.losses), - tf.histogram_summary("q_values_hist", self.predictions), - tf.scalar_summary("max_q_value", tf.reduce_max(self.predictions)) + self.summaries = tf.summary.merge([ + tf.summary.scalar("loss", self.loss), + tf.summary.histogram("loss_hist", self.losses), + tf.summary.histogram("q_values_hist", self.predictions), + tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions)) ]) @@ -401,7 +401,7 @@ def deep_q_learning(sess, state_processor = StateProcessor() with tf.Session() as sess: - sess.run(tf.initialize_all_variables()) + sess.run(tf.global_variables_initializer()) for t, stats in deep_q_learning(sess, env, q_estimator=q_estimator, From 0b2ae4144b817c871153283188b270484aeeb7c2 Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Wed, 1 Nov 2017 00:13:25 -0400 Subject: [PATCH 07/56] Fixed issues with the DQN in the exercise notebook --- DQN/Deep Q Learning.ipynb | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index d2a295cf1..c3210d2ad 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -11,6 +11,7 @@ "%matplotlib inline\n", "\n", "import gym\n", + "from gym.wrappers import Monitor\n", "import itertools\n", "import numpy as np\n", "import os\n", @@ -67,7 +68,7 @@ " self.output = tf.image.rgb_to_grayscale(self.input_state)\n", " self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)\n", " self.output = tf.image.resize_images(\n", - " self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", + " self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", " self.output = tf.squeeze(self.output)\n", "\n", " def process(self, sess, state):\n", @@ -107,7 +108,7 @@ " summary_dir = os.path.join(summaries_dir, \"summaries_{}\".format(scope))\n", " if not os.path.exists(summary_dir):\n", " os.makedirs(summary_dir)\n", - " self.summary_writer = tf.train.SummaryWriter(summary_dir)\n", + " self.summary_writer = tf.summary.FileWriter(summary_dir)\n", "\n", " def _build_model(self):\n", " \"\"\"\n", @@ -151,11 +152,11 @@ " self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())\n", "\n", " # Summaries for Tensorboard\n", - " self.summaries = tf.merge_summary([\n", - " tf.scalar_summary(\"loss\", self.loss),\n", - " tf.histogram_summary(\"loss_hist\", self.losses),\n", - " tf.histogram_summary(\"q_values_hist\", self.predictions),\n", - " tf.scalar_summary(\"max_q_value\", tf.reduce_max(self.predictions))\n", + " self.summaries = tf.summary.merge([\n", + " tf.summary.scalar(\"loss\", self.loss),\n", + " 
tf.summary.histogram(\"loss_hist\", self.losses),\n", + " tf.summary.histogram(\"q_values_hist\", self.predictions),\n", + " tf.summary.scalar(\"max_q_value\", tf.reduce_max(self.predictions))\n", " ])\n", "\n", "\n", @@ -212,7 +213,7 @@ "sp = StateProcessor()\n", "\n", "with tf.Session() as sess:\n", - " sess.run(tf.initialize_all_variables())\n", + " sess.run(tf.global_variables_initializer())\n", " \n", " # Example observation batch\n", " observation = env.reset()\n", @@ -391,9 +392,10 @@ " pass\n", "\n", " # Record videos\n", - " env.monitor.start(monitor_path,\n", - " resume=True,\n", - " video_callable=lambda count: count % record_video_every == 0)\n", + " env= Monitor(env,\n", + " directory=monitor_path,\n", + " resume=True,\n", + " video_callable=lambda count: count % record_video_every == 0)\n", "\n", " for i_episode in range(num_episodes):\n", "\n", @@ -526,7 +528,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.0" } }, "nbformat": 4, From 10ce5dc7eb242ab4953b6a07d342f8aae9267214 Mon Sep 17 00:00:00 2001 From: Praveen Palanisamy Date: Wed, 1 Nov 2017 00:13:48 -0400 Subject: [PATCH 08/56] Fixed typo --- DQN/dqn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/dqn.py b/DQN/dqn.py index 80466556c..d54d4d1bf 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -1,5 +1,5 @@ import gym -form gym.wrappers import Monitor +from gym.wrappers import Monitor import itertools import numpy as np import os From 60013e507080222c8fa473d77e7808795ea3f4eb Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 16 Nov 2017 13:54:30 +0900 Subject: [PATCH 09/56] Sync function descriptions. Lambda -> gamma (discount factor). Added description of env.nS and env.nA --- DP/Policy Evaluation Solution.ipynb | 34 ++++++++++------------- DP/Policy Evaluation.ipynb | 30 +++++++++------------ DP/Policy Iteration Solution.ipynb | 34 +++++++++++------------ DP/Policy Iteration.ipynb | 38 +++++++++++--------------- DP/Value Iteration Solution.ipynb | 38 +++++++++++--------------- DP/Value Iteration.ipynb | 42 ++++++++++++----------------- 6 files changed, 90 insertions(+), 126 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index a8b949367..9b9b11b49 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -43,9 +41,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. 
\n", + " env.nA is a number of available actions.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -75,9 +75,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", @@ -87,9 +85,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -122,9 +118,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Test: Make sure the evaluated policy is what we expected\n", @@ -144,23 +138,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index 9c3e0cdd3..160ac6dd9 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -29,9 +27,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", @@ -42,6 +38,8 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. 
\n", + " env.nA is a number of available actions.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", " discount_factor: gamma discount factor.\n", " \n", @@ -59,9 +57,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", @@ -71,9 +67,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -107,23 +101,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index cfb68a2e5..8cf4faf78 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -45,9 +43,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", - " theta: We stop evaluation one our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. \n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -91,7 +91,7 @@ " env: The OpenAI envrionment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V). 
\n", @@ -136,9 +136,7 @@ { "cell_type": "code", "execution_count": 64, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -203,9 +201,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Test the value function\n", @@ -225,23 +221,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Iteration.ipynb b/DP/Policy Iteration.ipynb index d67b22505..afd417593 100644 --- a/DP/Policy Iteration.ipynb +++ b/DP/Policy Iteration.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -45,9 +43,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", - " theta: We stop evaluation one our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. \n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -77,9 +77,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", @@ -91,7 +89,7 @@ " env: The OpenAI envrionment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V). 
\n", @@ -113,9 +111,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -180,9 +176,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -216,23 +210,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index 7bc985d15..fb98665f8 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -19,9 +17,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -31,9 +27,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -41,10 +35,12 @@ " Value Iteration Algorithm.\n", " \n", " Args:\n", - " env: OpenAI environment. env.P represents the transition probabilities of the environment.\n", - " theta: Stopping threshold. If the value of all states changes less than theta\n", - " in one iteration we are done.\n", - " discount_factor: lambda time discount factor.\n", + " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. 
\n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function.\n", @@ -99,9 +95,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -176,23 +170,23 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [Root]", + "display_name": "Python 2", "language": "python", - "name": "Python [Root]" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Value Iteration.ipynb b/DP/Value Iteration.ipynb index 6329d12f7..66c902113 100644 --- a/DP/Value Iteration.ipynb +++ b/DP/Value Iteration.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -19,9 +17,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -31,9 +27,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -41,10 +35,12 @@ " Value Iteration Algorithm.\n", " \n", " Args:\n", - " env: OpenAI environment. env.P represents the transition probabilities of the environment.\n", - " theta: Stopping threshold. If the value of all states changes less than theta\n", - " in one iteration we are done.\n", - " discount_factor: lambda time discount factor.\n", + " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of available states. \n", + " env.nA is a number of available actions.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function. 
\n", @@ -61,9 +57,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -128,9 +122,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -155,23 +147,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 430766766f34681460986ae044ecc7d408ddb691 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 21 Nov 2017 15:15:28 +0900 Subject: [PATCH 10/56] Updates function description in DP. Fixed typos in MC. Changed Lambda to Gamma as in the book. --- DP/Policy Evaluation Solution.ipynb | 18 ++++++---- DP/Policy Evaluation.ipynb | 18 ++++++---- DP/Policy Iteration Solution.ipynb | 14 +++++--- DP/Policy Iteration.ipynb | 14 +++++--- DP/Value Iteration Solution.ipynb | 18 ++++++---- DP/Value Iteration.ipynb | 18 ++++++---- ...ith Epsilon-Greedy Policies Solution.ipynb | 30 +++++++--------- ...Control with Epsilon-Greedy Policies.ipynb | 26 +++++--------- MC/MC Prediction Solution.ipynb | 33 +++++++---------- MC/MC Prediction.ipynb | 25 +++++-------- ...eighted Importance Sampling Solution.ipynb | 36 +++++++------------ ...ol with Weighted Importance Sampling.ipynb | 36 +++++++------------ 12 files changed, 133 insertions(+), 153 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index 9b9b11b49..8db76d578 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 53, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -42,10 +44,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -75,7 +77,9 @@ { "cell_type": "code", "execution_count": 56, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", @@ -118,7 +122,9 @@ { "cell_type": "code", "execution_count": 51, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Test: Make sure the evaluated policy is what we expected\n", diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index 160ac6dd9..e4f5f3673 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -27,7 +29,9 @@ { "cell_type": "code", "execution_count": 25, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", @@ -38,10 +42,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -57,7 +61,9 @@ { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index 8cf4faf78..bf6fa631a 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -44,10 +46,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -201,7 +203,9 @@ { "cell_type": "code", "execution_count": 59, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Test the value function\n", diff --git a/DP/Policy Iteration.ipynb b/DP/Policy Iteration.ipynb index afd417593..bdff9deea 100644 --- a/DP/Policy Iteration.ipynb +++ b/DP/Policy Iteration.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -44,10 +46,10 @@ " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -77,7 +79,9 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index fb98665f8..ebd1b5d49 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -17,7 +19,9 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -27,7 +31,9 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -37,10 +43,10 @@ " Args:\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function.\n", diff --git a/DP/Value Iteration.ipynb b/DP/Value Iteration.ipynb index 66c902113..f947fd761 100644 --- a/DP/Value Iteration.ipynb +++ b/DP/Value Iteration.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -17,7 +19,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -27,7 +31,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -37,10 +43,10 @@ " Args:\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", - " env.nS is a number of available states. \n", - " env.nA is a number of available actions.\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function. \n", diff --git a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb index 4484eb2f6..c41cf3b1a 100644 --- a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb @@ -27,9 +27,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -81,14 +79,14 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", " Q is a dictionary mapping state -> action values.\n", - " policy is a function taht takes an observation as an argument and returns\n", + " policy is a function that takes an observation as an argument and returns\n", " action probabilities\n", " \"\"\"\n", " \n", @@ -147,9 +145,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +162,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -213,23 +207,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Control with Epsilon-Greedy Policies.ipynb b/MC/MC Control with Epsilon-Greedy Policies.ipynb index dab7af2ac..7963c8d18 100644 --- a/MC/MC Control with Epsilon-Greedy Policies.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies.ipynb @@ -27,9 +27,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -38,9 +36,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", @@ -79,14 +75,14 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", " Q is a dictionary mapping state -> action values.\n", - " policy is a function taht takes an observation as an argument and returns\n", + " policy is a function that takes an observation as an argument and returns\n", " action probabilities\n", " \"\"\"\n", " \n", @@ -111,9 +107,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)" @@ -122,9 +116,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -162,9 +154,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Prediction Solution.ipynb b/MC/MC Prediction Solution.ipynb index b6cc24f95..7459f6048 100644 --- a/MC/MC Prediction Solution.ipynb +++ b/MC/MC Prediction Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -28,9 +26,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -39,9 +35,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -52,8 +46,8 @@ " Args:\n", " policy: A function that maps an observation to action probabilities.\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A dictionary that maps from state -> value.\n", @@ -106,9 +100,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sample_policy(observation):\n", @@ -123,7 +115,6 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [ @@ -202,23 +193,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Prediction.ipynb b/MC/MC Prediction.ipynb index 17c8cf64f..aff53e747 100644 --- a/MC/MC Prediction.ipynb +++ b/MC/MC Prediction.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -28,9 +26,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" 
@@ -39,9 +35,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -52,8 +46,8 @@ " Args:\n", " policy: A function that maps an observation to action probabilities.\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A dictionary that maps from state -> value.\n", @@ -77,9 +71,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sample_policy(observation):\n", @@ -94,7 +86,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -132,9 +123,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb index 2baf04377..32c7cdaef 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -27,9 +25,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -91,9 +87,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -103,10 +97,10 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", + " num_episodes: Number of episodes to sample.\n", " behavior_policy: The behavior to follow while generating episodes.\n", " A function that given an observation returns a vector of probabilities for each action.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", @@ -171,9 +165,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -191,9 +183,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -238,23 +228,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.0" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb 
b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb index 5cd2b408c..ff3d43a86 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb @@ -27,9 +27,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -63,9 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def create_greedy_policy(Q):\n", @@ -89,9 +85,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -101,10 +95,10 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", + " num_episodes: Number of episodes to sample.\n", " behavior_policy: The behavior to follow while generating episodes.\n", " A function that given an observation returns a vector of probabilities for each action.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", @@ -128,9 +122,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "random_policy = create_random_policy(env.action_space.n)\n", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -166,23 +156,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.0" + "pygments_lexer": "ipython2", + "version": "2.7.12" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 7017f9edc2eda903e75ed7387ffec9f4c20fed30 Mon Sep 17 00:00:00 2001 From: jonahweissman Date: Wed, 22 Nov 2017 11:50:30 -0500 Subject: [PATCH 11/56] Fix links in all the `README.md`s Markdown doesn't allow spaces in links to files, so I replaced the spaces with "%20". Now the links correctly display on GitHub. 
--- DP/README.md | 12 ++++++------ DQN/README.md | 8 ++++---- FA/README.md | 4 ++-- MC/README.md | 14 +++++++------- PolicyGradient/README.md | 6 +++--- README.md | 28 ++++++++++++++-------------- TD/README.md | 8 ++++---- 7 files changed, 40 insertions(+), 40 deletions(-) diff --git a/DP/README.md b/DP/README.md index bdb6fd086..7a7d9389a 100644 --- a/DP/README.md +++ b/DP/README.md @@ -34,13 +34,13 @@ ### Exercises - Implement Policy Evaluation in Python (Gridworld) - - [Exercise](Policy Evaluation.ipynb) - - [Solution](Policy Evaluation Solution.ipynb) + - [Exercise](Policy%20Evaluation.ipynb) + - [Solution](Policy%20Evaluation%20Solution.ipynb) - Implement Policy Iteration in Python (Gridworld) - - [Exercise](Policy Iteration.ipynb) - - [Solution](Policy Iteration Solution.ipynb) + - [Exercise](Policy%20Iteration.ipynb) + - [Solution](Policy%20Iteration%20Solution.ipynb) - Implement Value Iteration in Python (Gridworld) - - [Exercise](Value Iteration.ipynb) - - [Solution](Value Iteration Solution.ipynb) + - [Exercise](Value%20Iteration.ipynb) + - [Solution](Value%20Iteration%20Solution.ipynb) diff --git a/DQN/README.md b/DQN/README.md index 1528b3d0a..eedbbd894 100644 --- a/DQN/README.md +++ b/DQN/README.md @@ -39,11 +39,11 @@ ### Exercises -- [OpenAI Gym Atari Environment Playground](Breakout Playground.ipynb) +- [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) - Deep-Q Learning for Atari Games - - [Exercise](Deep Q Learning.ipynb) - - [Solution](Deep Q Learning Solution.ipynb) + - [Exercise](Deep%20Q%20Learning.ipynb) + - [Solution](Deep%20Q%20Learning%20Solution.ipynb) - Double-Q Learning - This is a minimal change to Q-Learning so use the same exercise as above - - [Solution](Double DQN Solution.ipynb) + - [Solution](Double%20DQN%20Solution.ipynb) - Prioritized Experience Replay (WIP) diff --git a/FA/README.md b/FA/README.md index 9eb97101f..fb6dd111a 100644 --- a/FA/README.md +++ b/FA/README.md @@ -36,5 +36,5 @@ ### Exercises - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation - - [Exercise](Q-Learning with Value Function Approximation.ipynb) - - [Solution](Q-Learning with Value Function Approximation Solution.ipynb) + - [Exercise](Q-Learning%20with%20Value%20Function%20Approximation.ipynb) + - [Solution](Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) diff --git a/MC/README.md b/MC/README.md index 5ed660915..835789227 100644 --- a/MC/README.md +++ b/MC/README.md @@ -37,13 +37,13 @@ ### Exercises -- [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack Playground.ipynb) +- [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) - Implement the Monte Carlo Prediction to estimate state-action values - - [Exercise](MC Prediction.ipynb) - - [Solution](MC Prediction Solution.ipynb) + - [Exercise](MC%20Prediction.ipynb) + - [Solution](MC%20Prediction%20Solution.ipynb) - Implement the on-policy first-visit Monte Carlo Control algorithm - - [Exercise](MC Control with Epsilon-Greedy Policies.ipynb) - - [Solution](MC Control with Epsilon-Greedy Policies Solution.ipynb) + - [Exercise](MC%20Control%20with%20Epsilon-Greedy%20Policies.ipynb) + - [Solution](MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) - Implement the off-policy every-visit Monte Carlo Control using Weighted Important Sampling algorithm - - [Exercise](Off-Policy MC Control with Weighted Importance Sampling.ipynb) - - [Solution](Off-Policy MC Control with Weighted Importance Sampling 
Solution.ipynb) \ No newline at end of file + - [Exercise](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling.ipynb) + - [Solution](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 4921e0cd6..3094fb332 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -50,13 +50,13 @@ - REINFORCE with Baseline - Exercise - - [Solution](CliffWalk REINFORCE with Baseline Solution.ipynb) + - [Solution](CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) - Actor-Critic with Baseline - Exercise - - [Solution](CliffWalk Actor-Critic Solution.ipynb) + - [Solution](CliffWalk%20Actor-Critic%20Solution.ipynb) - Actor-Critic with Baseline for Continuous Action Spaces - Exercise - - [Solution](Continuous MountainCar Actor-Critic Solution.ipynb) + - [Solution](Continuous%20MountainCar%20Actor-Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (WIP) - Asynchronous Advantage Actor-Critic (A3C) diff --git a/README.md b/README.md index fd2e42323..ad2abe1d3 100644 --- a/README.md +++ b/README.md @@ -26,21 +26,21 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https ### List of Implemented Algorithms -- [Dynamic Programming Policy Evaluation](DP/Policy Evaluation Solution.ipynb) -- [Dynamic Programming Policy Iteration](DP/Policy Iteration Solution.ipynb) -- [Dynamic Programming Value Iteration](DP/Value Iteration Solution.ipynb) -- [Monte Carlo Prediction](MC/MC Prediction Solution.ipynb) -- [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC Control with Epsilon-Greedy Policies Solution.ipynb) -- [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb) -- [SARSA (On Policy TD Learning)](TD/SARSA Solution.ipynb) -- [Q-Learning (Off Policy TD Learning)](TD/Q-Learning Solution.ipynb) -- [Q-Learning with Linear Function Approximation](FA/Q-Learning with Value Function Approximation Solution.ipynb) -- [Deep Q-Learning for Atari Games](DQN/Deep Q Learning Solution.ipynb) -- [Double Deep-Q Learning for Atari Games](DQN/Double DQN Solution.ipynb) +- [Dynamic Programming Policy Evaluation](DP/Policy%20Evaluation%20Solution.ipynb) +- [Dynamic Programming Policy Iteration](DP/Policy%20Iteration%20Solution.ipynb) +- [Dynamic Programming Value Iteration](DP/Value%20Iteration%20Solution.ipynb) +- [Monte Carlo Prediction](MC/MC%20Prediction%20Solution.ipynb) +- [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) +- [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) +- [SARSA (On Policy TD Learning)](TD/SARSA%20Solution.ipynb) +- [Q-Learning (Off Policy TD Learning)](TD/Q-Learning%20Solution.ipynb) +- [Q-Learning with Linear Function Approximation](FA/Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) +- [Deep Q-Learning for Atari Games](DQN/Deep%20Q%20Learning%20Solution.ipynb) +- [Double Deep-Q Learning for Atari Games](DQN/Double%20DQN%20Solution.ipynb) - Deep Q-Learning with Prioritized Experience Replay (WIP) -- [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb) -- [Policy Gradient: Actor Critic with Baseline](PolicyGradient/CliffWalk Actor Critic 
Solution.ipynb) -- [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb) +- [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) +- [Policy Gradient: Actor Critic with Baseline](PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb) +- [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (DDPG) (WIP) - [Asynchronous Advantage Actor Critic (A3C)](PolicyGradient/a3c) diff --git a/TD/README.md b/TD/README.md index 9044704d1..f0b26aa50 100644 --- a/TD/README.md +++ b/TD/README.md @@ -40,11 +40,11 @@ ### Exercises -- [Windy Gridworld Playground](Windy Gridworld Playground.ipynb) +- [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) - Implement SARSA - [Exercise](SARSA.ipynb) - - [Solution](SARSA Solution.ipynb) -- [Cliff Environment Playground](Cliff Environment Playground.ipynb) + - [Solution](SARSA%20Solution.ipynb) +- [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) - Implement Q-Learning in Python - [Exercise](Q-Learning.ipynb) - - [Solution](Q-Learning Solution.ipynb) \ No newline at end of file + - [Solution](Q-Learning%20Solution.ipynb) From da612e5eddc00468bb1894d119d22a8c37566241 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 23 Nov 2017 09:23:51 +0900 Subject: [PATCH 12/56] Change kernel to python3 --- DP/Policy Evaluation Solution.ipynb | 10 +++---- DP/Policy Evaluation.ipynb | 10 +++---- DP/Policy Iteration Solution.ipynb | 10 +++---- DP/Policy Iteration.ipynb | 10 +++---- DP/Value Iteration Solution.ipynb | 10 +++---- DP/Value Iteration.ipynb | 10 +++---- MC/Blackjack Playground.ipynb | 12 +++----- ...ith Epsilon-Greedy Policies Solution.ipynb | 14 +++++---- ...Control with Epsilon-Greedy Policies.ipynb | 26 ++++++++++------ MC/MC Prediction Solution.ipynb | 26 ++++++++++------ MC/MC Prediction.ipynb | 27 +++++++++++------ ...eighted Importance Sampling Solution.ipynb | 22 +++++++++----- ...ol with Weighted Importance Sampling.ipynb | 30 ++++++++++++------- 13 files changed, 128 insertions(+), 89 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index 8db76d578..703e020fb 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -144,21 +144,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index e4f5f3673..381a58260 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -107,21 +107,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", 
"name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index bf6fa631a..be7d3710e 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -225,21 +225,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Policy Iteration.ipynb b/DP/Policy Iteration.ipynb index bdff9deea..fc87f291b 100644 --- a/DP/Policy Iteration.ipynb +++ b/DP/Policy Iteration.ipynb @@ -214,21 +214,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index ebd1b5d49..cd0da629f 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -176,21 +176,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DP/Value Iteration.ipynb b/DP/Value Iteration.ipynb index f947fd761..ff4bf15dd 100644 --- a/DP/Value Iteration.ipynb +++ b/DP/Value Iteration.ipynb @@ -153,21 +153,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/Blackjack Playground.ipynb b/MC/Blackjack Playground.ipynb index cbb6c40c8..28dfc1867 100644 --- a/MC/Blackjack Playground.ipynb +++ b/MC/Blackjack Playground.ipynb @@ -18,9 +18,7 @@ { "cell_type": "code", "execution_count": 420, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -29,9 +27,7 @@ { "cell_type": "code", "execution_count": 422, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -219,9 +215,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb index c41cf3b1a..0f10d783e 100644 --- a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb @@ -27,7 +27,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -207,21 +209,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/MC Control with Epsilon-Greedy Policies.ipynb b/MC/MC Control with Epsilon-Greedy Policies.ipynb index 7963c8d18..257a84b44 100644 --- a/MC/MC Control with Epsilon-Greedy Policies.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies.ipynb @@ -27,7 +27,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -36,7 +38,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", @@ -107,7 +111,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)" @@ -116,7 +122,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -140,21 +148,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/MC Prediction Solution.ipynb b/MC/MC Prediction Solution.ipynb index 7459f6048..25da5f3ca 100644 --- a/MC/MC Prediction Solution.ipynb +++ b/MC/MC Prediction Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "%matplotlib inline\n", @@ -26,7 +28,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -35,7 +39,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -100,7 +106,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": true + }, 
"outputs": [], "source": [ "def sample_policy(observation):\n", @@ -193,21 +201,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/MC Prediction.ipynb b/MC/MC Prediction.ipynb index aff53e747..472f9ef35 100644 --- a/MC/MC Prediction.ipynb +++ b/MC/MC Prediction.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "%matplotlib inline\n", @@ -26,7 +28,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -35,7 +39,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", @@ -71,7 +77,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def sample_policy(observation):\n", @@ -86,6 +94,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, "scrolled": false }, "outputs": [], @@ -109,21 +118,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb index 32c7cdaef..41dad0fe6 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "%matplotlib inline\n", @@ -25,7 +27,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -87,7 +91,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -228,21 +234,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, 
"nbformat": 4, diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb index ff3d43a86..b93408711 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb @@ -27,7 +27,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -61,7 +63,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def create_greedy_policy(Q):\n", @@ -85,7 +89,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", @@ -122,7 +128,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "random_policy = create_random_policy(env.action_space.n)\n", @@ -132,7 +140,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For plotting: Create value function from action-value function\n", @@ -156,21 +166,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, From 79cadc0ddb885787852bc2ade62641f8ae9ee909 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 24 Nov 2017 16:47:26 +0900 Subject: [PATCH 13/56] Lambda to Gamma. Updated Readme. 
--- MC/README.md | 2 +- TD/Q-Learning Solution.ipynb | 26 ++++++++------------------ TD/Q-Learning.ipynb | 26 ++++++++------------------ TD/README.md | 4 ++-- TD/SARSA Solution.ipynb | 30 +++++++++--------------------- TD/SARSA.ipynb | 30 +++++++++--------------------- 6 files changed, 37 insertions(+), 81 deletions(-) diff --git a/MC/README.md b/MC/README.md index 835789227..2c1a512d7 100644 --- a/MC/README.md +++ b/MC/README.md @@ -37,7 +37,7 @@ ### Exercises -- [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) +- Get familiar with the [Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) - Implement the Monte Carlo Prediction to estimate state-action values - [Exercise](MC%20Prediction.ipynb) - [Solution](MC%20Prediction%20Solution.ipynb) diff --git a/TD/Q-Learning Solution.ipynb b/TD/Q-Learning Solution.ipynb index 5794e20de..4c1c5be2c 100644 --- a/TD/Q-Learning Solution.ipynb +++ b/TD/Q-Learning Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -31,9 +29,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -73,9 +69,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -86,7 +80,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", " \n", @@ -147,9 +141,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +158,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -231,9 +221,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/Q-Learning.ipynb b/TD/Q-Learning.ipynb index 724d682ad..4e1396cf6 100644 --- a/TD/Q-Learning.ipynb +++ b/TD/Q-Learning.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -72,9 +68,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -85,7 +79,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", @@ -121,9 +115,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -205,9 +195,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/README.md b/TD/README.md index f0b26aa50..ac2488167 100644 --- a/TD/README.md +++ b/TD/README.md @@ -40,11 +40,11 @@ ### Exercises -- [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) +- Get familiar with the [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) - Implement SARSA - [Exercise](SARSA.ipynb) - [Solution](SARSA%20Solution.ipynb) -- [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) +- Get familiar with the [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) - Implement Q-Learning in Python - [Exercise](Q-Learning.ipynb) - [Solution](Q-Learning%20Solution.ipynb) diff --git a/TD/SARSA Solution.ipynb b/TD/SARSA Solution.ipynb index feab3db02..df647f193 100644 --- a/TD/SARSA Solution.ipynb +++ b/TD/SARSA Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -39,9 +37,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = WindyGridworldEnv()" @@ -81,9 +77,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -93,7 +87,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", @@ -156,9 +150,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -175,9 +167,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -217,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [] } @@ -240,9 +228,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/SARSA.ipynb b/TD/SARSA.ipynb index 799915352..8a0344410 100644 --- a/TD/SARSA.ipynb +++ b/TD/SARSA.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = WindyGridworldEnv()" @@ -72,9 +68,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -84,7 +78,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", " \n", @@ -121,9 +115,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -182,9 +172,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [] } @@ -205,9 +193,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 3fce6b57c93ae505fd7990bad63c57cee4f9a6c1 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 1 Dec 2017 15:38:01 +0900 Subject: [PATCH 14/56] Updated Readme. 
Changed Lambda to Gamma --- ...alue Function Approximation Solution.ipynb | 40 ++++++------------- ...ng with Value Function Approximation.ipynb | 30 +++++--------- FA/README.md | 2 + 3 files changed, 25 insertions(+), 47 deletions(-) diff --git a/FA/Q-Learning with Value Function Approximation Solution.ipynb b/FA/Q-Learning with Value Function Approximation Solution.ipynb index a271d6a63..49c62ca37 100644 --- a/FA/Q-Learning with Value Function Approximation Solution.ipynb +++ b/FA/Q-Learning with Value Function Approximation Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -31,9 +29,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -50,9 +46,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -74,7 +68,7 @@ "scaler = sklearn.preprocessing.StandardScaler()\n", "scaler.fit(observation_examples)\n", "\n", - "# Used to converte a state to a featurizes represenation.\n", + "# Used to convert a state to a featurizes represenation.\n", "# We use RBF kernels with different variances to cover different parts of the space\n", "featurizer = sklearn.pipeline.FeatureUnion([\n", " (\"rbf1\", RBFSampler(gamma=5.0, n_components=100)),\n", @@ -88,9 +82,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -151,9 +143,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def make_epsilon_greedy_policy(estimator, epsilon, nA):\n", @@ -182,9 +172,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):\n", @@ -196,7 +184,7 @@ " env: OpenAI environment.\n", " estimator: Action-Value function estimator\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " epsilon_decay: Each episode, epsilon is decayed by this factor\n", " \n", @@ -283,9 +271,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -305,9 +291,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -384,9 +368,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/FA/Q-Learning with Value Function Approximation.ipynb b/FA/Q-Learning with Value Function Approximation.ipynb index e83b6bbb0..442605562 100644 --- a/FA/Q-Learning with Value Function Approximation.ipynb +++ b/FA/Q-Learning with Value Function Approximation.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -50,9 +48,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -89,7 +85,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -149,7 +145,7 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -180,7 +176,7 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -193,7 +189,7 @@ " env: OpenAI environment.\n", " estimator: Action-Value function estimator\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " epsilon_decay: Each episode, epsilon is decayed by this factor\n", " \n", @@ -237,9 +233,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -259,9 +253,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -326,9 +318,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/FA/README.md b/FA/README.md index fb6dd111a..579498c85 100644 --- a/FA/README.md +++ b/FA/README.md @@ -35,6 +35,8 @@ ### Exercises +- Get familiar with the [Mountain Car Playground](MountainCar%20Playground.ipynb) + - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation - [Exercise](Q-Learning%20with%20Value%20Function%20Approximation.ipynb) - [Solution](Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) From 152dbc414cfd70d67aff46241c3fc69887256c8b Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 6 Dec 2017 17:15:13 +0900 Subject: [PATCH 15/56] Updated link to Sutton's book --- DP/README.md | 2 +- FA/README.md | 4 ++-- Introduction/README.md | 2 +- MC/README.md | 2 +- MDP/README.md | 2 +- PolicyGradient/README.md | 2 +- README.md | 2 +- TD/README.md | 6 +++--- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/DP/README.md b/DP/README.md index 7a7d9389a..1c2bb768b 100644 --- a/DP/README.md +++ b/DP/README.md @@ -28,7 +28,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 4: Dynamic Programming +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 4: Dynamic Programming ### Exercises diff --git a/FA/README.md b/FA/README.md index 579498c85..f50f56cef 100644 --- a/FA/README.md +++ b/FA/README.md @@ -25,8 +25,8 @@ **Required:** - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 9: On-policy Prediction with Approximation -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 10: On-policy Control with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 9: On-policy Prediction with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 10: On-policy Control with Approximation **Optional:** diff --git a/Introduction/README.md b/Introduction/README.md index f476fabb9..9e5b383ac 100644 --- a/Introduction/README.md +++ b/Introduction/README.md @@ -17,7 +17,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 1: The Reinforcement Learning Problem +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 1: The Reinforcement Learning Problem - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), 
[slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) - [OpenAI Gym Tutorial](https://gym.openai.com/docs) diff --git a/MC/README.md b/MC/README.md index 2c1a512d7..9d23968c2 100644 --- a/MC/README.md +++ b/MC/README.md @@ -26,7 +26,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 5: Monte Carlo Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 5: Monte Carlo Methods **Optional:** diff --git a/MDP/README.md b/MDP/README.md index 404cb141b..539799a09 100644 --- a/MDP/README.md +++ b/MDP/README.md @@ -25,7 +25,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 3: Finite Markov Decision Processes +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 3: Finite Markov Decision Processes - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 3094fb332..1e7a1c68d 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 11: Policy Gradient Methods (Under Construction) +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 11: Policy Gradient Methods (Under Construction) - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) diff --git a/README.md b/README.md index ad2abe1d3..60974e0dd 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https Textbooks: -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) Classes: diff --git a/TD/README.md b/TD/README.md index ac2488167..b54bfead8 100644 --- a/TD/README.md +++ b/TD/README.md @@ -28,14 +28,14 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 6: Temporal-Difference Learning +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 6: Temporal-Difference Learning - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 7: Multi-Step Bootstrapping -- [Reinforcement Learning: An 
Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 12: Eligibility Traces +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 7: Multi-Step Bootstrapping +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 12: Eligibility Traces ### Exercises From 9ee6cdd8494ff529df270d6d07658abbec0d62aa Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 6 Dec 2017 17:16:45 +0900 Subject: [PATCH 16/56] Updated link to Sutton's book --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60974e0dd..43a7be82a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. From dee1e01b6e4ed7cbd90ed603a0bf6ccb396fdcc4 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 7 Dec 2017 15:23:25 +0900 Subject: [PATCH 17/56] DQN: Fixed typos. Changed labmda to gamma. Updated Readme --- DQN/Deep Q Learning Solution.ipynb | 32 +++++++++++++++++-------- DQN/Deep Q Learning.ipynb | 38 ++++++++++-------------------- DQN/Double DQN Solution.ipynb | 38 ++++++++++-------------------- DQN/README.md | 2 +- DQN/dqn.py | 6 ++--- 5 files changed, 50 insertions(+), 66 deletions(-) diff --git a/DQN/Deep Q Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index 7cf615137..1477005ef 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -30,7 +30,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -39,7 +41,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -56,7 +60,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -83,7 +87,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "class Estimator():\n", @@ -193,7 +199,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# For Testing....\n", @@ -295,7 +303,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -315,7 +325,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -331,7 +341,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -494,7 +504,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -569,7 +581,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index c3210d2ad..29631ce0a 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -29,9 +29,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -40,9 +38,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -59,7 +55,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -86,9 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -199,9 +193,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -234,9 +226,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def copy_model_parameters(sess, estimator1, estimator2):\n", @@ -294,9 +284,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -316,7 +304,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -332,7 +320,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -469,9 +457,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -528,9 +514,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 22bb9ebc9..7d8411fdd 100644 --- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -28,9 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -39,9 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -58,7 +54,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -85,9 +81,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -175,9 +169,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -210,9 +202,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def copy_model_parameters(sess, estimator1, estimator2):\n", @@ -270,9 +260,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -292,7 +280,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -308,7 +296,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -472,9 +460,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -531,9 +517,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/README.md b/DQN/README.md index eedbbd894..7d0464727 100644 --- a/DQN/README.md +++ b/DQN/README.md @@ -39,7 +39,7 @@ ### Exercises -- [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) +- Get familiar with the [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) - Deep-Q Learning for Atari Games - [Exercise](Deep%20Q%20Learning.ipynb) - [Solution](Deep%20Q%20Learning%20Solution.ipynb) diff --git a/DQN/dqn.py b/DQN/dqn.py index d54d4d1bf..be43ec08b 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -20,7 +20,7 @@ class StateProcessor(): """ - Processes a raw Atari iamges. Resizes it and converts it to grayscale. + Processes a raw Atari images. Resizes it and converts it to grayscale. """ def __init__(self): # Build the Tensorflow graph @@ -208,7 +208,7 @@ def deep_q_learning(sess, batch_size=32, record_video_every=50): """ - Q-Learning algorithm for fff-policy TD control using Function Approximation. + Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: @@ -224,7 +224,7 @@ def deep_q_learning(sess, the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps - discount_factor: Lambda time discount factor + discount_factor: Gamma discount factor epsilon_start: Chance to sample a random action when taking an action. 
Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done From 85565ec067d2856f7d2ac033badd943f035adbf3 Mon Sep 17 00:00:00 2001 From: Alex Bailo Date: Wed, 27 Dec 2017 15:13:47 +0900 Subject: [PATCH 18/56] "Policy Gradient Methods" is chapter 13 now --- PolicyGradient/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 1e7a1c68d..dc534c914 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 11: Policy Gradient Methods (Under Construction) +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods (Under Construction) - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) From f637c42976275d5acd27f4a03779b2e1ddcf8a1a Mon Sep 17 00:00:00 2001 From: Alex Bailo Date: Wed, 27 Dec 2017 15:41:46 +0900 Subject: [PATCH 19/56] "Policy Gradient Methods" chapter is completed. Updated OpenAI Gym link with cached version. --- PolicyGradient/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index dc534c914..373bcee95 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,13 +36,13 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods (Under Construction) +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) - [Deep Deterministic Policy Gradients in TensorFlow](http://pemami4911.github.io/blog_posts/2016/08/21/ddpg-rl.html) - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) -- [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](https://gym.openai.com/docs/rl#policy-gradients) +- [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](http://web.archive.org/web/20161029135055/https://gym.openai.com/docs/rl#id16) From 1f2e2eb50a36655c9a17da8dad7b533d72333c5e Mon Sep 17 00:00:00 2001 From: Alex Bailo Date: Wed, 27 Dec 2017 17:18:09 +0900 Subject: [PATCH 20/56] Fixed broken links to Solutions in PolicyGradient --- PolicyGradient/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 373bcee95..8d77199fb 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -53,10 +53,10 @@ - [Solution](CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) - Actor-Critic with Baseline - Exercise - - [Solution](CliffWalk%20Actor-Critic%20Solution.ipynb) + - [Solution](CliffWalk%20Actor%20Critic%20Solution.ipynb) - Actor-Critic 
with Baseline for Continuous Action Spaces - Exercise - - [Solution](Continuous%20MountainCar%20Actor-Critic%20Solution.ipynb) + - [Solution](Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (WIP) - Asynchronous Advantage Actor-Critic (A3C) From 783c2c39a3a94df40b5a62ab768e1570f95c776b Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 28 Dec 2017 10:16:48 +0900 Subject: [PATCH 21/56] Mod. estimator_value comment in actor-critic --- .../CliffWalk Actor Critic Solution.ipynb | 26 ++++++------------- ...us MountainCar Actor Critic Solution.ipynb | 8 +++--- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb index 0e952a07c..0a8fb509e 100644 --- a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb +++ b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -29,9 +27,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -88,9 +84,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class ValueEstimator():\n", @@ -145,7 +139,7 @@ " Args:\n", " env: OpenAI environment.\n", " estimator_policy: Policy Function to be optimized \n", - " estimator_value: Value function approximator, used as a baseline\n", + " estimator_value: Value function approximator, used as a critic\n", " num_episodes: Number of episodes to run for\n", " discount_factor: Time-discount factor\n", " \n", @@ -209,9 +203,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -238,9 +230,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -306,9 +296,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.0" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb index 4cbc43d27..6b34a0b62 100644 --- a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb +++ b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb @@ -233,7 +233,7 @@ " Args:\n", " env: OpenAI environment.\n", " estimator_policy: Policy Function to be optimized \n", - " estimator_value: Value function approximator, used as a baseline\n", + " estimator_value: Value function approximator, used as a critic\n", " num_episodes: Number of episodes to run for\n", " discount_factor: Time-discount factor\n", " \n", @@ -343,7 +343,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "plotting.plot_episode_stats(stats, smoothing_window=10)" @@ -384,7 +386,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.5.2" } }, "nbformat": 4, From d8136b4c575001a6ae4ff67f4362ced9c6cf8846 Mon Sep 17 00:00:00 2001 From: 
Alex Date: Wed, 3 Jan 2018 14:58:39 +0900 Subject: [PATCH 22/56] Updated links to new version of Sutton's book --- DP/README.md | 2 +- FA/README.md | 4 ++-- Introduction/README.md | 2 +- MC/README.md | 2 +- MDP/README.md | 2 +- PolicyGradient/README.md | 2 +- README.md | 4 ++-- TD/README.md | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/DP/README.md b/DP/README.md index 1c2bb768b..cf2cbf51f 100644 --- a/DP/README.md +++ b/DP/README.md @@ -28,7 +28,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 4: Dynamic Programming +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 4: Dynamic Programming ### Exercises diff --git a/FA/README.md b/FA/README.md index f50f56cef..247c41e4e 100644 --- a/FA/README.md +++ b/FA/README.md @@ -25,8 +25,8 @@ **Required:** - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 9: On-policy Prediction with Approximation -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 10: On-policy Control with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 9: On-policy Prediction with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 10: On-policy Control with Approximation **Optional:** diff --git a/Introduction/README.md b/Introduction/README.md index 9e5b383ac..cd27a4e12 100644 --- a/Introduction/README.md +++ b/Introduction/README.md @@ -17,7 +17,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 1: The Reinforcement Learning Problem +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 1: The Reinforcement Learning Problem - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) - [OpenAI Gym Tutorial](https://gym.openai.com/docs) diff --git a/MC/README.md b/MC/README.md index 9d23968c2..7b889ed6f 100644 --- a/MC/README.md +++ b/MC/README.md @@ -26,7 +26,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 5: Monte Carlo Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 5: Monte Carlo Methods **Optional:** diff --git a/MDP/README.md b/MDP/README.md index 539799a09..de9bcce35 100644 --- a/MDP/README.md +++ b/MDP/README.md @@ -25,7 +25,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 3: Finite Markov Decision Processes +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 3: Finite Markov Decision Processes - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), 
[slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 8d77199fb..a7dffdeef 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 13: Policy Gradient Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 13: Policy Gradient Methods - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) diff --git a/README.md b/README.md index 43a7be82a..72a11e5a9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. @@ -50,7 +50,7 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https Textbooks: -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) Classes: diff --git a/TD/README.md b/TD/README.md index b54bfead8..a4c35a0e9 100644 --- a/TD/README.md +++ b/TD/README.md @@ -28,14 +28,14 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 6: Temporal-Difference Learning +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 6: Temporal-Difference Learning - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 7: Multi-Step Bootstrapping -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2017nov5.pdf) - Chapter 12: Eligibility Traces +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 7: Multi-Step Bootstrapping +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 12: Eligibility Traces ### Exercises From 30326df0cf66d649c1619ce2e3134fc2839dcde9 Mon Sep 17 00:00:00 
2001 From: Keith Gould Date: Wed, 24 Jan 2018 10:03:44 -0500 Subject: [PATCH 23/56] update value estimator only after calculating advantage --- .../CliffWalk REINFORCE with Baseline Solution.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb index 4291d5551..cad46261d 100644 --- a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb +++ b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb @@ -196,11 +196,11 @@ " for t, transition in enumerate(episode):\n", " # The return after this timestep\n", " total_return = sum(discount_factor**i * t.reward for i, t in enumerate(episode[t:]))\n", - " # Update our value estimator\n", - " estimator_value.update(transition.state, total_return)\n", " # Calculate baseline/advantage\n", " baseline_value = estimator_value.predict(transition.state) \n", " advantage = total_return - baseline_value\n", + " # Update our value estimator\n", + " estimator_value.update(transition.state, total_return)\n", " # Update our policy estimator\n", " estimator_policy.update(transition.state, advantage, transition.action)\n", " \n", From 9454010f60a87a6e66a517fdb038365b9988146c Mon Sep 17 00:00:00 2001 From: Byzantine Date: Sun, 28 Jan 2018 12:30:06 -0800 Subject: [PATCH 24/56] Minor fix: sync sample policy with the solution --- MC/MC Prediction.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MC/MC Prediction.ipynb b/MC/MC Prediction.ipynb index 472f9ef35..13b3da809 100644 --- a/MC/MC Prediction.ipynb +++ b/MC/MC Prediction.ipynb @@ -87,7 +87,7 @@ " A policy that sticks if the player score is > 20 and hits otherwise.\n", " \"\"\"\n", " score, dealer_score, usable_ace = observation\n", - " return np.array([1.0, 0.0]) if score >= 20 else np.array([0.0, 1.0])" + " return 0 if score >= 20 else 1" ] }, { From 6211e2df03162f367c7a8c05728897385397ae35 Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 10:45:30 -0500 Subject: [PATCH 25/56] Add one step lookahead function for easy comparison with Value Iteration --- DP/Policy Evaluation Solution.ipynb | 42 ++++++-------- DP/Policy Iteration Solution.ipynb | 89 +++++++++++++++-------------- 2 files changed, 62 insertions(+), 69 deletions(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index 703e020fb..d69fe2546 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -2,12 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ + "from IPython.core.debugger import set_trace\n", "import numpy as np\n", "import pprint\n", "import sys\n", @@ -18,10 +17,8 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +27,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", @@ -76,10 +71,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "random_policy = 
np.ones([env.nS, env.nA]) / env.nA\n", @@ -88,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -98,7 +91,8 @@ "Value Function:\n", "[ 0. -13.99993529 -19.99990698 -21.99989761 -13.99993529\n", " -17.9999206 -19.99991379 -19.99991477 -19.99990698 -19.99991379\n", - " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569 0. ]\n", + " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569\n", + " 0. ]\n", "\n", "Reshaped Grid Value Function:\n", "[[ 0. -13.99993529 -19.99990698 -21.99989761]\n", @@ -121,10 +115,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "# Test: Make sure the evaluated policy is what we expected\n", @@ -135,9 +127,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -158,7 +148,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index be7d3710e..dc121c8c5 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -19,9 +17,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +26,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "# Taken from Policy Evaluation Exercise!\n", @@ -78,10 +72,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", @@ -102,6 +94,24 @@ " V is the value function for the optimal policy.\n", " \n", " \"\"\"\n", + "\n", + " def one_step_lookahead(state, V):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " state: The state to consider (int)\n", + " V: The value to use as an estimator, Vector of length env.nS\n", + " \n", + " Returns:\n", + " A vector of length env.nA containing the expected value of each action.\n", + " \"\"\"\n", + " A = np.zeros(env.nA)\n", + " for a in range(env.nA):\n", + " for prob, next_state, reward, done in env.P[state][a]:\n", + " A[a] += prob * (reward + discount_factor * V[next_state])\n", + " return A\n", + " \n", " # Start with a random policy\n", " policy = np.ones([env.nS, env.nA]) / env.nA\n", " \n", @@ -119,10 +129,7 @@ " \n", " # Find the best action by one-step lookahead\n", " # Ties are resolved arbitarily\n", - " action_values = np.zeros(env.nA)\n", - " for a in range(env.nA):\n", - " for prob, next_state, reward, done in env.P[s][a]:\n", - " action_values[a] += prob * (reward + discount_factor * V[next_state])\n", + " action_values = one_step_lookahead(s, V)\n", " best_a = np.argmax(action_values)\n", " \n", " # Greedily update the policy\n", @@ -137,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 64, + 
"execution_count": 5, "metadata": {}, "outputs": [ { @@ -145,22 +152,22 @@ "output_type": "stream", "text": [ "Policy Probability Distribution:\n", - "[[ 1. 0. 0. 0.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 1. 0. 0. 0.]]\n", + "[[1. 0. 0. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [1. 0. 0. 0.]]\n", "\n", "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", "[[0 3 3 2]\n", @@ -202,10 +209,8 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "# Test the value function\n", @@ -216,9 +221,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -239,7 +242,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, From e030ecfe1e980189106d6d58bb69967819b78a7d Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 10:45:54 -0500 Subject: [PATCH 26/56] Add value check assertion --- DP/Value Iteration Solution.ipynb | 69 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index cd0da629f..c7134dff3 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -18,10 +16,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +26,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -100,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -108,22 +102,22 @@ "output_type": "stream", "text": [ "Policy Probability Distribution:\n", - "[[ 1. 0. 0. 0.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 1. 0. 0. 0.]]\n", + "[[1. 0. 0. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 
0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [1. 0. 0. 0.]]\n", "\n", "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", "[[0 3 3 2]\n", @@ -163,12 +157,21 @@ "print(\"\")" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Test the value function\n", + "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", + "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -190,7 +193,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, From edcba6b8790dbfda151c319378d48ee349a0f4ac Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 14:20:31 -0500 Subject: [PATCH 27/56] Fix step and reset NotImplementedError --- MC/Blackjack Playground.ipynb | 171 ++++++---------------------------- lib/envs/blackjack.py | 8 +- 2 files changed, 34 insertions(+), 145 deletions(-) diff --git a/MC/Blackjack Playground.ipynb b/MC/Blackjack Playground.ipynb index 28dfc1867..f4f6ffe84 100644 --- a/MC/Blackjack Playground.ipynb +++ b/MC/Blackjack Playground.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 419, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -17,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 420, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -26,151 +24,29 @@ }, { "cell_type": "code", - "execution_count": 422, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Player Score: 17 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 18 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 6 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 26 (Usable Ace: False), Dealer Score: 9\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 6\n", - "Taking action: Hit\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", - "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 17 (Usable Ace: True), Dealer Score: 8\n", - "Taking action: Hit\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", - "Taking action: Hit\n", - "Player Score: 22 (Usable Ace: False), Dealer Score: 8\n", - "Game end. Reward: -1.0\n", - "\n", "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", - "Taking action: Hit\n", - "Player Score: 27 (Usable Ace: False), Dealer Score: 8\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", - "Game end. 
Reward: -1.0\n", - "\n", - "Player Score: 13 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 7\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Hit\n", - "Player Score: 25 (Usable Ace: False), Dealer Score: 5\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 12 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", - "Game end. Reward: 0.0\n", - "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", - "Taking action: Hit\n", - "Player Score: 22 (Usable Ace: False), Dealer Score: 4\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: 0.0\n", - "\n", - "Player Score: 4 (Usable Ace: False), Dealer Score: 3\n", - "Taking action: Hit\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 3\n", - "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 3\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: 1.0\n", - "\n", - "Player Score: 9 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Hit\n", - "Player Score: 15 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Hit\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", - "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", - "Game end. 
Reward: 1.0\n", - "\n", - "Player Score: 11 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 13 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 29 (Usable Ace: False), Dealer Score: 9\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 29 (Usable Ace: False), Dealer Score: 7\n", - "Game end. Reward: -1.0\n", - "\n" + "Taking action: Hit\n" + ] + }, + { + "ename": "RecursionError", + "evalue": "maximum recursion depth exceeded", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRecursionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstrategy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Taking action: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"Stick\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Hit\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mobservation\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mprint_observation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "... 
last 1 frames repeated, from the frame below ...\n", + "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded" ] } ], @@ -197,6 +73,13 @@ " print(\"Game end. Reward: {}\\n\".format(float(reward)))\n", " break" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -215,7 +98,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" } }, "nbformat": 4, diff --git a/lib/envs/blackjack.py b/lib/envs/blackjack.py index 158c49709..9052b4677 100644 --- a/lib/envs/blackjack.py +++ b/lib/envs/blackjack.py @@ -79,6 +79,12 @@ def __init__(self, natural=False): self._reset() # Number of self.nA = 2 + def reset(self): + return self._reset() + + def step(self, action): + return self._step(action) + def _seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] @@ -113,4 +119,4 @@ def _reset(self): while sum_hand(self.player) < 12: self.player.append(draw_card(self.np_random)) - return self._get_obs() \ No newline at end of file + return self._get_obs() From ba12f971f316078adc20ae898ff8de65491c1925 Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Mon, 19 Feb 2018 14:27:28 -0500 Subject: [PATCH 28/56] Update playground output --- MC/Blackjack Playground.ipynb | 144 ++++++++++++++++++++++++++++++---- 1 file changed, 128 insertions(+), 16 deletions(-) diff --git a/MC/Blackjack Playground.ipynb b/MC/Blackjack Playground.ipynb index f4f6ffe84..412322175 100644 --- a/MC/Blackjack Playground.ipynb +++ b/MC/Blackjack Playground.ipynb @@ -31,22 +31,134 @@ "name": "stdout", "output_type": "stream", "text": [ - "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", - "Taking action: Hit\n" - ] - }, - { - "ename": "RecursionError", - "evalue": "maximum recursion depth exceeded", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRecursionError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0maction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstrategy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Taking action: 
{}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"Stick\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Hit\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mobservation\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mprint_observation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "... last 1 frames repeated, from the frame below ...\n", - "\u001b[0;32m~/Workspace/src/github.com/dennybritz/reinforcement-learning/lib/envs/blackjack.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, action)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_seed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded" + "Player Score: 19 (Usable Ace: False), Dealer Score: 5\n", + "Taking action: Hit\n", + "Player Score: 27 (Usable Ace: False), Dealer Score: 5\n", + "Game end. 
Reward: -1.0\n", + "\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Game end. Reward: 0.0\n", + "\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 14 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 19 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 6\n", + "Taking action: Hit\n", + "Player Score: 27 (Usable Ace: False), Dealer Score: 6\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 3\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 3\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 3\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Hit\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 14 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 16 (Usable Ace: True), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: True), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", + "Game end. 
Reward: 1.0\n", + "\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 12 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Hit\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Hit\n", + "Player Score: 24 (Usable Ace: False), Dealer Score: 4\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 7\n", + "Taking action: Hit\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 8\n", + "Game end. Reward: -1.0\n", + "\n" ] } ], From 8da669c1496a617de8cbdf8c62ef075a4b9d8f3f Mon Sep 17 00:00:00 2001 From: Sanyam Kapoor <1sanyamkapoor@gmail.com> Date: Tue, 20 Feb 2018 16:42:36 -0500 Subject: [PATCH 29/56] Fix missing render() --- TD/Cliff Environment Playground.ipynb | 19 +++++++++++-------- TD/Windy Gridworld Playground.ipynb | 23 +++++++++++++---------- lib/envs/cliff_walking.py | 5 ++++- lib/envs/windy_gridworld.py | 5 ++++- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/TD/Cliff Environment Playground.ipynb b/TD/Cliff Environment Playground.ipynb index d50da42b6..414cf811d 100644 --- a/TD/Cliff Environment Playground.ipynb +++ b/TD/Cliff Environment Playground.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import gym\n", @@ -21,9 +19,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -80,6 +76,13 @@ "print(env.step(2))\n", "env.render()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -98,9 +101,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/Windy Gridworld Playground.ipynb b/TD/Windy Gridworld Playground.ipynb index 7c37d7857..0572c0d86 100644 --- a/TD/Windy Gridworld Playground.ipynb +++ b/TD/Windy Gridworld Playground.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import gym\n", @@ -20,10 +18,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 2, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -119,6 +115,13 @@ "print(env.step(1))\n", "env.render()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] } ], "metadata": { @@ -137,9 +140,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/lib/envs/cliff_walking.py b/lib/envs/cliff_walking.py index 37516ad1b..30b2ff7bb 100644 --- a/lib/envs/cliff_walking.py +++ b/lib/envs/cliff_walking.py @@ -53,6 +53,9 @@ def __init__(self): super(CliffWalkingEnv, self).__init__(nS, nA, P, isd) + def render(self, mode='human', close=False): + self._render(mode, close) + def _render(self, mode='human', close=False): if close: return @@ -78,4 +81,4 @@ def _render(self, mode='human', close=False): output += "\n" outfile.write(output) - outfile.write("\n") \ No newline at end of file + outfile.write("\n") diff --git a/lib/envs/windy_gridworld.py b/lib/envs/windy_gridworld.py index 7524dbd58..720c5974b 100644 --- a/lib/envs/windy_gridworld.py +++ b/lib/envs/windy_gridworld.py @@ -53,6 +53,9 @@ def __init__(self): super(WindyGridworldEnv, self).__init__(nS, nA, P, isd) + def render(self, mode='human', close=False): + self._render(mode, close) + def _render(self, mode='human', close=False): if close: return @@ -76,4 +79,4 @@ def _render(self, mode='human', close=False): output += "\n" outfile.write(output) - outfile.write("\n") \ No newline at end of file + outfile.write("\n") From 542cbf04e553b9bbac7c4dc7e0dfd69dacb458f5 Mon Sep 17 00:00:00 2001 From: jonahweissman <19804455+jonahweissman@users.noreply.github.com> Date: Wed, 7 Mar 2018 18:13:01 -0500 Subject: [PATCH 30/56] Fix typo in MC Control dictionar -> dictionary --- MC/MC Control with Epsilon-Greedy Policies Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb index 0f10d783e..40af11f40 100644 --- a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb @@ -139,7 +139,7 @@ " returns_count[sa_pair] += 1.0\n", " Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]\n", " \n", - " # The policy is improved implicitly by changing the Q dictionar\n", + " # The policy is improved implicitly by changing the Q dictionary\n", " \n", " return Q, policy" ] From c90ebaf06ab507d4a2cb7eaa9bf382bb0f94d2ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ayberk=20Ayd=C4=B1n?= Date: Fri, 13 Apr 2018 18:20:08 +0300 Subject: [PATCH 31/56] correction for state processor output shape --- DQN/Deep Q Learning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index 29631ce0a..fcd7191a8 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -74,7 +74,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] From 56f893c059be47d3d36cf5fbdf9a5bb1270ef182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ayberk=20Ayd=C4=B1n?= Date: Sat, 14 Apr 2018 14:16:46 +0300 Subject: [PATCH 32/56] typo fix and correction for state processor output shape --- DQN/Deep Q Learning Solution.ipynb | 4 ++-- DQN/Deep Q Learning.ipynb | 2 +- DQN/Double DQN Solution.ipynb | 2 +- DQN/dqn.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/DQN/Deep Q 
Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index 1477005ef..fc88b90ae 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -79,7 +79,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] @@ -144,7 +144,7 @@ " gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl\n", " self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)\n", "\n", - " # Calcualte the loss\n", + " # Calculate the loss\n", " self.losses = tf.squared_difference(self.y_pl, self.action_predictions)\n", " self.loss = tf.reduce_mean(self.losses)\n", "\n", diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index fcd7191a8..d3a51697f 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -137,7 +137,7 @@ " gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl\n", " self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)\n", "\n", - " # Calcualte the loss\n", + " # Calculate the loss\n", " self.losses = tf.squared_difference(self.y_pl, self.action_predictions)\n", " self.loss = tf.reduce_mean(self.losses)\n", "\n", diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 7d8411fdd..3fc45722b 100644 --- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -73,7 +73,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] diff --git a/DQN/dqn.py b/DQN/dqn.py index be43ec08b..9d6532a8a 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -39,7 +39,7 @@ def process(self, sess, state): state: A [210, 160, 3] Atari RGB State Returns: - A processed [84, 84, 1] state representing grayscale values. + A processed [84, 84] state representing grayscale values. """ return sess.run(self.output, { self.input_state: state }) @@ -95,7 +95,7 @@ def _build_model(self): gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices) - # Calcualte the loss + # Calculate the loss self.losses = tf.squared_difference(self.y_pl, self.action_predictions) self.loss = tf.reduce_mean(self.losses) From 07dd722024306da428923bd2d7a64beb689ef6be Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sat, 26 May 2018 15:26:45 -0700 Subject: [PATCH 33/56] added the equation reference --- DP/Policy Evaluation Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index d69fe2546..0b06f87e7 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -58,7 +58,7 @@ " for a, action_prob in enumerate(policy[s]):\n", " # For each action, look at the possible next states...\n", " for prob, next_state, reward, done in env.P[s][a]:\n", - " # Calculate the expected value\n", + " # Calculate the expected value. Ref: Sutton book eq. 
4.6.\n", " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", " # How much our value function changed (across any states)\n", " delta = max(delta, np.abs(v - V[s]))\n", From 377c87595ae903e12df44886dba50ee40091a934 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sat, 26 May 2018 15:38:42 -0700 Subject: [PATCH 34/56] added Sutton book's equation --- DP/Value Iteration Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index c7134dff3..90ec96a17 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -74,7 +74,7 @@ " best_action_value = np.max(A)\n", " # Calculate delta across all states seen so far\n", " delta = max(delta, np.abs(best_action_value - V[s]))\n", - " # Update the value function\n", + " # Update the value function. Ref: Sutton book eq. 4.10. \n", " V[s] = best_action_value \n", " # Check if we can stop \n", " if delta < theta:\n", From 1b5c06f5b00bfa16a8138644387b013e15fbec29 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sun, 27 May 2018 17:13:36 -0700 Subject: [PATCH 35/56] Gambler's problem (ex.4.3) added. --- DP/Gamblers Problem Solution.ipynb | 289 +++++++++++++++++++++++++++++ DP/Gamblers Problem.ipynb | 154 +++++++++++++++ DP/README.md | 4 + 3 files changed, 447 insertions(+) create mode 100644 DP/Gamblers Problem Solution.ipynb create mode 100644 DP/Gamblers Problem.ipynb diff --git a/DP/Gamblers Problem Solution.ipynb b/DP/Gamblers Problem Solution.ipynb new file mode 100644 index 000000000..d3880ef80 --- /dev/null +++ b/DP/Gamblers Problem Solution.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", + "\n", + "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", + "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", + "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", + "or loses by running out of money. \n", + "\n", + "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", + "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", + "\n", + "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", + "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "if \"../\" not in sys.path:\n", + " sys.path.append(\"../\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "\n", + "### Exercise 4.9 (programming)\n", + "\n", + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", + " \"\"\"\n", + " Args:\n", + " p_h: Probability of the coin coming up heads\n", + " \"\"\"\n", + " # The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + " rewards = np.zeros(101)\n", + " rewards[100] = 1 \n", + " \n", + " # We introduce two dummy states corresponding to termination with capital of 0 and 100\n", + " V = np.zeros(101)\n", + " \n", + " def one_step_lookahead(s, V, rewards):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " s: The gambler’s capital. Integer.\n", + " V: The vector that contains values at each state. \n", + " rewards: The reward vector.\n", + " \n", + " Returns:\n", + " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " \"\"\"\n", + " A = np.zeros(101)\n", + " stakes = range(1, min(s, 100-s)+1) # Your minimum bet is 1, maximum bet is min(s, 100-s).\n", + " for a in stakes:\n", + " # rewards[s+a], rewards[s-a] are immediate rewards.\n", + " # V[s+a], V[s-a] are values of the next states.\n", + " # This is the core of the Bellman equation: \n", + " # The expected value of your action is the sum of immediate rewards and the value of the next state.\n", + " A[a] = p_h * (rewards[s+a] + V[s+a]*discount_factor) + (1-p_h) * (rewards[s-a] + V[s-a]*discount_factor)\n", + " return A\n", + " \n", + " while True:\n", + " # Stopping condition\n", + " delta = 0\n", + " # Update each state...\n", + " for s in range(1, 100):\n", + " # Do a one-step lookahead to find the best action\n", + " A = one_step_lookahead(s, V, rewards)\n", + " # print(s,A,V) # if you want to debug.\n", + " best_action_value = np.max(A)\n", + " # Calculate delta across all states seen so far\n", + " delta = max(delta, np.abs(best_action_value - V[s]))\n", + " # Update the value function. Ref: Sutton book eq. 4.10. \n", + " V[s] = best_action_value \n", + " # Check if we can stop \n", + " if delta < theta:\n", + " break\n", + " \n", + " # Create a deterministic policy using the optimal value function\n", + " policy = np.zeros(100)\n", + " for s in range(1, 100):\n", + " # One step lookahead to find the best action for this state\n", + " A = one_step_lookahead(s, V, rewards)\n", + " best_action = np.argmax(A)\n", + " # Always take the best action\n", + " policy[s] = best_action\n", + " \n", + " return policy, V" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimized Policy:\n", + "[ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 12. 11. 15. 16. 17.\n", + " 18. 6. 20. 21. 3. 23. 24. 25. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.\n", + " 11. 12. 38. 11. 10. 9. 42. 7. 44. 5. 46. 47. 48. 49. 50. 1. 2. 3.\n", + " 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 11. 10. 9. 17. 7. 19. 5. 21.\n", + " 22. 23. 24. 25. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 12. 11.\n", + " 10. 9. 8. 7. 6. 5. 4. 3. 2. 
1.]\n", + "\n", + "Optimized Value Function:\n", + "[0.00000000e+00 7.24792480e-05 2.89916992e-04 6.95257448e-04\n", + " 1.16010383e-03 1.76906586e-03 2.78102979e-03 4.03504074e-03\n", + " 4.66214120e-03 5.59997559e-03 7.08471239e-03 9.03964043e-03\n", + " 1.11241192e-02 1.56793594e-02 1.61464431e-02 1.69517994e-02\n", + " 1.86512806e-02 1.98249817e-02 2.24047303e-02 2.73845196e-02\n", + " 2.83388495e-02 3.04937363e-02 3.61633897e-02 3.84953022e-02\n", + " 4.44964767e-02 6.25000000e-02 6.27174377e-02 6.33700779e-02\n", + " 6.45857723e-02 6.59966059e-02 6.78135343e-02 7.08430894e-02\n", + " 7.46098323e-02 7.64884604e-02 7.93035477e-02 8.37541372e-02\n", + " 8.96225423e-02 9.58723575e-02 1.09538078e-01 1.10939329e-01\n", + " 1.13360151e-01 1.18457374e-01 1.21977661e-01 1.29716907e-01\n", + " 1.44653559e-01 1.47520113e-01 1.53983246e-01 1.70990169e-01\n", + " 1.77987434e-01 1.95990576e-01 2.50000000e-01 2.50217438e-01\n", + " 2.50870078e-01 2.52085772e-01 2.53496606e-01 2.55313534e-01\n", + " 2.58343089e-01 2.62109832e-01 2.63988460e-01 2.66803548e-01\n", + " 2.71254137e-01 2.77122542e-01 2.83372357e-01 2.97038078e-01\n", + " 2.98439329e-01 3.00860151e-01 3.05957374e-01 3.09477661e-01\n", + " 3.17216907e-01 3.32153559e-01 3.35020113e-01 3.41483246e-01\n", + " 3.58490169e-01 3.65487434e-01 3.83490576e-01 4.37500000e-01\n", + " 4.38152558e-01 4.40122454e-01 4.43757317e-01 4.47991345e-01\n", + " 4.53440603e-01 4.62529268e-01 4.73829497e-01 4.79468031e-01\n", + " 4.87912680e-01 5.01265085e-01 5.18867627e-01 5.37617932e-01\n", + " 5.78614419e-01 5.82817988e-01 5.90080452e-01 6.05372123e-01\n", + " 6.15934510e-01 6.39150720e-01 6.83960814e-01 6.92560339e-01\n", + " 7.11950883e-01 7.62970611e-01 7.83963162e-01 8.37972371e-01\n", + " 0.00000000e+00]\n", + "\n" + ] + } + ], + "source": [ + "policy, v = value_iteration_for_gamblers(0.25)\n", + "\n", + "print(\"Optimized Policy:\")\n", + "print(policy)\n", + "print(\"\")\n", + "\n", + "print(\"Optimized Value Function:\")\n", + "print(v)\n", + "print(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Show your results graphically, as in Figure 4.3.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3Xd8HNW5//HPI8mqlmRky7jjbmMb\nQhGmJKGH0OEmJIFAQkng5hJCCKSQhCSENNJuknshxaH3UPIjhksghNCbLReMC26Si1xlSbZlyerP\n748ZKWtZZW1rtCrf9+u1L+3Mnp15zs5qnznnTDF3R0REBCAp0QGIiEjPoaQgIiItlBRERKSFkoKI\niLRQUhARkRZKCiIi0kJJIUJmNsbMdplZchcs6z4z+3FXxNVquW5mE8PnfzSz70Wwjv80s99GsNxL\nzewfXb3crrS/283MDjezt6KIqaczszPM7OluWtd3zOyuA3j/GjM7PXx+vZnd3nXRJYaSQhcIvxi7\nwwTQ/Bjh7uvcfaC7N0a8/ivMrDFc704zW2hm5+7rctz9S+7+oy6OLRW4BfjlAS5nbJjAUprnufvD\n7n7Ggca4j3G0/AhEyd0XAdvN7Lwo12NmqWb2azMrCb8/xWb2m5jX96m+XbTz8lOg5cfVAteb2WIz\nqwpjfcLMDjvA9eDuP3X3L4br2es7to9mAZeZ2dADjSuRlBS6znlhAmh+bOzm9b/t7gOBQcDdwONm\nltfNMbTlAuADd9+Q6EB6oYeB/4x4Hd8GCoCZQDZwCrAg4nW2y8yOAXLd/Z2Y2b8DvgpcD+QBk4Gn\ngXO6P8L2uXsN8Hfg84mO5UAoKUSo9Z6Hmb1iZj8yszfNrNLM/mFmQ2LKP2Fmm81sh5m9ZmbT93Wd\n7t4E3ANkAOPD5V5tZqvMrNzMZpvZiHbi3WMvz8wuCFsdO81stZmdaWafMrN5rd53UwfN/bOAV1uV\nb7eeZpYR7rmuDV9/w8wygNfCItvDPdrjwxbSGzHvPcHM5obvm2tmJ8S81uFn3yq+IWb2rJltDz+z\n180sycweBMYAz4QxfLOz+rRabraZvWxm/xPu/aaZ2a/MbJ2ZbbGg+y4j5i2vAKeZWVoby7rYzApb\nzfuamc0On59tZkvDum4ws6+3FRNwDPD/3H2jB9a4+wPhMvapvmZ2DXAp8M2w/DPh/BFm9pSZlVrQ\nErm+nVig1ffFzCYBXwYucfd/uXutu1eHrcTbwzLnmNmC8Hu63sxujXl/8//gNWa20cw2mdlNMa/f\namYPhZNtfccmmNm/zKzMzLaZ2cNmNqiD+F+hhyWrfebuehzgA1gDnN7G/LGAAynh9CvAaoI9nYxw\n+vaY8lcR7K2lAb8FFsa8dh/w43bWfwXwRvg8hWCvqhLIBU4FtgFHhcv9X+C1mPc6MLH1Ogj2HHcA\nHyPYeRgJTA2XUQ4cGrOMBcAn24ltLvCpVvM6qued4ecyEkgGTgjL7fFZtlHvPKAC+Fz4GVwSTg+O\n57NvFd/PgD8CA8LHRwFrb1vHs92AwcCc2G0Ylp0dxp4NPAP8rNWydwKHtxFjZriNJ7X6rC8On28C\nPho+Pwg4qp263gKsA64FDmuuZ0ff7XjqGzOdBMwDvg+kEuyoFAEfbyeeJ4BvxEx/CVjbyf/fyWHs\nScDhwBbgwlb/g48CWWG50uY6AbcCD7X1/xrOm0jwP5AG5BMkjt+29/kQ/J+VJ/L36EAfCQ+gLzzC\nL8YuYHv4eDqcv8eXjOCH6JaY910LPN/OMgeF780Np/f4Z2tV9gqgIVz3NuCdmC/93cAvYsoOBOqB\nseF0e0nhT8Bv2lnfH4CfhM+nE/z4prVTdiVwZgefXUs9w3/q3cCH2ijX1j/sFfw7KXwOmNPqPW8D\nV+zHZ38b8Lfmz6WNbb3XDkAn2+0eYDF7/tgZUAVMiJl3PFDcankbgBPbWddDwPfD55MIkkRmOL2O\noOspp5PvbjLBnvibQC2wEbj8AOsbmxSOBda1es+3gXvbWd6LwJdipr8LvLOP/4+/bf7uxnxvpsa8\n/gvg7vD5rXSQFNpY9oXAgvY+n3A7NO5LvD3toe6jrnOhuw8KHxd2UG5zzPNqgh9pzCzZzG4Pu2l2\nEnzZANrs4mjDO+G6h7j7ce7+z3D+CGBtcyF33wWUEeyJd2Q0wZ51W+4HPmtmRvBj/Li717ZTtoJg\nrxLotJ5DgPQO1tuRPeoZWsue9Wzzs2/DL4FVwD/MrMjMbm5vpXFut3MIWid/jJmXT7C3Py/sptoO\nPB/Oj5VNkOzb8ghBiwjgswQ7I9Xh9CeBs4G1ZvaqmR3f1gLcvdHd73T3DxP8wP8EuMfMDj2A+sY6\nBBjRXMewnt8BDm6n/B7fF4Lv6vB2yjbHdGzYLVdqZjsIWhet41kf83wtwfelU2Y21MweC7vgdhIk\n4o7+J7MJWti9lpJCz/FZgkHZ0wn2mseG8+0Al7uR4B8zWJhZFkFXRmcDv+uBCW294MEgYB1Bt8pn\ngQc7WM4igi6bZh3VcxtQ0856O7uc7x71DI2h83ruvSL3Sne/yd3HA+cBN5rZae3EEc92+zPBD/5z\n4ecPQV13A9NjdiZyPThYIFhAMPaTCixvJ9R/AEPM7AiC5PBITB3muvsFwFCCQdnH46j3bne/k+CH\nedp+1rd1+fUErZ9BMY9sdz+7nTBaf19eAkaZWUEHoT9C0A032t1zCZJv6/+b0THPxxB8X1pr6zv2\ns3D+4e6eA1zWxrJjHQq818HrPZ6SQs+RTdB8LyPYg/xpFy33EeBKMzsiHLD8KfCuu6/p5H13h+87\nLRxkHWlmU2NefwC4A2hw9zfaXgQAzwEnxUy3W0//9yD5f4eDk8nhYF8aQT9wE+HgeTvrmWxmnzWz\nFDP7DMEP27Od1HMvZnaumU0MW0I7gcbwAUF/dWwM8W636wh+3J81s4ywrn8GfmPhIYzhZ/zxmPec\nDPyrvVaYuzcATxK0bPIIul6aDzO91Mxy3b0+pg5t1fUGMzvZggH+FDO7PKxT8xFI+1rf1uXnADvN\n7FvhOpLNbIYFRxm1ZY/vi7uvBH4PPBrGmWpm6RYMtDe34LIJ+vFrzGwmQeJq7XtmlhkOil8J/KWN\nMm19x7IJu4bNbCTwjXbibnYSwRFIvZaSQs/xAEGzdgOwlGBc4IC5+0vA94CnCAYfJwAXx/G+OQT/\nPL8haA6/yp574g8CM+i4lQDB4OlU+/cRT53V8+vA+wSDpuXAz4GksFvkJ8CbYTfEca3iLQPOBW4i\n+MH6JnCuu2/rrK5tmAT8k+DH4G3g9+7+Svjaz4Bbwhi+Hkd9muNz4BqCPee/mVk68C2Cbqp3wq6J\nfwJTYt52KXt2ObXlEYK99ifCJNHsc8CacLlfItjDbctu4NcEXWvbCMYXPunuRftZ37uBaWH5pz04\nR+c84AigOFzHXQStjL24+3xgh5kdGzP7eoIdkDsJutJW
A/9B8N2CYHzoNjOrJBjQbqtV9CrBZ/0S\n8Ct33+ukx3a+Yz8kGDzeAfwf8Ne24gYIt+nZBN2rvVbzERUi+8SCQye3EhzVsrKTstcA09z9hm4J\nrg+w4MSsWe7e5lhAX2ZmZwDXdjI2F++yxhIkowGtkmaXM7OvEHRhfTPK9URNSUH2i5ndSLAnfmqi\nYxFpT3cmhb5if0/nln7MzNYQDLYd8J6ciPQsaimIiEgLDTSLiEiLXtd9NGTIEB87dmyiwxAR6VXm\nzZu3zd1bnxy5l16XFMaOHUthYWHnBUVEpIWZtT7jv03qPhIRkRZKCiIi0kJJQUREWigpiIhICyUF\nERFpoaQgIiItlBRERKSFkoKISA/X1OT89LllLCpp7yZ8XUdJQUSkh1uxtZJZrxWxcsuuyNelpCAi\n0sPNKS4HYOa4vMjXpaQgItLDvVtczojcdEYdlBH5upQURER6MHdnTnE5M8flEdw2PFpKCiIiPdja\nsmpKK2s5phu6jkBJQUSkR2seTzi2LyQFMzvTzJab2Sozu7mN18eY2ctmtsDMFpnZ2VHGIyLS27xb\nXE5eVioT8gd2y/oiSwpmlgzcCZwFTAMuMbNprYrdAjzu7kcCFwO/jyoeEZHeaM6aMmaO7Z7xBIi2\npTATWOXuRe5eBzwGXNCqjAM54fNcYGOE8YiI9CqbduxmffnubjkUtVmUSWEksD5muiScF+tW4DIz\nKwGeA77S1oLM7BozKzSzwtLS0ihiFRHpcbrz/IRmUSaFtto63mr6EuA+dx8FnA08aGZ7xeTus9y9\nwN0L8vM7vcWoiEifMKe4nIFpKRw6PKfzwl0kyqRQAoyOmR7F3t1DXwAeB3D3t4F0YEiEMYmI9Bpz\nisspGHsQyUndM54A0SaFucAkMxtnZqkEA8mzW5VZB5wGYGaHEiQF9Q+JSL+3dWcNK7fu6tauI4gw\nKbh7A3Ad8AKwjOAooyVmdpuZnR8Wuwm42szeAx4FrnD31l1MIiL9zs+fX05KkvHx6cO6db0pUS7c\n3Z8jGECOnff9mOdLgQ9HGYOISG/z9uoynppfwrUnT+i28xOa6YxmEZEepLahke8+/T6j8zL4yqmT\nun39kbYURERk3/zp1SKKSqu478pjyEhN7vb1q6UgItJDbNy+mzteXsU5hw3n5ClDExKDkoKISA/x\n0rIt1DU0cdMZkxMWg5KCiEgP8XZRGSNy0xk3JCthMSgpiIj0AE1NzturyzhuwuBuu/hdW5QURER6\ngOVbKqmorueECYm9qIOSgohID/D26jIAjp8wOKFxKCmIiPQAb60u45DBmYwclJHQOJQUREQSrLHJ\nebe4jOPHJ7aVAEoKIiIJt2TjDiprGhLedQRKCiIiCdcynqCWgoiIvLW6jAn5WQzNSU90KEoKIiKJ\nVN/YxNw15Qk/FLWZkoKISALNW1tBdV1jjxhPACUFEZGEqWto4rZnljJkYBofmdQzWgq6dLaISILc\n+fIqlm7ayazPHU1O+oBEhwOopSAikhCLN+zgzpdX8R9HjuSMbr7lZkeUFEREulltQyNff+I98rJS\n+cF50xIdzh7UfSQi0o1Wba3kG08u4oPNldx9eQGDMlMTHdIelBRERLpBQ2MTs14v4rf/XElmajK/\nu/gITjv04ESHtRclBRGRbnDfW2v4xfPLOWvGMG67YAb52WmJDqlNSgoiIt3g+cWbmTEyhz9cdnSi\nQ+mQBppFRCJWUVXH/HUVnDq153UXtaakICISsddWltLkcMqU/ESH0iklBRGRiL38wVYGZ6XyoVGD\nEh1Kp5QUREQi1NjkvLqilJMm55OUZIkOp1NKCiIiEVq4fjsV1fWcMnVookOJi5KCiEiEXlm+leQk\n48RJPX88AZQUREQi9a8PtnL0mIPIzewZF7zrjJKCiEhEtuysYcnGnZw8tXe0EkBJQUQkMq8s3wrA\nqb1kPAGUFEREIlHf2MSs14oYn5/FlIOzEx1O3JQUREQi8NicdawureLmM6di1vMPRW2mpCAi0sV2\n1tTzm3+u5LjxeXxsWs+/tEUsJQURkS5258urqKiu45ZzpvWqVgIoKYiIdKn15dXc+8YaPnHkKGaM\nzE10OPtMSUFEpIts3L6bLz00j6Qk+MbHpyQ6nP0SaVIwszPNbLmZrTKzm9sp82kzW2pmS8zskSjj\nERGJytw15Zx/xxusLavm95cexbDc9ESHtF8iu8mOmSUDdwIfA0qAuWY2292XxpSZBHwb+LC7V5hZ\n7zmYV0Qk9Nf5JXzrqUWMOiiTx645molDe88hqK1Feee1mcAqdy8CMLPHgAuApTFlrgbudPcKAHff\nGmE8IiJd7tUVpXzjyUUcOy6PP1x2NLkZveNyFu2JsvtoJLA+ZroknBdrMjDZzN40s3fM7My2FmRm\n15hZoZkVlpaWRhSuiMi+Wb65kusens+koQOZ9fmCXp8QINqk0NZxWN5qOgWYBJwMXALcZWZ73YXC\n3We5e4G7F+Tn955riIhI31VaWctV980lPTWZe644hoFpfeOW91EmhRJgdMz0KGBjG2X+5u717l4M\nLCdIEiIiPZK78/zizVz0x7coq6rl7ssLGDEoI9FhdZkok8JcYJKZjTOzVOBiYHarMk8DpwCY2RCC\n7qSiCGMSEdkvTU3Ou0VlfGbWO3zpoXkMSE7ivitncngvuMXmvtin9o6ZHQSMdvdFnZV19wYzuw54\nAUgG7nH3JWZ2G1Do7rPD184ws6VAI/ANdy/b51qIiERk6cadPDZ3HS8s2cyWnbUMzkrlxxfO4OJj\nRpOS3PdO9TL31t38rQqYvQKcT5BAFgKlwKvufmPk0bWhoKDACwsLE7FqEelnauobmfmTf1LX2MTJ\nk4dy5oxhnD7t4F45fmBm89y9oLNy8dQs1913mtkXgXvd/Qdm1mlLQUSkt3tr9TZ21jRw75XHcMqU\n/nEaVTxtnxQzGw58Gng24nhERHqMFxZvITsthRMmDE50KN0mnqRwG0Hf/2p3n2tm44GV0YYlIpJY\nDY1NvLhsC6dMHUpaSnKiw+k2nXYfufsTwBMx00XAJ6MMSkQk0QrXVlBeVcfHpw9LdCjdqtOWgplN\nNrOXzGxxOH24md0SfWgiIonz/OLNpKYkcfKU/nXCbDzdR38muGhdPUB4OOrFUQYlIpJI7s4/lmzm\nxElDyOqFRxodiHiSQqa7z2k1ryGKYEREeoL3N+xg444azuhnXUcQX1LYZmYTCK9bZGYXAZsijUpE\nJIFeWLKZ5CTj9EN71/2Vu0I87aIvA7OAqWa2ASgGLo00KhGRBGlqcv6+eDMzx+aRl5Wa6HC6XTxJ\nwd39dDPLApLcvdLMxkUdmIhIItz31hqKSqv4yqkTEx1KQsTTffQUgLtXuXtlOO/J6EISEUmMlVsq\nuf35Dzht6lAuPKL17V/6h3ZbCmY2FZgO5JrZJ2JeygF6581HRUTaUdfQxNceX8jAtBRu/+ThmLV1\nS5i+r6Puoyn
AucAg4LyY+ZUEt9EUEekz/uellSzesJM/fe5o8rPTEh1OwrSbFNz9b8DfzOx4d3+7\nG2MSEelW7xSV8ftXVnHR0aP63RnMrcUz0LzAzL5M0JXU0m3k7ldFFpWISDcp21XLVx9bwCGDs7j1\n/OmJDifh4hlofhAYBnwceJXgtpqVHb5DRKQXaGpybnriPSqq67njs0f2yvskdLV4ksJEd/8eUOXu\n9wPnAIdFG5aISPTueqOIV5aX8r1zDmX6iNxEh9MjxJMU6sO/281sBpALjI0sIhGRbvCvD7bwi+eX\nc9aMYVx23CGJDqfHiKetNCu8N/P3gNnAQOD7kUYlIhKhV1eU8qUH5zNtRA4/v6j/Hn7alnjup3BX\n+PRVYHy04YiIROutVdu45oFCJg4dyANXzSQnfUCiQ+pROk0KZjYI+DxBl1FLeXe/PrqwRES6VlOT\n8+jcdfz42WWMHZzFQ188lkGZ/e/aRp2Jp/voOeAd4H2gKdpwRES6XvG2Km5+ahHvFpdzwoTB/O7i\nI/vlxe7iEU9SSHf3GyOPRESki63aWsm9b67hyXklpKYk8fNPHsanC0ZrDKED8SSFB83sauBZoLZ5\npruXRxaViMgBWF26i1tnL+H1ldtITUniwiNGcNMZUzg4R5dt60w8SaEO+CXwXcIb7YR/NegsIj3O\nu0VlXPPgPJIMvn7GZC6ZOYbBA/vvtYz2VTxJ4UaCE9i2RR2MiMiB+NvCDXzjiUWMysvgvitmMmZw\nZqJD6nXiSQpLgOqoAxER2V+1DY38+h8rmPVaEceOy+NPnztaRxbtp3iSQiOw0MxeZs8xBR2SKiIJ\n98Hmndzw2EI+2FzJpceO4fvnTSMtJTnRYfVa8SSFp8OHiEiPsWZbFQ++s5YH315LTsYA7rmigFOn\nHpzosHq9eM5ovr87AhER6UxVbQMvL9/KE4UlvLqilJQk47wPjeCWcw7VYHIX6eh2nI+7+6fN7H3+\nfdRRC3c/PNLIRESALTtreHVFKS8u3cJrK0qpbWji4Jw0vnb6ZC6ZOZqhOsy0S3XUUvhq+Pfc7ghE\nRPqnxiansqaeXbUNVNU2snHHbopKqygq3cW8tRV8sDm4fcvw3HQumTmGs2YMo2BsHslJOgEtCh3d\njnNT+PRad/9W7Gtm9nPgW3u/S0Rk31z0x7dYsG77XvNz0lOYMTKXb581lRMn5zN1WLbORO4G8Qw0\nf4y9E8BZbcwTEdknWytrWLBuO+cePpwTJ+WTlZbC0Jw0xg/JIi8rVUkgAToaU/gv4Fpggpktinkp\nG3gz6sBEpO+bW1wBwBc/Op4jRg9KcDQCHbcUHgH+DvwMuDlmfqWueyQiXWFOcRmZqclMH5GT6FAk\n1O7tON19h7uvAW4BNrv7WmAccFl4jwURkQPybnE5Rx9yEAOS47kzsHSHeLbEU0CjmU0E7iZIDI9E\nGpWI9Hk7qutZvqWSY8bmJToUiRFPUmhy9wbgE8Bv3f1rwPB4Fm5mZ5rZcjNbZWY3d1DuIjNzMyuI\nL2wR6e0K15bjDjPHKSn0JPEkhXozu4TglpzPhvM6vampmSUDdxIcqTQNuMTMprVRLhu4Hng33qBF\npPebU1xOanKSBph7mHiSwpXA8cBP3L3YzMYBD8XxvpnAKncvcvc64DHggjbK/Qj4BVATZ8wi0ge8\nW1zOh0bnkj5AF6/rSdpNCmaWA+DuS939end/NJwuJr4xhZHA+pjpknBe7DqOBEa7+7OISL9RVdvA\n4g07NJ7QA3XUUnil+YmZvdTqtXiumtrWWSct11AysyTgN8BNnS7I7BozKzSzwtLS0jhWLSI92YJ1\n22loco0n9EAdJYXYH/XWWy6e0wxLgNEx06OAjTHT2cAM4BUzWwMcB8xua7DZ3We5e4G7F+Tn58ex\nahHpyeYUl5FkcPQhByU6FGmlo6Tg7Txva7otc4FJZjbOzFKBi4HZLQsIzoMY4u5j3X0s8A5wvrsX\nxhe6iPRWc9aUM31ELtnpnR6zIt2sozOah5rZjQStgubnhNOd7q67e4OZXQe8ACQD97j7EjO7DSh0\n99kdL0FE+qLFG3ZQuKaCqz4yLtGhSBs6Sgp/Jujiaf0c4K54Fu7uzwHPtZr3/XbKnhzPMkWk96qq\nbeD6RxcweGAqXzppQqLDkTZ0dOnsH3ZnICLS9/3wmSUUl1Xx8BePJS8rNdHhSBt0wRER6RbPvLeR\nxwtLuPbkCZwwYUiiw5F2xHM/BRGR/bartoG7Xi/iT68WccToQdxw+uREhyQdUFIQkUi4Ow+9u47f\nvriCsqo6zj5sGD84b7quiNrDdZoUzOxg4KfACHc/K7x+0fHufnfk0YlIr/XQu+v43tOLOW58Hnef\ndaiucdRLxJOy7yM4rHREOL0CuCGqgESk93tv/XZ+9MxSTp06lEe+eJwSQi8ST1IY4u6PA00QnH8A\nNEYalYj0WhVVdVz78Hzys9P4709/iKQk3We5N4lnTKHKzAYTnsVsZscBOyKNSkR6pd11jdz4+EJK\nK2t58r+OZ1CmDjvtbeJJCjcSXJ5igpm9SXA280WRRiUivUp1XQMPv7OOP71WxLZdtfz4whkcPkpd\nRr1Rp0nB3eeb2UnAFIJLXCx39/rIIxORHsvdea9kB4VrylmwfjtvrdpGRXU9H5k4hK+efpQuid2L\nxXP00edbzTrKzHD3ByKKSUR6uF//YwV3vLwKgJGDMvjopHwuP+EQjj5EyaC3i6f76JiY5+nAacB8\nQElBpB96dM467nh5FZ86ehTf+PgUhuakJzok6ULxdB99JXbazHKBByOLSER6rJeXb+WWpxdz0uR8\nfvaJw0jRiWh9zv6c0VwNTOrqQESkZ3v5g61c98h8phyczZ2XHqWE0EfFM6bwDP++qU4SMA14PMqg\nRKTnWFdWzW3PLuWfy7YwIT+Le688hoFpukJOXxXPlv1VzPMGYK27l0QUj4j0ABu27+aNlaW8tnIb\nLy7dQkqScfNZU7nqw+NITVELoS+LZ0zh1e4IREQSq7Kmnmfe28Rf5q7jvZLg/NSh2Wl88qiRXH/a\nJIbnZiQ4QukO7SYFM6uk7XsxG+DunhNZVCISGXdne3U9m3fWUFRaxbJNO1m6aSdvry5jd30jUw7O\n5jtnT+XkKUOZNHQgZrpMRX/S0Z3Xstt7TUR6n7qGJm564j3+sWQztQ1NLfOTk4yJ+QO58MiRfLpg\nFEeMHqRE0I/FPVpkZkMJzlMAwN3XRRKRiHS5hsYmvvrYAv6+eDOXzBzDxKEDGZ6bzpi8TCYOHUj6\ngOREhyg9RDxHH50P/Jrg0tlbgUOAZcD0aEMTka7Q1OR888lF/H3xZr537jS+8JFxiQ5JerB4DiP4\nEXAcsMLdxxGc0fxmpFGJSJfYsH03N/xlIX9dsIGbPjZZCUE6FU/3Ub27l5lZkpklufvLZvbzyCMT\nkf1WvK2KP7yyir/O3wDADadP4rpTJyY4KukN4kkK281sIPAa8LCZbSU4
X0FEehB3p3BtBX9+rYgX\nl20hNTmJS48dwzUnTWDkIB1OKvGJJylcANQAXwMuBXKB26IMSkTit3lHDf/3/ib+tnADi0p2kJsx\ngGtPnsDlJ4xlaLYuVif7pqPzFO4AHnH3t2Jm3x99SCLSHndn884a3lu/nQXrt1O4poL56ypwh2nD\nc7jtgulcdPQoMlN1GQrZPx19c1YCvzaz4cBfgEfdfWH3hCUiABu37+aV5aW8uXobRaVVrC2rorou\nuEX6gGRj2ohcbjhtMud+aDgT8gcmOFrpCzo6ee13wO/M7BDgYuBeM0sHHgUec/cV3RSjSL/Q3Aoo\nXFNB4Zpy3i0u54PNlQCMyE1n6vAcjh8/mLFDMjlsZC7TRuSQlqLzC6RrmXtbV7Jop7DZkcA9wOHu\nnpBvY0FBgRcWFiZi1SL7zd2pqmuktLKW0spatlbWsHlH8Ni4Yzdry6pZW1bNrtrgGI7M1GSOGnMQ\nJ03O55Sp+UzI1+Um5MCY2Tx3L+isXDwnrw0AziRoLZwGvAr88IAjFOmDZr22mr/O30BDk9PY5NTU\nN1JV20BVXSONTXvvgKUPSGJEbgZjBmdyzNg8xg3J4qgxB3Ho8Gzdr0ASoqOB5o8BlwDnAHOAx4Br\n3L2qm2IT6VXcnbteLyZ9QDIzRuaQkpREakoSA9NSyEpLJid9APnZaS2P4TkZ5GSkqAUgPUpHLYXv\nAI8AX3f38m6KR6TXWltWzdb8d16TAAAQVklEQVTKWn584QwuO+6QRIcjsl86Gmg+pTsDEent5qwJ\n9p2OHZeX4EhE9p86LUW6yJzicg7KHMDEoTo0VHovJQWRLjKnuJxjxuZpjEB6NSUFkS6wacdu1pVX\nM1NdR9LLKSmIdIE5xc3jCYMTHInIgVFSEOkCc9eUk5WazKHDdRdb6d2UFES6wJzico4em6cTzqTX\ni/QbbGZnmtlyM1tlZje38fqNZrbUzBaZ2UvhdZZEepXyqjpWbNmlQ1GlT4gsKZhZMnAncBYwDbjE\nzKa1KrYAKHD3w4EngV9EFY9IVOaG5ydokFn6gihbCjOBVe5e5O51BJfJuCC2gLu/7O7V4eQ7wKgI\n4xGJxNziclJTkjh8VG6iQxE5YFEmhZHA+pjpknBee74A/L2tF8zsGjMrNLPC0tLSLgxR5MDU1Dfy\n4rItHDF6kC5jLX1ClEmhrTN42rxOt5ldBhQAv2zrdXef5e4F7l6Qn5/fhSGKHJif/N8y1pZV8+VT\nJiY6FJEuEeU9+0qA0THTo4CNrQuZ2enAd4GT3L02wnhEutQLSzbz4Dtr+eJHxnHSZO2sSN8QZUth\nLjDJzMaZWSrB/RhmxxYIb9rzJ+B8d98aYSwiXWrTjt1866lFTB+RwzfOnJLocES6TGRJwd0bgOuA\nF4BlwOPuvsTMbjOz88NivwQGAk+Y2UIzm93O4kR6jPdLdnD1A4XUNTTxv5ccqbEE6VOi7D7C3Z8D\nnms17/sxz0+Pcv0iXWltWRW/+scKnnlvIwdlDuC/P30E4/N1RVTpWyJNCiJ9RVHpLs6/400am5yv\nnDqRq08cT076gESHJdLllBREOrG7rpH/emg+qSlJ/O3LH2Z0XmaiQxKJjJKCSAfcne8+/T4rtlby\nwFUzlRCkz9PVu0Q68Je56/nr/A3ccNpkPjpJh51K36eWgkgbVm2t5H//tYpn3tvIiZPz+cqpOjlN\n+gclBRGgsclZsaWS+esqeGPlNp5fspmMAclcfeJ4rjtlIklJusWm9A9KCtLvLVhXwRfuL6S8qg6A\nwVmpfOmkCVz90fHkZaUmODqR7qWkIP3a2rIqvnB/IQPTUvj+udM4cswgxuRlYqaWgfRPSgrSb1VU\n1XHFvXNpcue+K4/RiWgiKClIP7Vjdz1XP1DIhu27eeSLxyohiISUFKRfaWpynppfws+f/4Dyqjr+\n55IjKRirO6aJNFNSkH6hpKKaN1Zu4y+F61mwbjtHjRnEfVfOZMZI3S1NJJaSgvRZ68ureWJeCc++\nt5GibVUAjMhN51ef+hCfOHKkDjMVaYOSgvQZ5VV1LNu0k6Ubd/LaylLeWLUNgA9PGMKlxx3CRycN\nYdLQgTqySKQDSgrSa/39/U08Onc9m3fsZtOOGiprGlpeG3VQBtefOolPHzOakYMyEhilSO+ipCC9\n0rOLNnL9owsYk5fJlGHZHD9+MKMOyuTQ4TkcOjybwQPTEh2iSK+kpCC9zj+XbuGGxxZScEge9181\nk4xU3flMpKsoKUiv4e68sGQz1z+2kGkjcrj7igIlBJEupqQgPV5Tk/Pisi3c+fIqFpXsYOqwbB64\naibZuvOZSJdTUpAeq6a+kacXbOCuN4pZtXUXY/Iy+dknDuMTR40kLUUtBJEoKClIj7NqayWzF27k\nkTnr2LarjmnDc/jdxUdwzmHDSUnWfaFEoqSkIAm3o7qehSXbmb+2gheWbOaDzZWYwUmT87n6o+M5\nYcJgnVsg0k2UFKRbrS+v5vWV21ixpZI1ZVUUb6tibVk1AGZw1JiDuPW8aZx92HCG5qQnOFqR/kdJ\nQSJVtquWwrUVzCku59UVpazauguAzNRkDhmcxfQROXy6YDRHjB7E4aNyNXgskmBKCnJAGpuc8qo6\nNu+oYfPOGjZu383asmrWllWxunQXa8JWQGpKEseOy+OzM8dw8pR8xg3JUpeQSA+kpCDtWlSynScK\nS2hoaqK+0alraKKqtoGqugZ27m6gdFct5VV1NDb5Hu9LH5DE2MFZTB2Ww8Uzx3DM2IOYMTJXRwyJ\n9AJKCtKmrTtruOLeueyuayQ7PYWUJGNAShJZqSkMTEtheG46h4/KJT87jfzsNA7OSWd4bjrDctPJ\nH5imVoBIL6WkIHtpanJufPw9qusaePYrH2Hi0OxEhyQi3UQHfcteZr1exBurtvGD86YrIYj0M0oK\nsofCNeX86oXlnDVjGBcfMzrR4YhIN1P3kQAwb20Ff3p1NS8u28KI3Axu/8ThGhcQ6YeUFPohd6dw\nbQXvFpWxbHMlyzbupGhbFbkZA/jKKRO5/ISx5GbqfAGR/khJoR+pqW9k9nsbuffNNSzbtBOA0XkZ\nTB2Ww+ePP4RPFYwmK01fCZH+TL8AfUxTkzNvXQUlFdVUVNWzvbqO4rJqVm6ppGhbFXUNTUw5OJvb\nP3EY5xw+XGcQi8gelBT6AHdnbVk1f51fwlPzN7Bh++6W18xg5KAMJg0dyImT8zl5cj7H6wJzItIO\nJYVeorHJ2bKzhpKK3ZRW1rJtVy2bd9awdONOFm/YQVlVHWbw0Un53HzWVKaPyGFQZiq5GQNITlIC\nEJH4KCn0ADX1jbyyvJQF6yrYXd/I7rpGdtc3srOmgcqaeiqq6tiwfTf1jXteTiI5yZg0dCCnTB3K\nYSNz+di0gxkxKCNBtRCRvkBJIQGqahtYXbqL1aW7eHt1GX9fvJnKmgZSk5PISksmfUAyGQOSyc4Y\nQE56CiMHZXDmjOGMyctk1EE
Z5GenMWRgGnlZqWoFiEiXijQpmNmZwO+AZOAud7+91etpwAPA0UAZ\n8Bl3XxNlTFFzd3bVNrC9up6tlTVs3F7Dph27WVdeTfG2KopKq9i0o6al/MC0FM6YfjAXHDGSD08Y\nrDuLiUhCRZYUzCwZuBP4GFACzDWz2e6+NKbYF4AKd59oZhcDPwc+E1VM8XB3ahuaqKlvpKa+iaq6\nBqprG9lV28CO3XWUV9VTUV1HaWUtWytrKK2sZefu4MqhVbUNVNY00NDqqqEAOekpjM8fyPHjBzM+\nP4uJQwcycehADhmcxQAlAhHpIaJsKcwEVrl7EYCZPQZcAMQmhQuAW8PnTwJ3mJm5+96/qgfo8bnr\nmfV6EU3u4NDkTkOT09DoNDQ1UdsQPOoamuJaXnZaCvk5aQzNTmPskEyyUlPISE0mN2MAgzIHMCgz\nlfyBaQwflM6IQRnk6NBPEekFokwKI4H1MdMlwLHtlXH3BjPbAQwGtsUWMrNrgGsAxowZs1/BDMoc\nwJSDs8EgyQwDUpKNlCQjJTmJtJQk0lKSSUtJIn1AMukDgr+ZqclkpaaQmZbMoIxU8rJSGZQ5gPQB\nujeAiPQ9USaFtkZAW7cA4imDu88CZgEUFBTsVyvijOnDOGP6sP15q4hIvxFlZ3YJEHuZzVHAxvbK\nmFkKkAuURxiTiIh0IMqkMBeYZGbjzCwVuBiY3arMbODy8PlFwL+iGE8QEZH4RNZ9FI4RXAe8QHBI\n6j3uvsTMbgMK3X02cDfwoJmtImghXBxVPCIi0rlIz1Nw9+eA51rN+37M8xrgU1HGICIi8dMB8iIi\n0kJJQUREWigpiIhICyUFERFpYb3tCFAzKwXW7ufbh9DqbOl+oj/Wuz/WGfpnvftjnWHf632Iu+d3\nVqjXJYUDYWaF7l6Q6Di6W3+sd3+sM/TPevfHOkN09Vb3kYiItFBSEBGRFv0tKcxKdAAJ0h/r3R/r\nDP2z3v2xzhBRvfvVmIKIiHSsv7UURESkA0oKIiLSot8kBTM708yWm9kqM7s50fFEwcxGm9nLZrbM\nzJaY2VfD+Xlm9qKZrQz/HpToWLuamSWb2QIzezacHmdm74Z1/kt4+fY+xcwGmdmTZvZBuM2P7yfb\n+mvh93uxmT1qZul9bXub2T1mttXMFsfMa3PbWuB/wt+2RWZ21IGsu18kBTNLBu4EzgKmAZeY2bTE\nRhWJBuAmdz8UOA74cljPm4GX3H0S8FI43dd8FVgWM/1z4DdhnSuALyQkqmj9Dnje3acCHyKof5/e\n1mY2ErgeKHD3GQSX5b+Yvre97wPObDWvvW17FjApfFwD/OFAVtwvkgIwE1jl7kXuXgc8BlyQ4Ji6\nnLtvcvf54fNKgh+JkQR1vT8sdj9wYWIijIaZjQLOAe4Kpw04FXgyLNIX65wDnEhwTxLcvc7dt9PH\nt3UoBcgI79aYCWyij21vd3+Nve9C2d62vQB4wAPvAIPMbPj+rru/JIWRwPqY6ZJwXp9lZmOBI4F3\ngYPdfRMEiQMYmrjIIvFb4JtAUzg9GNju7g3hdF/c3uOBUuDesNvsLjPLoo9va3ffAPwKWEeQDHYA\n8+j72xva37Zd+vvWX5KCtTGvzx6La2YDgaeAG9x9Z6LjiZKZnQtsdfd5sbPbKNrXtncKcBTwB3c/\nEqiij3UVtSXsR78AGAeMALIIuk9a62vbuyNd+n3vL0mhBBgdMz0K2JigWCJlZgMIEsLD7v7XcPaW\n5uZk+HdrouKLwIeB881sDUG34KkELYdBYfcC9M3tXQKUuPu74fSTBEmiL29rgNOBYncvdfd64K/A\nCfT97Q3tb9su/X3rL0lhLjApPEIhlWBganaCY+pyYV/63cAyd//vmJdmA5eHzy8H/tbdsUXF3b/t\n7qPcfSzBdv2Xu18KvAxcFBbrU3UGcPfNwHozmxLOOg1YSh/e1qF1wHFmlhl+35vr3ae3d6i9bTsb\n+Hx4FNJxwI7mbqb90W/OaDazswn2IJOBe9z9JwkOqcuZ2UeA14H3+Xf/+ncIxhUeB8YQ/FN9yt1b\nD2L1emZ2MvB1dz/XzMYTtBzygAXAZe5em8j4upqZHUEwuJ4KFAFXEuzo9eltbWY/BD5DcLTdAuCL\nBH3ofWZ7m9mjwMkEl8feAvwAeJo2tm2YHO8gOFqpGrjS3Qv3e939JSmIiEjn+kv3kYiIxEFJQURE\nWigpiIhICyUFERFpoaQgIiItlBREQmY2zMweM7PVZrbUzJ4zs8n7sZy7mi+4aGbfifM9a8xsyL6u\nS6Sr6ZBUEVpO/HsLuN/d/xjOOwLIdvfXD2C5u9x9YBzl1hBc+XPb/q5LpCuopSASOAWob04IAO6+\nEFhgZi+Z2Xwze9/MLoDggoPhfQzuD69h/6SZZYavvWJmBWZ2O8HVPBea2cPha0+b2bzwfgDXJKCe\nIh1SUhAJzCC42mZrNcB/uPtRBInj12GrAmAKMMvdDwd2AtfGvtHdbwZ2u/sR4aU3AK5y96OBAuB6\nMxscQV1E9puSgkjHDPipmS0C/klwOYWDw9fWu/ub4fOHgI/Esbzrzew94B2Ci5hN6uJ4RQ5ISudF\nRPqFJfz7gmqxLgXygaPdvT7s+08PX2s9INfhAF14babTgePdvdrMXolZlkiPoJaCSOBfQJqZXd08\nw8yOAQ4huF9DvZmdEk43G2Nmx4fPLwHeaGO59eHlzAFygYowIUwluGWqSI+ipCACeHAY3n8AHwsP\nSV0C3Ao8BxSYWSFBq+GDmLctAy4Pu5byaPveuLOAReFA8/NASlj+RwRdSCI9ig5JFdkP4e1Onw1v\nHi/SZ6ilICIiLdRSEBGRFmopiIhICyUFERFpoaQgIiItlBRERKSFkoKIiLT4/4EmbUnRp+/0AAAA\nAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting Final Policy (action stake) vs State (Capital)\n", + "\n", + "# x axis values\n", + "x = range(100)\n", + "# corresponding y axis values\n", + "y = v[:100]\n", + " \n", + "# plotting the points \n", + "plt.plot(x, y)\n", + " \n", + "# naming the x axis\n", + "plt.xlabel('Capital')\n", + "# naming the y axis\n", + "plt.ylabel('Value Estimates')\n", + " \n", + "# giving a title to the graph\n", + "plt.title('Final Policy (action stake) vs State (Capital)')\n", + " \n", + "# 
function to show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAGoxJREFUeJzt3Xu8HGV9x/HP13AXQghJMJDEgA0X\naw2XIwWxlIu0SCmJBSkUMW3B9KJyEbWovFpQq9AqUK9tCmKK3CQg1xZJYyKlhUAihIsgCYgQE5MA\nCQEBTeDXP+Y5sBzO7pk9Z2f27M73/Xrta3dm5/KbmWR/53memedRRGBmZtX1pnYHYGZm7eVEYGZW\ncU4EZmYV50RgZlZxTgRmZhXnRGBmVnFOBDZsSTpB0q1DWH+BpJNbGVMT+x5S7H229bik97ZiWzXb\nPEjS8prpByUd1Mp9WOdwIrAhk/RnkhZJel7SSkn/Jek9Q91uRFwWEX9Qs5+Q9FtD3W6rSJqcYnq+\n5rUE3hh7gTF8R9Jv0r6fkTRX0u7NbicifjsiFhQQonUAJwIbEkkfBy4EvgjsAEwCvglMa2dcJRsV\nEVun19Q27P+fImJrYAKwGvhOG2KwDuZEYIMmaVvgc8BHIuLaiPhVRGyIiBsj4pNpmX0l3SFpXSot\nfF3SZjXbCEmnSHpM0lOS/lnSm9J3fy7p9vT5trTKkvTX759K2k7STZLWSFqbPk/IEfeOkl6UNLpm\n3l5p/5tK+i1JP5L0bJp31SDOzaux1xznX0tammL9hiSl794m6YeSnk77u0zSqGb3GREvAJcD70jb\n3VzShZJWpNeFkjavE++r1U+SRkj6jKRHJT0nabGkiSnmr/RZ70ZJpzUbqw0vTgQ2FPsDWwDfb7DM\ny8DpwJi0/KHA3/ZZ5v1AD7A3WUniL/tuJCIOTB+npr+8ryL793sJ8FayksiLwNcHCjoiVgB3AEfX\nzP4zYE5EbAA+D9wKbEf2V/bXBtpmTkcC7wKmAscCf5jmC/gSsCOwBzAROLvZjUvaGjgBuCfN+iyw\nH7Bn2ue+wFk5NvVx4HjgCGAk2fV4AZgNHF+TqMeQXc8rmo3VhhcnAhuK7YGnImJjvQUiYnFE3BkR\nGyPiceDfgN/vs9h5EfFMRDxBVs10fJ6dR8TTEXFNRLwQEc8B/9jPtuu5vHc/6S/z49I8gA1kyWXH\niHgpIm7vfxOveiqVeNZJ+kSD5c6NiHXpOOeT/UATEcsiYm5E/Doi1gDnN3EcAJ+QtA5YBmwN/Hma\nfwLwuYhYnbZ7DnBiju2dDJwVET+NzJJ0ru8CniX78YfsnC2IiFVNxGrDkBOBDcXTwBhJm9RbQNKu\nqcrml5LWk7UljOmz2JM1n39O9pfxgCRtJenfJP08bfs2YJSkETlWnwPsL2lH4EAggP9J332K7K/0\nu9LdNG8oofQxJiJGpdeXGyz3y5rPL5D9aCNpnKQrJf0iHcd3eeM5auTLad9viYijIuLRNH9HsvPZ\nK++5nQg8Wue72cAH0+cPApc2EacNU04ENhR3AC8B0xss8y3gYWBKRIwEPkP2I1trYs3nScCKnPs/\nA9gN+N207d7qo77bf4OIWEdW/XMsWbXQFZG64o2IX0bEhyNiR+CvgG8WfLfSl8gS0TvTcXyQHMeQ\nwwqykk2vvOf2SeBtdb77LjBN0lSyaqzrhhShDQtOBDZoEfEs8PfANyRNT3+hbyrpfZL+KS22DbAe\neD7d1vg3/Wzqk6nhdyJwKlCvcXYVsEvN9DZk7QLrUsPvPzR5CJcDHyJrK+itFkLSB2oandeS/Ui/\n3OS2m7EN8DzZcewEfLJF270COEvS2FSf//dkP+QDuQj4vKQpyrxT0vYAEbEcuJusJHBNRLzYolit\njZwIbEgi4nyyxsWzgDVkf01+lNf+UvwE2V/czwH/Tv8/8tcDi4F7gZuBi+vs7mxgdqqLP5asPWFL\n4CngTuCWJsO/AZgCrIqIJTXz3wUslPR8WubUiPhZk9tuxjlkDeXPkh3/tS3a7heARcB9wP3Aj9O8\ngZwPfI+sxLSe7HpsWfP9bOB3cLVQ15AHprF2khRk1UbL2h2L5SPpQLKSxeSIeKXd8djQuURgZrlJ\n2pSs+u4iJ4Hu4URgZrlI2gNYB4wnq5azLuGqITOzinOJwMys4uo+CDScjBkzJiZPntzuMMzMOsri\nxYufioixAy3XEYlg8uTJLFq0qN1hmJl1FEk/H3gpVw2ZmVWeE4GZWcU5EZiZVZwTgZlZxTkRmJlV\nnBOBmVnFFXr7qKTHyXqdfBnYGBE9qbvgq4DJwOPAsRGxtsg4zMysvjJKBAdHxJ4R0ZOmzwTmRcQU\nYF6aNjOzNmlH1dA0sv7MSe+NRrcyM7OCFZ0IArhV0mJJM9O8HSJiJUB6H9ffipJmSlokadGaNWsK\nDtNs8C6Y+wgXzH2k3WGYDVrRXUwcEBErJI0D5kp6OO+KETELmAXQ09PjLlLNzApSaIkgIlak99XA\n94F9gVWSxgOk99VFxmBmZo0VlggkvVnSNr2fgT8AHiAbA3ZGWmwG2Xi1ZmbWJkVWDe0AfF9S734u\nj4hbJN0NfE/SScATwAcKjMGs5WrbA04/bNc2RmLWGoUlgoh4DJjaz/yngUOL2q+ZmTXHTxabmVWc\nE4GZWcV1xAhlZu3m5wSsm7lEYGZWcU4EZmYV50RgZlZxbiMwq8PtAlYVLhGYmVWcE4GZWcU5EZiZ\nVZzbCMxquF3AqsglAjOzinMiMDOrOCcCM7OKcyIwM6s4JwIzs4pzIjAzqzgnAjOzinMiMDOrOD9Q\nZpXkAejNXuMSgZlZxTkRmJlVnBOBmVnFORGYmVWcE4GZWcU5EZiZVZwTgZlZxfk5Autqfl7AbGAu\nEZiZVZwTgZlZxTkRmJlVnBOBmVnFFZ4IJI2QdI+km9L0zpIWSloq6SpJmxUdg5mZ1VdGieBU4KGa\n6fOACyJiCrAWOKmEGMzMrI5CE4GkCcAfARelaQGHAHPSIrOB6UXGYGZmjRVdIrgQ+BTwSpreHlgX\nERvT9HJgp/5WlDRT0iJJi9asWVNwmGZm1VVYIpB0JLA6IhbXzu5n0ehv/YiYFRE9EdEzduzYQmI0\nM7Ninyw+ADhK0hHAFsBIshLCKEmbpFLBBGBFgTGYmdkACisRRMSnI2JCREwGjgN+GBEnAPOBY9Ji\nM4Dri4rBzMwG1o7nCP4O+LikZWRtBhe3IQYzM0tK6XQuIhYAC9Lnx4B9y9ivmZkNzE8Wm5lVnBOB\nmVnFORFYR7lg7iOvG2PAzIbOicDMrOK
cCMzMKs6JwMys4hrePippC+BI4PeAHYEXgQeAmyPiweLD\nMzOzotVNBJLOBv6Y7P7/hcBqsq4idgXOTUnijIi4r/gwzcysKI1KBHdHxNl1vjtf0jhgUutDMjOz\nMtVNBBFxc+20pDdHxK9qvl9NVkowM7MONmBjsaR3S/oJaZQxSVMlfbPwyMzMrBR57hq6APhD4GmA\niFgCHFhkUGZmVp5ct49GxJN9Zr1cQCxmZtYGeXoffVLSu4GQtBlwCq8fjN7MzDpYnhLBXwMfIRtb\neDmwZ5o2M7MukKdE8EoaWexVknYmtRmYmVlny1MiuFHSyN4JSXsANxYXkpmZlSlPIvgiWTLYWtI+\nwBzgg8WGZWZmZRmwaigibpa0KXArsA0wPSKWFh6ZmZmVolFfQ18DombWSOAx4GOSiIhTig7OzMyK\n16hEsKjP9OIiAzEzs/Zo1NfQ7DIDMTOz9hiwjUDSFOBLwNvJuqEGICJ2KTAuMzMrSZ67hi4BvgVs\nBA4G/gO4tMigzMysPHkSwZYRMQ9QRPw8jVFwSLFhmZlZWfI8WfySpDcBSyV9FPgFMK7YsMzMrCx5\nSgSnAVuRdTa3D9nDZB8qMigzMytPnkQwOSKej4jlEfEXEXE0HqLSzKxr5EkEn845z8zMOlCjJ4vf\nBxwB7CTpqzVfjSS7g8jMzLpAo8biFWRPFx/F658qfg44vcigzMysPI2eLF4CLJF0eURsAJC0HTAx\nItaWFaCZmRUrTxvBXEkjJY0GlgCXSDp/oJUkbSHpLklLJD0o6Zw0f2dJCyUtlXRVGv7SzMzaJE8i\n2DYi1gN/AlwSEfsA782x3q+BQyJiKtnwlodL2g84D7ggIqYAa4GTBhe6mZm1Qp5EsImk8cCxwE15\nNxyZ59PkpukVZE8lz0nzZwPT84drZmatlicRfA74AbAsIu6WtAuQa2AaSSMk3QusBuYCjwLrIqL3\nrqPlwE7Nh21mZq2SZ4Syq4Gra6YfA47Os/GIeBnYU9Io4PvAHv0t1t+6kmYCMwEmTfLza2ZmRalb\nIpB0Vmogrvf9IZKOzLOTiFgHLAD2A0ZJ6k1AE8huU+1vnVkR0RMRPWPHjs2zGzMzG4RGJYL7yQat\nfwn4MbCGbDyCKWSNv/9NNrB9vySNBTZExDpJW5I1MJ8HzAeOAa4EZgDXt+A4zMxskBo9R3A9cH0a\nmOYAYDywHvguMDMiXhxg2+OB2ZJGkJU8vhcRN0n6CXClpC8A9wAXt+A4zMxskPK0ESwlZ+Nwn/Xu\nA/bqZ/5jwL7Nbs/MzIqR564hMzPrYk4EZmYVN2AiaHTnkJmZdb48JYKFkq6WdIQkFR6RmZmVKk8i\n2BWYBZwILJP0RUm7FhuWmZmVZcBEkPoMmhsRxwMnk937f5ekH0nav/AIzcysUAPePippe7IB608E\nVgEfA24ge6jsamDnIgM0M7NiDZgIgDuAS4HpEbG8Zv4iSf9aTFhmZlaWPIlgt4jot2O4iDivxfGY\nmVnJ8jQW35p6DwWy4Sol/aDAmMzMrER5EsHY1HsoAGm84nHFhWRmZmXKkwhelvTqgACS3kqdMQTM\nzKzz5Gkj+Cxwu6QfpekDSQPGmJlZ58vT++gtkvYmG1RGwOkR8VThkZmZWSkajVC2e3rfG5hENpLY\nL4BJaZ6ZmXWBRiWCM4APA1/p57sADikkImurC+Y+8urn0w9zTyLWer3/xvzva/hoNELZh9P7weWF\nY2ZmZaubCCT9SaMVI+La1odjZmZla1Q19McNvgvAicDMrAs0qhr6izIDsfapbRcwK4LbBYa3PCOU\nbSvpfEmL0usrkrYtIzgzMytenieLvw08BxybXuuBS4oMyszMypPnyeK3RcTRNdPnSLq3qIDMzKxc\neUoEL0p6T++EpAOAF4sLyczMypSnRPA3wOzULiDgGbLhKq2DuYHYiuYG4s6Rp6+he4Gpkkam6fWF\nR2VmZqXJc9fQ9pK+CiwA5kv6lzSOsZmZdYE8bQRXAmuAo4Fj0uerigzKzMzKk6eNYHREfL5m+guS\nphcVkJl1LrcLdKY8JYL5ko6T9Kb0Oha4uejAzMysHHkSwV8BlwO/Tq8rgY9Lek6SG47NzDpcnruG\ntikjEDMza488bQTWwTzQjBXN7QKdL0/V0KBImihpvqSHJD0o6dQ0f7SkuZKWpvftiorBzMwGVlgi\nADYCZ0TEHmQD339E0tuBM4F5ETEFmJemzcysTRqNUDa60YoR8cwA368EVqbPz0l6CNgJmAYclBab\nTfag2t/ljtjMzFqqURvBYrKRyNTPdwHskncnkiYDewELgR1SkiAiVkoaV2edmcBMgEmTJuXdleE6\nWzNrTqMRynZuxQ4kbQ1cA5wWEeul/vJKv/ufBcwC6OnpiVbEYmZmb5TrrqHUoDsF2KJ3XkTclmO9\nTcmSwGU1g92vkjQ+lQbGA6ubD9vMzFolT6dzJwO3AT8AzknvZ+dYT8DFwEMRcX7NVzfwWjfWM4Dr\nmwvZzMxaKU+J4FTgXcCdEXGwpN3JEsJADgBOBO6vGdHsM8C5wPcknQQ8AXyg+bDNrEx+HqW75UkE\nL0XES5KQtHlEPCxpt4FWiojb6b+hGeDQpqI0M7PC5EkEyyWNAq4D5kpaC6woNiwzMytLnr6G3p8+\nni1pPrAtcEuhUZmZWWny3jU0AtgB+Fma9Ray+n1rIz8vYGatMGAikPQx4B+AVcAraXYA7ywwLjMz\nK0neu4Z2i4iniw7GzMzKl6fTuSeBZ4sOxMzM2iNPieAxYIGkm8lGKAOgz0NiVhK3C1iR/LxANeVJ\nBE+k12bpZWZmXSTP7aN5niI2M7MO1Wg8ggsj4jRJN5LdJfQ6EXFUoZGZmVkpGpUILk3vXy4jEDMz\na49GiWANQET8qKRYrA43EJtZkRrdPnpd7wdJ15QQi5mZtUGjRFDbc2juYSnNzKyzNEoEUeezmZl1\nkUZtBFMlrScrGWyZPpOmIyJGFh5dhbldwMzK0mjw+hFlBmJmZu2Rp68hMzPrYk4EZmYV50RgZlZx\nTgRmZhXnRGBmVnFOBGZmFZdr8Hoz60weaMbycInAzKzinAjMzCrOicDMrOLcRtAGtf0IuU+hgfkc\nNae2XcAGVu98Venfm0sEZmYV50RgZlZxTgRmZhXnNoKS5Knndl34a3y+mpOnXcDPFLzG5+v1CisR\nSPq2pNWSHqiZN1rSXElL0/t2Re3fzMzyKbJq6DvA4X3mnQnMi4gpwLw0bWZmbVRYIoiI24Bn+sye\nBsxOn2cD04vav5mZ5VN2G8EOEbESICJWShpXb0FJM4GZAJMmTSopvNZyHXZzhnK+qniuW3G+Brt+\nJxrq8xXd/G9s2N41FBGzIqInInrGjh3b7nDMzLpW2YlglaTxAOl9dcn7NzOzPspOBDcAM9LnGcD1\nJe/fzMz6KPL20SuAO4DdJC2XdBJwLnCYpKXAYWnazMzaqLDG4og4vs5Xhxa1z3arYgPcUBXRANfN
\nna75fDWnqGPrtobjYdtYbGZm5XAiMDOrOCcCM7OKc6dzQ9TN9atF6bb61aL5fDWn7P+T3XB9XCIw\nM6s4JwIzs4pzIjAzqzi3EQyC2wWaNxzqUTvpuvl8NWe4xDocrttguERgZlZxTgRmZhXnRGBmVnFu\nI8hpuNRB1jMc6yaHY0zDmc9Xc/x/snVcIjAzqzgnAjOzinMiMDOrOLcRNDDc6yCHo06qF+3Vzuvc\niecL2he3z1cxXCIwM6s4JwIzs4pzIjAzqzi3EfThdoHmeJzm5g33+uLhptvO13A8HpcIzMwqzonA\nzKzinAjMzCrObQRUp12gVcdZlfPVSsOxXng4q8r5Gi7H6RKBmVnFORGYmVWcE4GZWcU5EZiZVVxl\nG4vd4Nkcn6/mDZeGwE5R9fPVzuN3icDMrOKcCMzMKs6JwMys4irVRuB67ub4fDWv6vXczfL56l/Z\n56UtJQJJh0v6qaRlks5sRwxmZpYpPRFIGgF8A3gf8HbgeElvLzsOMzPLtKNEsC+wLCIei4jfAFcC\n09oQh5mZAYqIcncoHQMcHhEnp+kTgd+NiI/2WW4mMDNN7gb8dAi7HQM8NYT1O5GPuRp8zNUw2GN+\na0SMHWihdjQWq595b8hGETELmNWSHUqLIqKnFdvqFD7mavAxV0PRx9yOqqHlwMSa6QnAijbEYWZm\ntCcR3A1MkbSzpM2A44Ab2hCHmZnRhqqhiNgo6aPAD4ARwLcj4sGCd9uSKqYO42OuBh9zNRR6zKU3\nFpuZ2fDiLibMzCrOicDMrOK6PhFUoTsLSRMlzZf0kKQHJZ2a5o+WNFfS0vS+XbtjbSVJIyTdI+mm\nNL2zpIXpeK9KNyN0FUmjJM2R9HC63vtX4Dqfnv5dPyDpCklbdNu1lvRtSaslPVAzr9/rqsxX02/a\nfZL2Hur+uzoRVKg7i43AGRGxB7Af8JF0nGcC8yJiCjAvTXeTU4GHaqbPAy5Ix7sWOKktURXrX4Bb\nImJ3YCrZ8XftdZa0E3AK0BMR7yC7weQ4uu9afwc4vM+8etf1fcCU9JoJfGuoO+/qREBFurOIiJUR\n8eP0+TmyH4edyI51dlpsNjC9PRG2nqQJwB8BF6VpAYcAc9IiXXW8AJJGAgcCFwNExG8iYh1dfJ2T\nTYAtJW0CbAWspMuudUTcBjzTZ3a96zoN+I/I3AmMkjR+KPvv9kSwE/BkzfTyNK9rSZoM7AUsBHaI\niJWQJQtgXPsia7kLgU8Br6Tp7YF1EbExTXfjtd4FWANckqrELpL0Zrr4OkfEL4AvA0+QJYBngcV0\n/7WG+te15b9r3Z4IcnVn0S0kbQ1cA5wWEevbHU9RJB0JrI6IxbWz+1m02671JsDewLciYi/gV3RR\nNVB/Ur34NGBnYEfgzWRVI31127VupOX/1rs9EVSmOwtJm5Ilgcsi4to0e1VvkTG9r25XfC12AHCU\npMfJqvsOISshjErVB9Cd13o5sDwiFqbpOWSJoVuvM8B7gZ9FxJqI2ABcC7yb7r/WUP+6tvx3rdsT\nQSW6s0j14xcDD0XE+TVf3QDMSJ9nANeXHVsRIuLTETEhIiaTXdMfRsQJwHzgmLRY1xxvr4j4JfCk\npN3SrEOBn9Cl1zl5AthP0lbp33nvMXf1tU7qXdcbgA+lu4f2A57trUIatIjo6hdwBPAI8Cjw2XbH\nU9AxvoesaHgfcG96HUFWbz4PWJreR7c71gKO/SDgpvR5F+AuYBlwNbB5u+Mr4Hj3BBala30dsF23\nX2fgHOBh4AHgUmDzbrvWwBVkbSAbyP7iP6nedSWrGvpG+k27n+yOqiHt311MmJlVXLdXDZmZ2QCc\nCMzMKs6JwMys4pwIzMwqzonAzKzinAis0iS9RdKVkh6V9BNJ/ylp10Fs56LeDg0lfSbnOo9LGtPs\nvsxazbePWmWlB5T+D5gdEf+a5u0JbBMR/zOE7T4fEVvnWO5xsnvAnxrsvsxawSUCq7KDgQ29SQAg\nIu4F7pE0T9KPJd0vaRpkHfqlcQBmp37g50jaKn23QFKPpHPJesq8V9Jl6bvrJC1OferPbMNxmjXk\nRGBV9g6yniz7egl4f0TsTZYsvpJKDwC7AbMi4p3AeuBva1eMiDOBFyNiz8i6vQD4y4jYB+gBTpG0\nfQHHYjZoTgRmbyTgi5LuA/6brIvfHdJ3T0bE/6bP3yXr3mMgp0haAtxJ1lnYlBbHazYkmwy8iFnX\nepDXOi6rdQIwFtgnIjakuvwt0nd9G9UaNrJJOoisB839I+IFSQtqtmU2LLhEYFX2Q2BzSR/unSHp\nXcBbycY72CDp4DTda5Kk/dPn44Hb+9nuhtQtOMC2wNqUBHYnG0rUbFhxIrDKiuyWufcDh6XbRx8E\nzgb+E+iRtIisdPBwzWoPATNStdFo+h8vdhZwX2osvgXYJC3/ebLqIbNhxbePmuWUhgG9KbJB1M26\nhksEZmYV5xKBmVnFuURgZlZxTgRmZhXnRGBmVnFOBGZmFedEYGZWcf8PGuEWwOrW2QgAAAAASUVO\nRK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting Capital vs Final Policy\n", + "\n", + "# x axis values\n", + "x = range(100)\n", + "# corresponding y axis values\n", + "y = policy\n", + " \n", + "# plotting the bars\n", + "plt.bar(x, y, align='center', alpha=0.5)\n", + " \n", + "# naming the x axis\n", + "plt.xlabel('Capital')\n", + "# naming the y axis\n", + "plt.ylabel('Final policy (stake)')\n", + " \n", + "# giving a title to the graph\n", + "plt.title('Capital vs Final Policy')\n", + " \n", + "# function to show the plot\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/DP/Gamblers Problem.ipynb b/DP/Gamblers Problem.ipynb new file mode 100644 index 000000000..3479a7b30 --- /dev/null +++ b/DP/Gamblers Problem.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", + "\n", + "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", + "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", + "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", + "or loses by running out of money. \n", + "\n", + "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", + "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", + "\n", + "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", + "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "if \"../\" not in sys.path:\n", + " sys.path.append(\"../\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "\n", + "### Exercise 4.9 (programming)\n", + "\n", + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", + " \"\"\"\n", + " Args:\n", + " p_h: Probability of the coin coming up heads\n", + " \"\"\"\n", + " \n", + " def one_step_lookahead(s, V, rewards):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " s: The gambler’s capital. Integer.\n", + " V: The vector that contains values at each state. \n", + " rewards: The reward vector.\n", + " \n", + " Returns:\n", + " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " \"\"\"\n", + " \n", + " # Implement!\n", + " \n", + " return A\n", + " \n", + " # Implement!\n", + " \n", + " return policy, V" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "policy, v = value_iteration_for_gamblers(0.25)\n", + "\n", + "print(\"Optimized Policy:\")\n", + "print(policy)\n", + "print(\"\")\n", + "\n", + "print(\"Optimized Value Function:\")\n", + "print(v)\n", + "print(\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Plotting Final Policy (action stake) vs State (Capital)\n", + "\n", + "# Implement!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Plotting Capital vs Final Policy\n", + "\n", + "# Implement!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/DP/README.md b/DP/README.md index cf2cbf51f..ae2f6e6c5 100644 --- a/DP/README.md +++ b/DP/README.md @@ -44,3 +44,7 @@ - Implement Value Iteration in Python (Gridworld) - [Exercise](Value%20Iteration.ipynb) - [Solution](Value%20Iteration%20Solution.ipynb) + +- Implement Gambler's Problem + - [Exercise](Gamblers%20Problem.ipynb) + - [Solution](Gamblers%20Problem%20Solution.ipynb) \ No newline at end of file From be7cfe308e9e5b24146ed9dc41d7a68981613c33 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Sun, 27 May 2018 22:33:44 -0700 Subject: [PATCH 36/56] just formatting --- DP/Gamblers Problem Solution.ipynb | 14 +++++++++----- DP/Gamblers Problem.ipynb | 12 ++++++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/DP/Gamblers Problem Solution.ipynb b/DP/Gamblers Problem Solution.ipynb index d3880ef80..4e96a4885 100644 --- a/DP/Gamblers Problem Solution.ipynb +++ b/DP/Gamblers Problem Solution.ipynb @@ -18,7 +18,9 @@ "\n", "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", - "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + "\n", + "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" ] }, { @@ -61,7 +63,8 @@ " Args:\n", " p_h: Probability of the coin coming up heads\n", " \"\"\"\n", - " # The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + " # The reward is zero on all transitions except those on which the gambler reaches his goal,\n", + " # when it is +1.\n", " rewards = np.zeros(101)\n", " rewards[100] = 1 \n", " \n", @@ -78,15 +81,16 @@ " rewards: The reward vector.\n", " \n", " Returns:\n", - " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " A vector containing the expected value of each action. 
\n", + " Its length equals to the number of actions.\n", " \"\"\"\n", " A = np.zeros(101)\n", " stakes = range(1, min(s, 100-s)+1) # Your minimum bet is 1, maximum bet is min(s, 100-s).\n", " for a in stakes:\n", " # rewards[s+a], rewards[s-a] are immediate rewards.\n", " # V[s+a], V[s-a] are values of the next states.\n", - " # This is the core of the Bellman equation: \n", - " # The expected value of your action is the sum of immediate rewards and the value of the next state.\n", + " # This is the core of the Bellman equation: The expected value of your action is \n", + " # the sum of immediate rewards and the value of the next state.\n", " A[a] = p_h * (rewards[s+a] + V[s+a]*discount_factor) + (1-p_h) * (rewards[s-a] + V[s-a]*discount_factor)\n", " return A\n", " \n", diff --git a/DP/Gamblers Problem.ipynb b/DP/Gamblers Problem.ipynb index 3479a7b30..0ed86294d 100644 --- a/DP/Gamblers Problem.ipynb +++ b/DP/Gamblers Problem.ipynb @@ -18,7 +18,9 @@ "\n", "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", - "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n" + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + "\n", + "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" ] }, { @@ -45,12 +47,13 @@ "\n", "### Exercise 4.9 (programming)\n", "\n", - "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55." + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55.\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -72,7 +75,8 @@ " rewards: The reward vector.\n", " \n", " Returns:\n", - " A vector containing the expected value of each action. Its length equals to the number of actions.\n", + " A vector containing the expected value of each action. 
\n", + " Its length equals to the number of actions.\n", " \"\"\"\n", " \n", " # Implement!\n", From 4f0d9428597bc64c68b2d4fc71025a8de48a08d7 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Mon, 28 May 2018 16:46:49 -0700 Subject: [PATCH 37/56] updated the broken link --- DQN/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/README.md b/DQN/README.md index 7d0464727..07c887bbc 100644 --- a/DQN/README.md +++ b/DQN/README.md @@ -23,7 +23,7 @@ **Required:** - [Human-Level Control through Deep Reinforcement Learning](http://www.readcube.com/articles/10.1038/nature14236) -- [Demystifying Deep Reinforcement Learning](https://www.nervanasys.com/demystifying-deep-reinforcement-learning/) +- [Demystifying Deep Reinforcement Learning](https://ai.intel.com/demystifying-deep-reinforcement-learning/) - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) **Optional:** From fe3edfc570aa5d5150f9abfa0a728898b1b503a9 Mon Sep 17 00:00:00 2001 From: Aerin Kim Date: Mon, 28 May 2018 20:15:01 -0700 Subject: [PATCH 38/56] fix #89 --- DQN/Breakout Playground.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQN/Breakout Playground.ipynb b/DQN/Breakout Playground.ipynb index 5ff6a9e99..4e1a48ed8 100644 --- a/DQN/Breakout Playground.ipynb +++ b/DQN/Breakout Playground.ipynb @@ -73,7 +73,7 @@ ], "source": [ "print(\"Action space size: {}\".format(env.action_space.n))\n", - "print(env.get_action_meanings())\n", + "print(env.get_action_meanings()) # env.unwrapped.get_action_meanings() for gym 0.8.0 or later\n", "\n", "observation = env.reset()\n", "print(\"Observation space shape: {}\".format(observation.shape))\n", From 49631ce5b0afdcd23170026563ded145982c02f8 Mon Sep 17 00:00:00 2001 From: Sharwon Pius Date: Fri, 21 Sep 2018 04:06:26 +0530 Subject: [PATCH 39/56] Update README.md Added CS885 Reinforcement Learning course from University of Waterloo. One of the most comprehensive RL courses. 
--- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 72a11e5a9..61fe51bc3 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Classes: - [David Silver's Reinforcement Learning Course (UCL, 2015)](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) - [CS294 - Deep Reinforcement Learning (Berkeley, Fall 2015)](http://rll.berkeley.edu/deeprlcourse/) - [CS 8803 - Reinforcement Learning (Georgia Tech)](https://www.udacity.com/course/reinforcement-learning--ud600) +- [CS885 - Reinforcement Learning (UWaterloo), Spring 2018](https://cs.uwaterloo.ca/~ppoupart/teaching/cs885-spring18/) Talks/Tutorials: From b47c9206b6f35ebabeecaa44ac10ec5fa97ff239 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Mon, 24 Dec 2018 10:56:34 -0800 Subject: [PATCH 40/56] updates to README.md * Added UC Berkley class resources * Added OpenAI spinning up resources --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 61fe51bc3..65c0ec623 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ Classes: - [CS294 - Deep Reinforcement Learning (Berkeley, Fall 2015)](http://rll.berkeley.edu/deeprlcourse/) - [CS 8803 - Reinforcement Learning (Georgia Tech)](https://www.udacity.com/course/reinforcement-learning--ud600) - [CS885 - Reinforcement Learning (UWaterloo), Spring 2018](https://cs.uwaterloo.ca/~ppoupart/teaching/cs885-spring18/) +- [CS294-112 - Deep Reinforcement Learning (UC Berkeley)](http://rail.eecs.berkeley.edu/deeprlcourse/) Talks/Tutorials: @@ -67,6 +68,7 @@ Talks/Tutorials: - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) - [John Schulman - Deep Reinforcement Learning (4 Lectures)](https://www.youtube.com/playlist?list=PLjKEIQlKCTZYN3CYBlj8r58SbNorobqcp) - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) +- [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) Other Projects: From 57f71cd4b97df36988dc1cbc868b45a2970d19d6 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:42:14 -0800 Subject: [PATCH 41/56] imported io so that StringIO() would work --- lib/envs/gridworld.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index ea96ddbb0..4611ae495 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -1,3 +1,4 @@ +import io import numpy as np import sys from gym.envs.toy_text import discrete @@ -86,7 +87,7 @@ def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout grid = np.arange(self.nS).reshape(self.shape) it = np.nditer(grid, flags=['multi_index']) @@ -102,7 +103,7 @@ def _render(self, mode='human', close=False): output = " o " if x == 0: - output = output.lstrip() + output = output.lstrip() if x == self.shape[1] - 1: output = output.rstrip() @@ -111,4 +112,4 @@ def _render(self, mode='human', close=False): if x == self.shape[1] - 1: outfile.write("\n") - it.iternext() \ No newline at end of file + it.iternext() From 9ad2689f9e638e645e6b71d8198f9f733d7142b5 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:42:39 -0800 Subject: [PATCH 42/56] added documentation for _render() --- lib/envs/gridworld.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 4611ae495..72bd92eb7 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -84,6 +84,15 @@ def __init__(self, shape=[4,4]): super(GridworldEnv, self).__init__(nS, nA, P, isd) def _render(self, mode='human', close=False): + """ Renders the current gridworld layout + + For example, a 4x4 grid with the mode="human" looks like: + T o o o + o x o o + o o o o + o o o T + where x is your position and T are the two terminal states. + """ if close: return From 0fe550c4ad64337a0ccef5c244bbe2ad5a51a8d1 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:48:02 -0800 Subject: [PATCH 43/56] documented structure for P[s][a] --- lib/envs/gridworld.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 72bd92eb7..22c5ed538 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -31,6 +31,10 @@ class GridworldEnv(discrete.DiscreteEnv): metadata = {'render.modes': ['human', 'ansi']} def __init__(self, shape=[4,4]): + """ + + + """ if not isinstance(shape, (list, tuple)) or not len(shape) == 2: raise ValueError('shape argument must be a list/tuple of length 2') @@ -50,6 +54,7 @@ def __init__(self, shape=[4,4]): s = it.iterindex y, x = it.multi_index + # P[s][a] = (prob, next_state, reward, done) P[s] = {a : [] for a in range(nA)} is_done = lambda s: s == 0 or s == (nS - 1) From 30b230436c47e194f0209fad29f907146c739610 Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:50:53 -0800 Subject: [PATCH 44/56] removed extra whitespace --- lib/envs/gridworld.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 22c5ed538..c28882eb5 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -31,10 +31,6 @@ class GridworldEnv(discrete.DiscreteEnv): metadata = {'render.modes': ['human', 'ansi']} def __init__(self, shape=[4,4]): - """ - - - """ if not isinstance(shape, (list, tuple)) or not len(shape) == 2: raise ValueError('shape argument must be a list/tuple of length 2') From 01b8b1379a3c4a79ec14528b52b1c6c7608bbbaf Mon Sep 17 00:00:00 2001 From: Jovan Sardinha Date: Tue, 25 Dec 2018 08:51:57 -0800 Subject: [PATCH 45/56] nit --- lib/envs/gridworld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index c28882eb5..5eede9af0 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -50,7 +50,7 @@ def __init__(self, shape=[4,4]): s = it.iterindex y, x = it.multi_index - # P[s][a] = (prob, next_state, reward, done) + # P[s][a] = (prob, next_state, reward, is_done) P[s] = {a : [] for a in range(nA)} is_done = lambda s: s == 0 or s == (nS - 1) From 120fbcfb640afff960f741ae0af56a148955f9dd Mon Sep 17 00:00:00 2001 From: Stas Olekhnovich Date: Wed, 27 Feb 2019 17:33:02 +0100 Subject: [PATCH 46/56] Add link to Advanced Depp Learning & Reinforcement Learning lectures on youtube. Lectures were recorded at UCL in 2018. 
Lecturer for RL is Hado Vab Hasselt from DeepMind --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 65c0ec623..f9f1abe87 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Talks/Tutorials: - [John Schulman - Deep Reinforcement Learning (4 Lectures)](https://www.youtube.com/playlist?list=PLjKEIQlKCTZYN3CYBlj8r58SbNorobqcp) - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) - [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) +- [Advanced Deep Learning & Reinforcement Learning (UCL 2018, DeepMind)](https://www.youtube.com/playlist?list=PLqYmG7hTraZDNJre23vqCGIVpfZ_K2RZs) Other Projects: From 4a2df43bb111ec319ea11cfd68a1d79c7dcceceb Mon Sep 17 00:00:00 2001 From: Aleks K Date: Fri, 1 Mar 2019 13:31:13 +1100 Subject: [PATCH 47/56] fixed shape descriptions for neural network input layer --- DQN/Deep Q Learning Solution.ipynb | 6 +++--- DQN/Deep Q Learning.ipynb | 6 +++--- DQN/Double DQN Solution.ipynb | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/DQN/Deep Q Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index fc88b90ae..90881ea07 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -117,7 +117,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -166,7 +166,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -180,7 +180,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index d3a51697f..2b77605c8 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -110,7 +110,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -160,7 +160,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -174,7 +174,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 3fc45722b..f53ca59a6 100644 
--- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -109,7 +109,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -136,7 +136,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -150,7 +150,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", From a35df152681152ecd11f88d50dded4d9879f06d2 Mon Sep 17 00:00:00 2001 From: Piero Macaluso Date: Wed, 13 Mar 2019 09:55:50 +0100 Subject: [PATCH 48/56] Updated links to new version of Sutton's book --- DP/README.md | 2 +- FA/README.md | 4 ++-- Introduction/README.md | 2 +- MC/README.md | 2 +- MDP/README.md | 2 +- PolicyGradient/README.md | 2 +- README.md | 4 ++-- TD/README.md | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/DP/README.md b/DP/README.md index ae2f6e6c5..a6dabe88c 100644 --- a/DP/README.md +++ b/DP/README.md @@ -28,7 +28,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 4: Dynamic Programming +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 4: Dynamic Programming ### Exercises diff --git a/FA/README.md b/FA/README.md index 247c41e4e..a8456622d 100644 --- a/FA/README.md +++ b/FA/README.md @@ -25,8 +25,8 @@ **Required:** - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 9: On-policy Prediction with Approximation -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 10: On-policy Control with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 9: On-policy Prediction with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 10: On-policy Control with Approximation **Optional:** diff --git a/Introduction/README.md b/Introduction/README.md index cd27a4e12..ca8897826 100644 --- a/Introduction/README.md +++ b/Introduction/README.md @@ -17,7 +17,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 1: The Reinforcement Learning Problem +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 1: The Reinforcement Learning Problem - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) - [OpenAI Gym 
Tutorial](https://gym.openai.com/docs) diff --git a/MC/README.md b/MC/README.md index 7b889ed6f..8f246c38d 100644 --- a/MC/README.md +++ b/MC/README.md @@ -26,7 +26,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 5: Monte Carlo Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 5: Monte Carlo Methods **Optional:** diff --git a/MDP/README.md b/MDP/README.md index de9bcce35..08e73d072 100644 --- a/MDP/README.md +++ b/MDP/README.md @@ -25,7 +25,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 3: Finite Markov Decision Processes +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 3: Finite Markov Decision Processes - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index a7dffdeef..e8e793b77 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,7 +36,7 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 13: Policy Gradient Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 13: Policy Gradient Methods - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) diff --git a/README.md b/README.md index f9f1abe87..82009e229 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. 
@@ -50,7 +50,7 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https Textbooks: -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/bookdraft2018jan1.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) Classes: diff --git a/TD/README.md b/TD/README.md index a4c35a0e9..9b34caecc 100644 --- a/TD/README.md +++ b/TD/README.md @@ -28,14 +28,14 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 6: Temporal-Difference Learning +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 6: Temporal-Difference Learning - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 7: Multi-Step Bootstrapping -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/bookdraft2018jan1.pdf) - Chapter 12: Eligibility Traces +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 7: Multi-Step Bootstrapping +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 12: Eligibility Traces ### Exercises From bb9241dbbbdabb8dd1d6c116f7120e46459a87fd Mon Sep 17 00:00:00 2001 From: Stanislav Olekhnovich Date: Fri, 29 Mar 2019 18:17:22 +0100 Subject: [PATCH 49/56] Fix rendering crash on Win 10 It was crashing on my win10 PC, I found a fix https://github.com/openai/gym/issues/1056 and applied it. --- FA/MountainCar Playground.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FA/MountainCar Playground.ipynb b/FA/MountainCar Playground.ipynb index 9b4fe3a36..914f7a5a0 100644 --- a/FA/MountainCar Playground.ipynb +++ b/FA/MountainCar Playground.ipynb @@ -71,7 +71,7 @@ "plt.figure()\n", "plt.imshow(env.render(mode='rgb_array'))\n", "\n", - "env.render(close=True)" + "env.close()" ] }, { From 1abaae41f6bf751c66d555c04de9b304f8ef8abc Mon Sep 17 00:00:00 2001 From: Michael Anuzis Date: Tue, 2 Apr 2019 17:15:57 -0400 Subject: [PATCH 50/56] Q-Learning docstring improvements. --- TD/Q-Learning Solution.ipynb | 4 ++-- TD/Q-Learning.ipynb | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/TD/Q-Learning Solution.ipynb b/TD/Q-Learning Solution.ipynb index 4c1c5be2c..f2da32351 100644 --- a/TD/Q-Learning Solution.ipynb +++ b/TD/Q-Learning Solution.ipynb @@ -50,7 +50,7 @@ " Args:\n", " Q: A dictionary that maps from state -> action-values.\n", " Each value is a numpy array of length nA (see below)\n", - " epsilon: The probability to select a random action . float between 0 and 1.\n", + " epsilon: The probability to select a random action. Float between 0 and 1.\n", " nA: Number of actions in the environment.\n", " \n", " Returns:\n", @@ -82,7 +82,7 @@ " num_episodes: Number of episodes to run for.\n", " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", - " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", + " epsilon: Chance to sample a random action. Float between 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, episode_lengths).\n", diff --git a/TD/Q-Learning.ipynb b/TD/Q-Learning.ipynb index 4e1396cf6..ddd33c756 100644 --- a/TD/Q-Learning.ipynb +++ b/TD/Q-Learning.ipynb @@ -49,7 +49,7 @@ " Args:\n", " Q: A dictionary that maps from state -> action-values.\n", " Each value is a numpy array of length nA (see below)\n", - " epsilon: The probability to select a random action . float between 0 and 1.\n", + " epsilon: The probability to select a random action. Float between 0 and 1.\n", " nA: Number of actions in the environment.\n", " \n", " Returns:\n", @@ -81,7 +81,7 @@ " num_episodes: Number of episodes to run for.\n", " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", - " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", + " epsilon: Chance to sample a random action. Float between 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, episode_lengths).\n", From b2d179a1fe2fc8ee5b01e9f9b5ecadaf9139ada7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=89=93=E4=B8=8D=E5=8A=A8=EF=BC=81?= <779222056@qq.com> Date: Tue, 11 Jun 2019 15:58:21 +0800 Subject: [PATCH 51/56] Update CliffWalk REINFORCE with Baseline Solution.ipynb --- PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb index cad46261d..fb7707846 100644 --- a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb +++ b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb @@ -161,7 +161,7 @@ " Transition = collections.namedtuple(\"Transition\", [\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n", " \n", " for i_episode in range(num_episodes):\n", - " # Reset the environment and pick the fisrst action\n", + " # Reset the environment and pick the first action\n", " state = env.reset()\n", " \n", " episode = []\n", From 775fd81e82fa900f87d35309f937d1102ed9fc57 Mon Sep 17 00:00:00 2001 From: nsydn Date: Tue, 1 Oct 2019 18:04:03 +0300 Subject: [PATCH 52/56] Update Policy Iteration Solution.ipynb --- DP/Policy Iteration Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index dc121c8c5..076894169 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -82,7 +82,7 @@ " until an optimal policy is found.\n", " \n", " Args:\n", - " env: The OpenAI envrionment.\n", + " env: The OpenAI environment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", " discount_factor: gamma discount factor.\n", From 7d232607e63743c7c1b9ff912f4e0084bbf3e616 Mon Sep 17 00:00:00 2001 From: nsydn Date: Tue, 1 Oct 2019 18:13:40 +0300 Subject: [PATCH 53/56] Update Policy Iteration Solution.ipynb --- DP/Policy Iteration Solution.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index 076894169..73009f000 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -124,7 +124,7 @@ " \n", " # For each state...\n", " for s in range(env.nS):\n", - " # The best action we would take under the currect policy\n", + " # The best action we would take under 
the current policy\n", " chosen_a = np.argmax(policy[s])\n", " \n", " # Find the best action by one-step lookahead\n", From 1298c8ddd60331ab7457bb7fa6fbc42ebecf8f0c Mon Sep 17 00:00:00 2001 From: Roshan Ray Date: Fri, 8 Nov 2019 08:56:53 +0530 Subject: [PATCH 54/56] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 82009e229..8a89bd765 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ Talks/Tutorials: - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) - [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) - [Advanced Deep Learning & Reinforcement Learning (UCL 2018, DeepMind)](https://www.youtube.com/playlist?list=PLqYmG7hTraZDNJre23vqCGIVpfZ_K2RZs) +-[Deep RL Bootcamp](https://sites.google.com/view/deep-rl-bootcamp/lectures) Other Projects: From 40eda6b1ca56cb41d99b1eae70a74774d016fa5f Mon Sep 17 00:00:00 2001 From: "Ariel S. Boiardi" Date: Mon, 19 Sep 2022 19:19:46 +0200 Subject: [PATCH 55/56] Compatible with gym==0.26 --- lib/envs/cliff_walking.py | 9 +++++-- lib/envs/discrete.py | 51 +++++++++++++++++++++++++++++++++++++ lib/envs/gridworld.py | 5 +++- lib/envs/windy_gridworld.py | 8 ++++-- 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 lib/envs/discrete.py diff --git a/lib/envs/cliff_walking.py b/lib/envs/cliff_walking.py index 30b2ff7bb..bbae6c80d 100644 --- a/lib/envs/cliff_walking.py +++ b/lib/envs/cliff_walking.py @@ -1,6 +1,11 @@ +import io import numpy as np import sys -from gym.envs.toy_text import discrete +if "../.." not in sys.path: + sys.path.append("../..") + +from lib.envs import discrete + UP = 0 @@ -60,7 +65,7 @@ def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) diff --git a/lib/envs/discrete.py b/lib/envs/discrete.py new file mode 100644 index 000000000..64455fc00 --- /dev/null +++ b/lib/envs/discrete.py @@ -0,0 +1,51 @@ +import numpy as np + +from gym import Env, spaces +from gym.utils import seeding +from gym.envs.toy_text.utils import categorical_sample + +class DiscreteEnv(Env): + + """ + Has the following members + - nS: number of states + - nA: number of actions + - P: transitions (*) + - isd: initial state distribution (**) + + (*) dictionary of lists, where + P[s][a] == [(probability, nextstate, reward, done), ...] 
+ (**) list or array of length nS + + + """ + + def __init__(self, nS, nA, P, isd): + self.P = P + self.isd = isd + self.lastaction = None # for rendering + self.nS = nS + self.nA = nA + + self.action_space = spaces.Discrete(self.nA) + self.observation_space = spaces.Discrete(self.nS) + + self.seed() + self.s = categorical_sample(self.isd, self.np_random) + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def reset(self): + self.s = categorical_sample(self.isd, self.np_random) + self.lastaction = None + return int(self.s) + + def step(self, a): + transitions = self.P[self.s][a] + i = categorical_sample([t[0] for t in transitions], self.np_random) + p, s, r, d = transitions[i] + self.s = s + self.lastaction = a + return (int(s), r, d, {"prob": p}) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 5eede9af0..6c559f918 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -1,7 +1,10 @@ import io import numpy as np import sys -from gym.envs.toy_text import discrete +if "../.." not in sys.path: + sys.path.append("../..") + +from lib.envs import discrete UP = 0 RIGHT = 1 diff --git a/lib/envs/windy_gridworld.py b/lib/envs/windy_gridworld.py index 720c5974b..4b307decd 100644 --- a/lib/envs/windy_gridworld.py +++ b/lib/envs/windy_gridworld.py @@ -1,7 +1,11 @@ +import io import gym import numpy as np import sys -from gym.envs.toy_text import discrete +if "../.." not in sys.path: + sys.path.append("../..") + +from lib.envs import discrete UP = 0 RIGHT = 1 @@ -60,7 +64,7 @@ def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) From d173521920759490516fe0738955f491fd373b71 Mon Sep 17 00:00:00 2001 From: "Ariel S. Boiardi" Date: Tue, 20 Sep 2022 10:46:18 +0200 Subject: [PATCH 56/56] Corrected import --- lib/envs/cliff_walking.py | 6 +----- lib/envs/gridworld.py | 4 +--- lib/envs/windy_gridworld.py | 4 +--- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/lib/envs/cliff_walking.py b/lib/envs/cliff_walking.py index bbae6c80d..2c677662c 100644 --- a/lib/envs/cliff_walking.py +++ b/lib/envs/cliff_walking.py @@ -1,12 +1,8 @@ import io import numpy as np import sys -if "../.." not in sys.path: - sys.path.append("../..") - -from lib.envs import discrete - +from . import discrete UP = 0 RIGHT = 1 diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index 6c559f918..64a5be602 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -1,10 +1,8 @@ import io import numpy as np import sys -if "../.." not in sys.path: - sys.path.append("../..") -from lib.envs import discrete +from . import discrete UP = 0 RIGHT = 1 diff --git a/lib/envs/windy_gridworld.py b/lib/envs/windy_gridworld.py index 4b307decd..6ac49cab3 100644 --- a/lib/envs/windy_gridworld.py +++ b/lib/envs/windy_gridworld.py @@ -2,10 +2,8 @@ import gym import numpy as np import sys -if "../.." not in sys.path: - sys.path.append("../..") -from lib.envs import discrete +from . import discrete UP = 0 RIGHT = 1
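
Note on the gym==0.26 compatibility patches above: the toy-text environments now subclass the vendored DiscreteEnv defined in lib/envs/discrete.py (pulled in via `from . import discrete`) rather than the discrete module that used to ship under gym.envs.toy_text. The sketch below is a minimal, illustrative exercise of that vendored class and is not part of any patch; the TwoStateEnv name and its transition table are invented for the example, and the import assumes the repository root is on sys.path.

    # Illustrative only -- a hypothetical two-state MDP built on the vendored
    # DiscreteEnv, using nothing beyond the interface shown in lib/envs/discrete.py
    # (nS, nA, P, isd; reset(); a 4-tuple step()).
    import numpy as np

    from lib.envs.discrete import DiscreteEnv  # assumes the repo root is on sys.path


    class TwoStateEnv(DiscreteEnv):
        """Toy MDP: from state 0, action 1 reaches terminal state 1 with reward 1."""

        def __init__(self):
            nS, nA = 2, 2
            # P[s][a] == [(probability, next_state, reward, done), ...]
            P = {
                0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, True)]},
                1: {0: [(1.0, 1, 0.0, True)], 1: [(1.0, 1, 0.0, True)]},
            }
            isd = np.array([1.0, 0.0])  # always start in state 0
            super().__init__(nS, nA, P, isd)


    env = TwoStateEnv()
    state = env.reset()                            # returns a plain int, not (obs, info)
    next_state, reward, done, info = env.step(1)   # old-style 4-tuple step
    print(state, next_state, reward, done, info["prob"])

One design note: because the vendored class preserves the pre-0.26 reset()/step() signatures, the notebooks that consume these environments should not need changes beyond the import moves made in the last two patches.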