diff --git a/DP/Gamblers Problem Solution.ipynb b/DP/Gamblers Problem Solution.ipynb new file mode 100644 index 000000000..4e96a4885 --- /dev/null +++ b/DP/Gamblers Problem Solution.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", + "\n", + "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", + "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", + "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", + "or loses by running out of money. \n", + "\n", + "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", + "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", + "\n", + "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", + "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + "\n", + "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "if \"../\" not in sys.path:\n", + " sys.path.append(\"../\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "\n", + "### Exercise 4.9 (programming)\n", + "\n", + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", + " \"\"\"\n", + " Args:\n", + " p_h: Probability of the coin coming up heads\n", + " \"\"\"\n", + " # The reward is zero on all transitions except those on which the gambler reaches his goal,\n", + " # when it is +1.\n", + " rewards = np.zeros(101)\n", + " rewards[100] = 1 \n", + " \n", + " # We introduce two dummy states corresponding to termination with capital of 0 and 100\n", + " V = np.zeros(101)\n", + " \n", + " def one_step_lookahead(s, V, rewards):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " s: The gambler’s capital. Integer.\n", + " V: The vector that contains values at each state. \n", + " rewards: The reward vector.\n", + " \n", + " Returns:\n", + " A vector containing the expected value of each action. 
\n", + " Its length equals to the number of actions.\n", + " \"\"\"\n", + " A = np.zeros(101)\n", + " stakes = range(1, min(s, 100-s)+1) # Your minimum bet is 1, maximum bet is min(s, 100-s).\n", + " for a in stakes:\n", + " # rewards[s+a], rewards[s-a] are immediate rewards.\n", + " # V[s+a], V[s-a] are values of the next states.\n", + " # This is the core of the Bellman equation: The expected value of your action is \n", + " # the sum of immediate rewards and the value of the next state.\n", + " A[a] = p_h * (rewards[s+a] + V[s+a]*discount_factor) + (1-p_h) * (rewards[s-a] + V[s-a]*discount_factor)\n", + " return A\n", + " \n", + " while True:\n", + " # Stopping condition\n", + " delta = 0\n", + " # Update each state...\n", + " for s in range(1, 100):\n", + " # Do a one-step lookahead to find the best action\n", + " A = one_step_lookahead(s, V, rewards)\n", + " # print(s,A,V) # if you want to debug.\n", + " best_action_value = np.max(A)\n", + " # Calculate delta across all states seen so far\n", + " delta = max(delta, np.abs(best_action_value - V[s]))\n", + " # Update the value function. Ref: Sutton book eq. 4.10. \n", + " V[s] = best_action_value \n", + " # Check if we can stop \n", + " if delta < theta:\n", + " break\n", + " \n", + " # Create a deterministic policy using the optimal value function\n", + " policy = np.zeros(100)\n", + " for s in range(1, 100):\n", + " # One step lookahead to find the best action for this state\n", + " A = one_step_lookahead(s, V, rewards)\n", + " best_action = np.argmax(A)\n", + " # Always take the best action\n", + " policy[s] = best_action\n", + " \n", + " return policy, V" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimized Policy:\n", + "[ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 12. 11. 15. 16. 17.\n", + " 18. 6. 20. 21. 3. 23. 24. 25. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.\n", + " 11. 12. 38. 11. 10. 9. 42. 7. 44. 5. 46. 47. 48. 49. 50. 1. 2. 3.\n", + " 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 11. 10. 9. 17. 7. 19. 5. 21.\n", + " 22. 23. 24. 25. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 12. 11.\n", + " 10. 9. 8. 7. 6. 5. 4. 3. 2. 
1.]\n", + "\n", + "Optimized Value Function:\n", + "[0.00000000e+00 7.24792480e-05 2.89916992e-04 6.95257448e-04\n", + " 1.16010383e-03 1.76906586e-03 2.78102979e-03 4.03504074e-03\n", + " 4.66214120e-03 5.59997559e-03 7.08471239e-03 9.03964043e-03\n", + " 1.11241192e-02 1.56793594e-02 1.61464431e-02 1.69517994e-02\n", + " 1.86512806e-02 1.98249817e-02 2.24047303e-02 2.73845196e-02\n", + " 2.83388495e-02 3.04937363e-02 3.61633897e-02 3.84953022e-02\n", + " 4.44964767e-02 6.25000000e-02 6.27174377e-02 6.33700779e-02\n", + " 6.45857723e-02 6.59966059e-02 6.78135343e-02 7.08430894e-02\n", + " 7.46098323e-02 7.64884604e-02 7.93035477e-02 8.37541372e-02\n", + " 8.96225423e-02 9.58723575e-02 1.09538078e-01 1.10939329e-01\n", + " 1.13360151e-01 1.18457374e-01 1.21977661e-01 1.29716907e-01\n", + " 1.44653559e-01 1.47520113e-01 1.53983246e-01 1.70990169e-01\n", + " 1.77987434e-01 1.95990576e-01 2.50000000e-01 2.50217438e-01\n", + " 2.50870078e-01 2.52085772e-01 2.53496606e-01 2.55313534e-01\n", + " 2.58343089e-01 2.62109832e-01 2.63988460e-01 2.66803548e-01\n", + " 2.71254137e-01 2.77122542e-01 2.83372357e-01 2.97038078e-01\n", + " 2.98439329e-01 3.00860151e-01 3.05957374e-01 3.09477661e-01\n", + " 3.17216907e-01 3.32153559e-01 3.35020113e-01 3.41483246e-01\n", + " 3.58490169e-01 3.65487434e-01 3.83490576e-01 4.37500000e-01\n", + " 4.38152558e-01 4.40122454e-01 4.43757317e-01 4.47991345e-01\n", + " 4.53440603e-01 4.62529268e-01 4.73829497e-01 4.79468031e-01\n", + " 4.87912680e-01 5.01265085e-01 5.18867627e-01 5.37617932e-01\n", + " 5.78614419e-01 5.82817988e-01 5.90080452e-01 6.05372123e-01\n", + " 6.15934510e-01 6.39150720e-01 6.83960814e-01 6.92560339e-01\n", + " 7.11950883e-01 7.62970611e-01 7.83963162e-01 8.37972371e-01\n", + " 0.00000000e+00]\n", + "\n" + ] + } + ], + "source": [ + "policy, v = value_iteration_for_gamblers(0.25)\n", + "\n", + "print(\"Optimized Policy:\")\n", + "print(policy)\n", + "print(\"\")\n", + "\n", + "print(\"Optimized Value Function:\")\n", + "print(v)\n", + "print(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Show your results graphically, as in Figure 4.3.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3Xd8HNW5//HPI8mqlmRky7jjbmMb\nQhGmJKGH0OEmJIFAQkng5hJCCKSQhCSENNJuknshxaH3UPIjhksghNCbLReMC26Si1xlSbZlyerP\n748ZKWtZZW1rtCrf9+u1L+3Mnp15zs5qnznnTDF3R0REBCAp0QGIiEjPoaQgIiItlBRERKSFkoKI\niLRQUhARkRZKCiIi0kJJIUJmNsbMdplZchcs6z4z+3FXxNVquW5mE8PnfzSz70Wwjv80s99GsNxL\nzewfXb3crrS/283MDjezt6KIqaczszPM7OluWtd3zOyuA3j/GjM7PXx+vZnd3nXRJYaSQhcIvxi7\nwwTQ/Bjh7uvcfaC7N0a8/ivMrDFc704zW2hm5+7rctz9S+7+oy6OLRW4BfjlAS5nbJjAUprnufvD\n7n7Ggca4j3G0/AhEyd0XAdvN7Lwo12NmqWb2azMrCb8/xWb2m5jX96m+XbTz8lOg5cfVAteb2WIz\nqwpjfcLMDjvA9eDuP3X3L4br2es7to9mAZeZ2dADjSuRlBS6znlhAmh+bOzm9b/t7gOBQcDdwONm\nltfNMbTlAuADd9+Q6EB6oYeB/4x4Hd8GCoCZQDZwCrAg4nW2y8yOAXLd/Z2Y2b8DvgpcD+QBk4Gn\ngXO6P8L2uXsN8Hfg84mO5UAoKUSo9Z6Hmb1iZj8yszfNrNLM/mFmQ2LKP2Fmm81sh5m9ZmbT93Wd\n7t4E3ANkAOPD5V5tZqvMrNzMZpvZiHbi3WMvz8wuCFsdO81stZmdaWafMrN5rd53UwfN/bOAV1uV\nb7eeZpYR7rmuDV9/w8wygNfCItvDPdrjwxbSGzHvPcHM5obvm2tmJ8S81uFn3yq+IWb2rJltDz+z\n180sycweBMYAz4QxfLOz+rRabraZvWxm/xPu/aaZ2a/MbJ2ZbbGg+y4j5i2vAKeZWVoby7rYzApb\nzfuamc0On59tZkvDum4ws6+3FRNwDPD/3H2jB9a4+wPhMvapvmZ2DXAp8M2w/DPh/BFm9pSZlVrQ\nErm+nVig1ffFzCYBXwYucfd/uXutu1eHrcTbwzLnmNmC8Hu63sxujXl/8//gNWa20cw2mdlNMa/f\namYPhZNtfccmmNm/zKzMzLaZ2cNmNqiD+F+hhyWrfebuehzgA1gDnN7G/LGAAynh9CvAaoI9nYxw\n+vaY8lcR7K2lAb8FFsa8dh/w43bWfwXwRvg8hWCvqhLIBU4FtgFHhcv9X+C1mPc6MLH1Ogj2HHcA\nHyPYeRgJTA2XUQ4cGrOMBcAn24ltLvCpVvM6qued4ecyEkgGTgjL7fFZtlHvPKAC+Fz4GVwSTg+O\n57NvFd/PgD8CA8LHRwFrb1vHs92AwcCc2G0Ylp0dxp4NPAP8rNWydwKHtxFjZriNJ7X6rC8On28C\nPho+Pwg4qp263gKsA64FDmuuZ0ff7XjqGzOdBMwDvg+kEuyoFAEfbyeeJ4BvxEx/CVjbyf/fyWHs\nScDhwBbgwlb/g48CWWG50uY6AbcCD7X1/xrOm0jwP5AG5BMkjt+29/kQ/J+VJ/L36EAfCQ+gLzzC\nL8YuYHv4eDqcv8eXjOCH6JaY910LPN/OMgeF780Np/f4Z2tV9gqgIVz3NuCdmC/93cAvYsoOBOqB\nseF0e0nhT8Bv2lnfH4CfhM+nE/z4prVTdiVwZgefXUs9w3/q3cCH2ijX1j/sFfw7KXwOmNPqPW8D\nV+zHZ38b8Lfmz6WNbb3XDkAn2+0eYDF7/tgZUAVMiJl3PFDcankbgBPbWddDwPfD55MIkkRmOL2O\noOspp5PvbjLBnvibQC2wEbj8AOsbmxSOBda1es+3gXvbWd6LwJdipr8LvLOP/4+/bf7uxnxvpsa8\n/gvg7vD5rXSQFNpY9oXAgvY+n3A7NO5LvD3toe6jrnOhuw8KHxd2UG5zzPNqgh9pzCzZzG4Pu2l2\nEnzZANrs4mjDO+G6h7j7ce7+z3D+CGBtcyF33wWUEeyJd2Q0wZ51W+4HPmtmRvBj/Li717ZTtoJg\nrxLotJ5DgPQO1tuRPeoZWsue9Wzzs2/DL4FVwD/MrMjMbm5vpXFut3MIWid/jJmXT7C3Py/sptoO\nPB/Oj5VNkOzb8ghBiwjgswQ7I9Xh9CeBs4G1ZvaqmR3f1gLcvdHd73T3DxP8wP8EuMfMDj2A+sY6\nBBjRXMewnt8BDm6n/B7fF4Lv6vB2yjbHdGzYLVdqZjsIWhet41kf83wtwfelU2Y21MweC7vgdhIk\n4o7+J7MJWti9lpJCz/FZgkHZ0wn2mseG8+0Al7uR4B8zWJhZFkFXRmcDv+uBCW294MEgYB1Bt8pn\ngQc7WM4igi6bZh3VcxtQ0856O7uc7x71DI2h83ruvSL3Sne/yd3HA+cBN5rZae3EEc92+zPBD/5z\n4ecPQV13A9NjdiZyPThYIFhAMPaTCixvJ9R/AEPM7AiC5PBITB3muvsFwFCCQdnH46j3bne/k+CH\nedp+1rd1+fUErZ9BMY9sdz+7nTBaf19eAkaZWUEHoT9C0A032t1zCZJv6/+b0THPxxB8X1pr6zv2\ns3D+4e6eA1zWxrJjHQq818HrPZ6SQs+RTdB8LyPYg/xpFy33EeBKMzsiHLD8KfCuu6/p5H13h+87\nLRxkHWlmU2NefwC4A2hw9zfaXgQAzwEnxUy3W0//9yD5f4eDk8nhYF8aQT9wE+HgeTvrmWxmnzWz\nFDP7DMEP27Od1HMvZnaumU0MW0I7gcbwAUF/dWwM8W636wh+3J81s4ywrn8GfmPhIYzhZ/zxmPec\nDPyrvVaYuzcATxK0bPIIul6aDzO91Mxy3b0+pg5t1fUGMzvZggH+FDO7PKxT8xFI+1rf1uXnADvN\n7FvhOpLNbIYFRxm1ZY/vi7uvBH4PPBrGmWpm6RYMtDe34LIJ+vFrzGwmQeJq7XtmlhkOil8J/KWN\nMm19x7IJu4bNbCTwjXbibnYSwRFIvZaSQs/xAEGzdgOwlGBc4IC5+0vA94CnCAYfJwAXx/G+OQT/\nPL8haA6/yp574g8CM+i4lQDB4OlU+/cRT53V8+vA+wSDpuXAz4GksFvkJ8CbYTfEca3iLQPOBW4i\n+MH6JnCuu2/rrK5tmAT8k+DH4G3g9+7+Svjaz4Bbwhi+Hkd9muNz4BqCPee/mVk68C2Cbqp3wq6J\nfwJTYt52KXt2ObXlEYK99ifCJNHsc8CacLlfItjDbctu4NcEXWvbCMYXPunuRftZ37uBaWH5pz04\nR+c84AigOFzHXQStjL24+3xgh5kdGzP7eoIdkDsJutJW
A/9B8N2CYHzoNjOrJBjQbqtV9CrBZ/0S\n8Ct33+ukx3a+Yz8kGDzeAfwf8Ne24gYIt+nZBN2rvVbzERUi+8SCQye3EhzVsrKTstcA09z9hm4J\nrg+w4MSsWe7e5lhAX2ZmZwDXdjI2F++yxhIkowGtkmaXM7OvEHRhfTPK9URNSUH2i5ndSLAnfmqi\nYxFpT3cmhb5if0/nln7MzNYQDLYd8J6ciPQsaimIiEgLDTSLiEiLXtd9NGTIEB87dmyiwxAR6VXm\nzZu3zd1bnxy5l16XFMaOHUthYWHnBUVEpIWZtT7jv03qPhIRkRZKCiIi0kJJQUREWigpiIhICyUF\nERFpoaQgIiItlBRERKSFkoKISA/X1OT89LllLCpp7yZ8XUdJQUSkh1uxtZJZrxWxcsuuyNelpCAi\n0sPNKS4HYOa4vMjXpaQgItLDvVtczojcdEYdlBH5upQURER6MHdnTnE5M8flEdw2PFpKCiIiPdja\nsmpKK2s5phu6jkBJQUSkR2seTzi2LyQFMzvTzJab2Sozu7mN18eY2ctmtsDMFpnZ2VHGIyLS27xb\nXE5eVioT8gd2y/oiSwpmlgzcCZwFTAMuMbNprYrdAjzu7kcCFwO/jyoeEZHeaM6aMmaO7Z7xBIi2\npTATWOXuRe5eBzwGXNCqjAM54fNcYGOE8YiI9CqbduxmffnubjkUtVmUSWEksD5muiScF+tW4DIz\nKwGeA77S1oLM7BozKzSzwtLS0ihiFRHpcbrz/IRmUSaFtto63mr6EuA+dx8FnA08aGZ7xeTus9y9\nwN0L8vM7vcWoiEifMKe4nIFpKRw6PKfzwl0kyqRQAoyOmR7F3t1DXwAeB3D3t4F0YEiEMYmI9Bpz\nisspGHsQyUndM54A0SaFucAkMxtnZqkEA8mzW5VZB5wGYGaHEiQF9Q+JSL+3dWcNK7fu6tauI4gw\nKbh7A3Ad8AKwjOAooyVmdpuZnR8Wuwm42szeAx4FrnD31l1MIiL9zs+fX05KkvHx6cO6db0pUS7c\n3Z8jGECOnff9mOdLgQ9HGYOISG/z9uoynppfwrUnT+i28xOa6YxmEZEepLahke8+/T6j8zL4yqmT\nun39kbYURERk3/zp1SKKSqu478pjyEhN7vb1q6UgItJDbNy+mzteXsU5hw3n5ClDExKDkoKISA/x\n0rIt1DU0cdMZkxMWg5KCiEgP8XZRGSNy0xk3JCthMSgpiIj0AE1NzturyzhuwuBuu/hdW5QURER6\ngOVbKqmorueECYm9qIOSgohID/D26jIAjp8wOKFxKCmIiPQAb60u45DBmYwclJHQOJQUREQSrLHJ\nebe4jOPHJ7aVAEoKIiIJt2TjDiprGhLedQRKCiIiCdcynqCWgoiIvLW6jAn5WQzNSU90KEoKIiKJ\nVN/YxNw15Qk/FLWZkoKISALNW1tBdV1jjxhPACUFEZGEqWto4rZnljJkYBofmdQzWgq6dLaISILc\n+fIqlm7ayazPHU1O+oBEhwOopSAikhCLN+zgzpdX8R9HjuSMbr7lZkeUFEREulltQyNff+I98rJS\n+cF50xIdzh7UfSQi0o1Wba3kG08u4oPNldx9eQGDMlMTHdIelBRERLpBQ2MTs14v4rf/XElmajK/\nu/gITjv04ESHtRclBRGRbnDfW2v4xfPLOWvGMG67YAb52WmJDqlNSgoiIt3g+cWbmTEyhz9cdnSi\nQ+mQBppFRCJWUVXH/HUVnDq153UXtaakICISsddWltLkcMqU/ESH0iklBRGRiL38wVYGZ6XyoVGD\nEh1Kp5QUREQi1NjkvLqilJMm55OUZIkOp1NKCiIiEVq4fjsV1fWcMnVookOJi5KCiEiEXlm+leQk\n48RJPX88AZQUREQi9a8PtnL0mIPIzewZF7zrjJKCiEhEtuysYcnGnZw8tXe0EkBJQUQkMq8s3wrA\nqb1kPAGUFEREIlHf2MSs14oYn5/FlIOzEx1O3JQUREQi8NicdawureLmM6di1vMPRW2mpCAi0sV2\n1tTzm3+u5LjxeXxsWs+/tEUsJQURkS5258urqKiu45ZzpvWqVgIoKYiIdKn15dXc+8YaPnHkKGaM\nzE10OPtMSUFEpIts3L6bLz00j6Qk+MbHpyQ6nP0SaVIwszPNbLmZrTKzm9sp82kzW2pmS8zskSjj\nERGJytw15Zx/xxusLavm95cexbDc9ESHtF8iu8mOmSUDdwIfA0qAuWY2292XxpSZBHwb+LC7V5hZ\n7zmYV0Qk9Nf5JXzrqUWMOiiTx645molDe88hqK1Feee1mcAqdy8CMLPHgAuApTFlrgbudPcKAHff\nGmE8IiJd7tUVpXzjyUUcOy6PP1x2NLkZveNyFu2JsvtoJLA+ZroknBdrMjDZzN40s3fM7My2FmRm\n15hZoZkVlpaWRhSuiMi+Wb65kusens+koQOZ9fmCXp8QINqk0NZxWN5qOgWYBJwMXALcZWZ73YXC\n3We5e4G7F+Tn955riIhI31VaWctV980lPTWZe644hoFpfeOW91EmhRJgdMz0KGBjG2X+5u717l4M\nLCdIEiIiPZK78/zizVz0x7coq6rl7ssLGDEoI9FhdZkok8JcYJKZjTOzVOBiYHarMk8DpwCY2RCC\n7qSiCGMSEdkvTU3Ou0VlfGbWO3zpoXkMSE7ivitncngvuMXmvtin9o6ZHQSMdvdFnZV19wYzuw54\nAUgG7nH3JWZ2G1Do7rPD184ws6VAI/ANdy/b51qIiERk6cadPDZ3HS8s2cyWnbUMzkrlxxfO4OJj\nRpOS3PdO9TL31t38rQqYvQKcT5BAFgKlwKvufmPk0bWhoKDACwsLE7FqEelnauobmfmTf1LX2MTJ\nk4dy5oxhnD7t4F45fmBm89y9oLNy8dQs1913mtkXgXvd/Qdm1mlLQUSkt3tr9TZ21jRw75XHcMqU\n/nEaVTxtnxQzGw58Gng24nhERHqMFxZvITsthRMmDE50KN0mnqRwG0Hf/2p3n2tm44GV0YYlIpJY\nDY1NvLhsC6dMHUpaSnKiw+k2nXYfufsTwBMx00XAJ6MMSkQk0QrXVlBeVcfHpw9LdCjdqtOWgplN\nNrOXzGxxOH24md0SfWgiIonz/OLNpKYkcfKU/nXCbDzdR38muGhdPUB4OOrFUQYlIpJI7s4/lmzm\nxElDyOqFRxodiHiSQqa7z2k1ryGKYEREeoL3N+xg444azuhnXUcQX1LYZmYTCK9bZGYXAZsijUpE\nJIFeWLKZ5CTj9EN71/2Vu0I87aIvA7OAqWa2ASgGLo00KhGRBGlqcv6+eDMzx+aRl5Wa6HC6XTxJ\nwd39dDPLApLcvdLMxkUdmIhIItz31hqKSqv4yqkTEx1KQsTTffQUgLtXuXtlOO/J6EISEUmMlVsq\nuf35Dzht6lAuPKL17V/6h3ZbCmY2FZgO5JrZJ2JeygF6581HRUTaUdfQxNceX8jAtBRu/+ThmLV1\nS5i+r6Puoyn
AucAg4LyY+ZUEt9EUEekz/uellSzesJM/fe5o8rPTEh1OwrSbFNz9b8DfzOx4d3+7\nG2MSEelW7xSV8ftXVnHR0aP63RnMrcUz0LzAzL5M0JXU0m3k7ldFFpWISDcp21XLVx9bwCGDs7j1\n/OmJDifh4hlofhAYBnwceJXgtpqVHb5DRKQXaGpybnriPSqq67njs0f2yvskdLV4ksJEd/8eUOXu\n9wPnAIdFG5aISPTueqOIV5aX8r1zDmX6iNxEh9MjxJMU6sO/281sBpALjI0sIhGRbvCvD7bwi+eX\nc9aMYVx23CGJDqfHiKetNCu8N/P3gNnAQOD7kUYlIhKhV1eU8qUH5zNtRA4/v6j/Hn7alnjup3BX\n+PRVYHy04YiIROutVdu45oFCJg4dyANXzSQnfUCiQ+pROk0KZjYI+DxBl1FLeXe/PrqwRES6VlOT\n8+jcdfz42WWMHZzFQ188lkGZ/e/aRp2Jp/voOeAd4H2gKdpwRES6XvG2Km5+ahHvFpdzwoTB/O7i\nI/vlxe7iEU9SSHf3GyOPRESki63aWsm9b67hyXklpKYk8fNPHsanC0ZrDKED8SSFB83sauBZoLZ5\npruXRxaViMgBWF26i1tnL+H1ldtITUniwiNGcNMZUzg4R5dt60w8SaEO+CXwXcIb7YR/NegsIj3O\nu0VlXPPgPJIMvn7GZC6ZOYbBA/vvtYz2VTxJ4UaCE9i2RR2MiMiB+NvCDXzjiUWMysvgvitmMmZw\nZqJD6nXiSQpLgOqoAxER2V+1DY38+h8rmPVaEceOy+NPnztaRxbtp3iSQiOw0MxeZs8xBR2SKiIJ\n98Hmndzw2EI+2FzJpceO4fvnTSMtJTnRYfVa8SSFp8OHiEiPsWZbFQ++s5YH315LTsYA7rmigFOn\nHpzosHq9eM5ovr87AhER6UxVbQMvL9/KE4UlvLqilJQk47wPjeCWcw7VYHIX6eh2nI+7+6fN7H3+\nfdRRC3c/PNLIRESALTtreHVFKS8u3cJrK0qpbWji4Jw0vnb6ZC6ZOZqhOsy0S3XUUvhq+Pfc7ghE\nRPqnxiansqaeXbUNVNU2snHHbopKqygq3cW8tRV8sDm4fcvw3HQumTmGs2YMo2BsHslJOgEtCh3d\njnNT+PRad/9W7Gtm9nPgW3u/S0Rk31z0x7dYsG77XvNz0lOYMTKXb581lRMn5zN1WLbORO4G8Qw0\nf4y9E8BZbcwTEdknWytrWLBuO+cePpwTJ+WTlZbC0Jw0xg/JIi8rVUkgAToaU/gv4Fpggpktinkp\nG3gz6sBEpO+bW1wBwBc/Op4jRg9KcDQCHbcUHgH+DvwMuDlmfqWueyQiXWFOcRmZqclMH5GT6FAk\n1O7tON19h7uvAW4BNrv7WmAccFl4jwURkQPybnE5Rx9yEAOS47kzsHSHeLbEU0CjmU0E7iZIDI9E\nGpWI9Hk7qutZvqWSY8bmJToUiRFPUmhy9wbgE8Bv3f1rwPB4Fm5mZ5rZcjNbZWY3d1DuIjNzMyuI\nL2wR6e0K15bjDjPHKSn0JPEkhXozu4TglpzPhvM6vampmSUDdxIcqTQNuMTMprVRLhu4Hng33qBF\npPebU1xOanKSBph7mHiSwpXA8cBP3L3YzMYBD8XxvpnAKncvcvc64DHggjbK/Qj4BVATZ8wi0ge8\nW1zOh0bnkj5AF6/rSdpNCmaWA+DuS939end/NJwuJr4xhZHA+pjpknBe7DqOBEa7+7OISL9RVdvA\n4g07NJ7QA3XUUnil+YmZvdTqtXiumtrWWSct11AysyTgN8BNnS7I7BozKzSzwtLS0jhWLSI92YJ1\n22loco0n9EAdJYXYH/XWWy6e0wxLgNEx06OAjTHT2cAM4BUzWwMcB8xua7DZ3We5e4G7F+Tn58ex\nahHpyeYUl5FkcPQhByU6FGmlo6Tg7Txva7otc4FJZjbOzFKBi4HZLQsIzoMY4u5j3X0s8A5wvrsX\nxhe6iPRWc9aUM31ELtnpnR6zIt2sozOah5rZjQStgubnhNOd7q67e4OZXQe8ACQD97j7EjO7DSh0\n99kdL0FE+qLFG3ZQuKaCqz4yLtGhSBs6Sgp/Jujiaf0c4K54Fu7uzwHPtZr3/XbKnhzPMkWk96qq\nbeD6RxcweGAqXzppQqLDkTZ0dOnsH3ZnICLS9/3wmSUUl1Xx8BePJS8rNdHhSBt0wRER6RbPvLeR\nxwtLuPbkCZwwYUiiw5F2xHM/BRGR/bartoG7Xi/iT68WccToQdxw+uREhyQdUFIQkUi4Ow+9u47f\nvriCsqo6zj5sGD84b7quiNrDdZoUzOxg4KfACHc/K7x+0fHufnfk0YlIr/XQu+v43tOLOW58Hnef\ndaiucdRLxJOy7yM4rHREOL0CuCGqgESk93tv/XZ+9MxSTp06lEe+eJwSQi8ST1IY4u6PA00QnH8A\nNEYalYj0WhVVdVz78Hzys9P4709/iKQk3We5N4lnTKHKzAYTnsVsZscBOyKNSkR6pd11jdz4+EJK\nK2t58r+OZ1CmDjvtbeJJCjcSXJ5igpm9SXA280WRRiUivUp1XQMPv7OOP71WxLZdtfz4whkcPkpd\nRr1Rp0nB3eeb2UnAFIJLXCx39/rIIxORHsvdea9kB4VrylmwfjtvrdpGRXU9H5k4hK+efpQuid2L\nxXP00edbzTrKzHD3ByKKSUR6uF//YwV3vLwKgJGDMvjopHwuP+EQjj5EyaC3i6f76JiY5+nAacB8\nQElBpB96dM467nh5FZ86ehTf+PgUhuakJzok6ULxdB99JXbazHKBByOLSER6rJeXb+WWpxdz0uR8\nfvaJw0jRiWh9zv6c0VwNTOrqQESkZ3v5g61c98h8phyczZ2XHqWE0EfFM6bwDP++qU4SMA14PMqg\nRKTnWFdWzW3PLuWfy7YwIT+Le688hoFpukJOXxXPlv1VzPMGYK27l0QUj4j0ABu27+aNlaW8tnIb\nLy7dQkqScfNZU7nqw+NITVELoS+LZ0zh1e4IREQSq7Kmnmfe28Rf5q7jvZLg/NSh2Wl88qiRXH/a\nJIbnZiQ4QukO7SYFM6uk7XsxG+DunhNZVCISGXdne3U9m3fWUFRaxbJNO1m6aSdvry5jd30jUw7O\n5jtnT+XkKUOZNHQgZrpMRX/S0Z3Xstt7TUR6n7qGJm564j3+sWQztQ1NLfOTk4yJ+QO58MiRfLpg\nFEeMHqRE0I/FPVpkZkMJzlMAwN3XRRKRiHS5hsYmvvrYAv6+eDOXzBzDxKEDGZ6bzpi8TCYOHUj6\ngOREhyg9RDxHH50P/Jrg0tlbgUOAZcD0aEMTka7Q1OR888lF/H3xZr537jS+8JFxiQ5JerB4DiP4\nEXAcsMLdxxGc0fxmpFGJSJfYsH03N/xlIX9dsIGbPjZZCUE6FU/3Ub27l5lZkpklufvLZvbzyCMT\nkf1WvK2KP7yyir/O3wDADadP4rpTJyY4KukN4kkK281sIPAa8LCZbSU4
X0FEehB3p3BtBX9+rYgX\nl20hNTmJS48dwzUnTWDkIB1OKvGJJylcANQAXwMuBXKB26IMSkTit3lHDf/3/ib+tnADi0p2kJsx\ngGtPnsDlJ4xlaLYuVif7pqPzFO4AHnH3t2Jm3x99SCLSHndn884a3lu/nQXrt1O4poL56ypwh2nD\nc7jtgulcdPQoMlN1GQrZPx19c1YCvzaz4cBfgEfdfWH3hCUiABu37+aV5aW8uXobRaVVrC2rorou\nuEX6gGRj2ohcbjhtMud+aDgT8gcmOFrpCzo6ee13wO/M7BDgYuBeM0sHHgUec/cV3RSjSL/Q3Aoo\nXFNB4Zpy3i0u54PNlQCMyE1n6vAcjh8/mLFDMjlsZC7TRuSQlqLzC6RrmXtbV7Jop7DZkcA9wOHu\nnpBvY0FBgRcWFiZi1SL7zd2pqmuktLKW0spatlbWsHlH8Ni4Yzdry6pZW1bNrtrgGI7M1GSOGnMQ\nJ03O55Sp+UzI1+Um5MCY2Tx3L+isXDwnrw0AziRoLZwGvAr88IAjFOmDZr22mr/O30BDk9PY5NTU\nN1JV20BVXSONTXvvgKUPSGJEbgZjBmdyzNg8xg3J4qgxB3Ho8Gzdr0ASoqOB5o8BlwDnAHOAx4Br\n3L2qm2IT6VXcnbteLyZ9QDIzRuaQkpREakoSA9NSyEpLJid9APnZaS2P4TkZ5GSkqAUgPUpHLYXv\nAI8AX3f38m6KR6TXWltWzdb8d16TAAAQVklEQVTKWn584QwuO+6QRIcjsl86Gmg+pTsDEent5qwJ\n9p2OHZeX4EhE9p86LUW6yJzicg7KHMDEoTo0VHovJQWRLjKnuJxjxuZpjEB6NSUFkS6wacdu1pVX\nM1NdR9LLKSmIdIE5xc3jCYMTHInIgVFSEOkCc9eUk5WazKHDdRdb6d2UFES6wJzico4em6cTzqTX\ni/QbbGZnmtlyM1tlZje38fqNZrbUzBaZ2UvhdZZEepXyqjpWbNmlQ1GlT4gsKZhZMnAncBYwDbjE\nzKa1KrYAKHD3w4EngV9EFY9IVOaG5ydokFn6gihbCjOBVe5e5O51BJfJuCC2gLu/7O7V4eQ7wKgI\n4xGJxNziclJTkjh8VG6iQxE5YFEmhZHA+pjpknBee74A/L2tF8zsGjMrNLPC0tLSLgxR5MDU1Dfy\n4rItHDF6kC5jLX1ClEmhrTN42rxOt5ldBhQAv2zrdXef5e4F7l6Qn5/fhSGKHJif/N8y1pZV8+VT\nJiY6FJEuEeU9+0qA0THTo4CNrQuZ2enAd4GT3L02wnhEutQLSzbz4Dtr+eJHxnHSZO2sSN8QZUth\nLjDJzMaZWSrB/RhmxxYIb9rzJ+B8d98aYSwiXWrTjt1866lFTB+RwzfOnJLocES6TGRJwd0bgOuA\nF4BlwOPuvsTMbjOz88NivwQGAk+Y2UIzm93O4kR6jPdLdnD1A4XUNTTxv5ccqbEE6VOi7D7C3Z8D\nnms17/sxz0+Pcv0iXWltWRW/+scKnnlvIwdlDuC/P30E4/N1RVTpWyJNCiJ9RVHpLs6/400am5yv\nnDqRq08cT076gESHJdLllBREOrG7rpH/emg+qSlJ/O3LH2Z0XmaiQxKJjJKCSAfcne8+/T4rtlby\nwFUzlRCkz9PVu0Q68Je56/nr/A3ccNpkPjpJh51K36eWgkgbVm2t5H//tYpn3tvIiZPz+cqpOjlN\n+gclBRGgsclZsaWS+esqeGPlNp5fspmMAclcfeJ4rjtlIklJusWm9A9KCtLvLVhXwRfuL6S8qg6A\nwVmpfOmkCVz90fHkZaUmODqR7qWkIP3a2rIqvnB/IQPTUvj+udM4cswgxuRlYqaWgfRPSgrSb1VU\n1XHFvXNpcue+K4/RiWgiKClIP7Vjdz1XP1DIhu27eeSLxyohiISUFKRfaWpynppfws+f/4Dyqjr+\n55IjKRirO6aJNFNSkH6hpKKaN1Zu4y+F61mwbjtHjRnEfVfOZMZI3S1NJJaSgvRZ68ureWJeCc++\nt5GibVUAjMhN51ef+hCfOHKkDjMVaYOSgvQZ5VV1LNu0k6Ubd/LaylLeWLUNgA9PGMKlxx3CRycN\nYdLQgTqySKQDSgrSa/39/U08Onc9m3fsZtOOGiprGlpeG3VQBtefOolPHzOakYMyEhilSO+ipCC9\n0rOLNnL9owsYk5fJlGHZHD9+MKMOyuTQ4TkcOjybwQPTEh2iSK+kpCC9zj+XbuGGxxZScEge9181\nk4xU3flMpKsoKUiv4e68sGQz1z+2kGkjcrj7igIlBJEupqQgPV5Tk/Pisi3c+fIqFpXsYOqwbB64\naibZuvOZSJdTUpAeq6a+kacXbOCuN4pZtXUXY/Iy+dknDuMTR40kLUUtBJEoKClIj7NqayWzF27k\nkTnr2LarjmnDc/jdxUdwzmHDSUnWfaFEoqSkIAm3o7qehSXbmb+2gheWbOaDzZWYwUmT87n6o+M5\nYcJgnVsg0k2UFKRbrS+v5vWV21ixpZI1ZVUUb6tibVk1AGZw1JiDuPW8aZx92HCG5qQnOFqR/kdJ\nQSJVtquWwrUVzCku59UVpazauguAzNRkDhmcxfQROXy6YDRHjB7E4aNyNXgskmBKCnJAGpuc8qo6\nNu+oYfPOGjZu383asmrWllWxunQXa8JWQGpKEseOy+OzM8dw8pR8xg3JUpeQSA+kpCDtWlSynScK\nS2hoaqK+0alraKKqtoGqugZ27m6gdFct5VV1NDb5Hu9LH5DE2MFZTB2Ww8Uzx3DM2IOYMTJXRwyJ\n9AJKCtKmrTtruOLeueyuayQ7PYWUJGNAShJZqSkMTEtheG46h4/KJT87jfzsNA7OSWd4bjrDctPJ\nH5imVoBIL6WkIHtpanJufPw9qusaePYrH2Hi0OxEhyQi3UQHfcteZr1exBurtvGD86YrIYj0M0oK\nsofCNeX86oXlnDVjGBcfMzrR4YhIN1P3kQAwb20Ff3p1NS8u28KI3Axu/8ThGhcQ6YeUFPohd6dw\nbQXvFpWxbHMlyzbupGhbFbkZA/jKKRO5/ISx5GbqfAGR/khJoR+pqW9k9nsbuffNNSzbtBOA0XkZ\nTB2Ww+ePP4RPFYwmK01fCZH+TL8AfUxTkzNvXQUlFdVUVNWzvbqO4rJqVm6ppGhbFXUNTUw5OJvb\nP3EY5xw+XGcQi8gelBT6AHdnbVk1f51fwlPzN7Bh++6W18xg5KAMJg0dyImT8zl5cj7H6wJzItIO\nJYVeorHJ2bKzhpKK3ZRW1rJtVy2bd9awdONOFm/YQVlVHWbw0Un53HzWVKaPyGFQZiq5GQNITlIC\nEJH4KCn0ADX1jbyyvJQF6yrYXd/I7rpGdtc3srOmgcqaeiqq6tiwfTf1jXteTiI5yZg0dCCnTB3K\nYSNz+di0gxkxKCNBtRCRvkBJIQGqahtYXbqL1aW7eHt1GX9fvJnKmgZSk5PISksmfUAyGQOSyc4Y\nQE56CiMHZXDmjOGMyctk1EE
Z5GenMWRgGnlZqWoFiEiXijQpmNmZwO+AZOAud7+91etpwAPA0UAZ\n8Bl3XxNlTFFzd3bVNrC9up6tlTVs3F7Dph27WVdeTfG2KopKq9i0o6al/MC0FM6YfjAXHDGSD08Y\nrDuLiUhCRZYUzCwZuBP4GFACzDWz2e6+NKbYF4AKd59oZhcDPwc+E1VM8XB3ahuaqKlvpKa+iaq6\nBqprG9lV28CO3XWUV9VTUV1HaWUtWytrKK2sZefu4MqhVbUNVNY00NDqqqEAOekpjM8fyPHjBzM+\nP4uJQwcycehADhmcxQAlAhHpIaJsKcwEVrl7EYCZPQZcAMQmhQuAW8PnTwJ3mJm5+96/qgfo8bnr\nmfV6EU3u4NDkTkOT09DoNDQ1UdsQPOoamuJaXnZaCvk5aQzNTmPskEyyUlPISE0mN2MAgzIHMCgz\nlfyBaQwflM6IQRnk6NBPEekFokwKI4H1MdMlwLHtlXH3BjPbAQwGtsUWMrNrgGsAxowZs1/BDMoc\nwJSDs8EgyQwDUpKNlCQjJTmJtJQk0lKSSUtJIn1AMukDgr+ZqclkpaaQmZbMoIxU8rJSGZQ5gPQB\nujeAiPQ9USaFtkZAW7cA4imDu88CZgEUFBTsVyvijOnDOGP6sP15q4hIvxFlZ3YJEHuZzVHAxvbK\nmFkKkAuURxiTiIh0IMqkMBeYZGbjzCwVuBiY3arMbODy8PlFwL+iGE8QEZH4RNZ9FI4RXAe8QHBI\n6j3uvsTMbgMK3X02cDfwoJmtImghXBxVPCIi0rlIz1Nw9+eA51rN+37M8xrgU1HGICIi8dMB8iIi\n0kJJQUREWigpiIhICyUFERFpYb3tCFAzKwXW7ufbh9DqbOl+oj/Wuz/WGfpnvftjnWHf632Iu+d3\nVqjXJYUDYWaF7l6Q6Di6W3+sd3+sM/TPevfHOkN09Vb3kYiItFBSEBGRFv0tKcxKdAAJ0h/r3R/r\nDP2z3v2xzhBRvfvVmIKIiHSsv7UURESkA0oKIiLSot8kBTM708yWm9kqM7s50fFEwcxGm9nLZrbM\nzJaY2VfD+Xlm9qKZrQz/HpToWLuamSWb2QIzezacHmdm74Z1/kt4+fY+xcwGmdmTZvZBuM2P7yfb\n+mvh93uxmT1qZul9bXub2T1mttXMFsfMa3PbWuB/wt+2RWZ21IGsu18kBTNLBu4EzgKmAZeY2bTE\nRhWJBuAmdz8UOA74cljPm4GX3H0S8FI43dd8FVgWM/1z4DdhnSuALyQkqmj9Dnje3acCHyKof5/e\n1mY2ErgeKHD3GQSX5b+Yvre97wPObDWvvW17FjApfFwD/OFAVtwvkgIwE1jl7kXuXgc8BlyQ4Ji6\nnLtvcvf54fNKgh+JkQR1vT8sdj9wYWIijIaZjQLOAe4Kpw04FXgyLNIX65wDnEhwTxLcvc7dt9PH\nt3UoBcgI79aYCWyij21vd3+Nve9C2d62vQB4wAPvAIPMbPj+rru/JIWRwPqY6ZJwXp9lZmOBI4F3\ngYPdfRMEiQMYmrjIIvFb4JtAUzg9GNju7g3hdF/c3uOBUuDesNvsLjPLoo9va3ffAPwKWEeQDHYA\n8+j72xva37Zd+vvWX5KCtTGvzx6La2YDgaeAG9x9Z6LjiZKZnQtsdfd5sbPbKNrXtncKcBTwB3c/\nEqiij3UVtSXsR78AGAeMALIIuk9a62vbuyNd+n3vL0mhBBgdMz0K2JigWCJlZgMIEsLD7v7XcPaW\n5uZk+HdrouKLwIeB881sDUG34KkELYdBYfcC9M3tXQKUuPu74fSTBEmiL29rgNOBYncvdfd64K/A\nCfT97Q3tb9su/X3rL0lhLjApPEIhlWBganaCY+pyYV/63cAyd//vmJdmA5eHzy8H/tbdsUXF3b/t\n7qPcfSzBdv2Xu18KvAxcFBbrU3UGcPfNwHozmxLOOg1YSh/e1qF1wHFmlhl+35vr3ae3d6i9bTsb\n+Hx4FNJxwI7mbqb90W/OaDazswn2IJOBe9z9JwkOqcuZ2UeA14H3+Xf/+ncIxhUeB8YQ/FN9yt1b\nD2L1emZ2MvB1dz/XzMYTtBzygAXAZe5em8j4upqZHUEwuJ4KFAFXEuzo9eltbWY/BD5DcLTdAuCL\nBH3ofWZ7m9mjwMkEl8feAvwAeJo2tm2YHO8gOFqpGrjS3Qv3e939JSmIiEjn+kv3kYiIxEFJQURE\nWigpiIhICyUFERFpoaQgIiItlBREQmY2zMweM7PVZrbUzJ4zs8n7sZy7mi+4aGbfifM9a8xsyL6u\nS6Sr6ZBUEVpO/HsLuN/d/xjOOwLIdvfXD2C5u9x9YBzl1hBc+XPb/q5LpCuopSASOAWob04IAO6+\nEFhgZi+Z2Xwze9/MLoDggoPhfQzuD69h/6SZZYavvWJmBWZ2O8HVPBea2cPha0+b2bzwfgDXJKCe\nIh1SUhAJzCC42mZrNcB/uPtRBInj12GrAmAKMMvdDwd2AtfGvtHdbwZ2u/sR4aU3AK5y96OBAuB6\nMxscQV1E9puSgkjHDPipmS0C/klwOYWDw9fWu/ub4fOHgI/Esbzrzew94B2Ci5hN6uJ4RQ5ISudF\nRPqFJfz7gmqxLgXygaPdvT7s+08PX2s9INfhAF14babTgePdvdrMXolZlkiPoJaCSOBfQJqZXd08\nw8yOAQ4huF9DvZmdEk43G2Nmx4fPLwHeaGO59eHlzAFygYowIUwluGWqSI+ipCACeHAY3n8AHwsP\nSV0C3Ao8BxSYWSFBq+GDmLctAy4Pu5byaPveuLOAReFA8/NASlj+RwRdSCI9ig5JFdkP4e1Onw1v\nHi/SZ6ilICIiLdRSEBGRFmopiIhICyUFERFpoaQgIiItlBRERKSFkoKIiLT4/4EmbUnRp+/0AAAA\nAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting Final Policy (action stake) vs State (Capital)\n", + "\n", + "# x axis values\n", + "x = range(100)\n", + "# corresponding y axis values\n", + "y = v[:100]\n", + " \n", + "# plotting the points \n", + "plt.plot(x, y)\n", + " \n", + "# naming the x axis\n", + "plt.xlabel('Capital')\n", + "# naming the y axis\n", + "plt.ylabel('Value Estimates')\n", + " \n", + "# giving a title to the graph\n", + "plt.title('Final Policy (action stake) vs State (Capital)')\n", + " \n", + "# 
function to show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAGoxJREFUeJzt3Xu8HGV9x/HP13AXQghJMJDEgA0X\naw2XIwWxlIu0SCmJBSkUMW3B9KJyEbWovFpQq9AqUK9tCmKK3CQg1xZJYyKlhUAihIsgCYgQE5MA\nCQEBTeDXP+Y5sBzO7pk9Z2f27M73/Xrta3dm5/KbmWR/53memedRRGBmZtX1pnYHYGZm7eVEYGZW\ncU4EZmYV50RgZlZxTgRmZhXnRGBmVnFOBDZsSTpB0q1DWH+BpJNbGVMT+x5S7H229bik97ZiWzXb\nPEjS8prpByUd1Mp9WOdwIrAhk/RnkhZJel7SSkn/Jek9Q91uRFwWEX9Qs5+Q9FtD3W6rSJqcYnq+\n5rUE3hh7gTF8R9Jv0r6fkTRX0u7NbicifjsiFhQQonUAJwIbEkkfBy4EvgjsAEwCvglMa2dcJRsV\nEVun19Q27P+fImJrYAKwGvhOG2KwDuZEYIMmaVvgc8BHIuLaiPhVRGyIiBsj4pNpmX0l3SFpXSot\nfF3SZjXbCEmnSHpM0lOS/lnSm9J3fy7p9vT5trTKkvTX759K2k7STZLWSFqbPk/IEfeOkl6UNLpm\n3l5p/5tK+i1JP5L0bJp31SDOzaux1xznX0tammL9hiSl794m6YeSnk77u0zSqGb3GREvAJcD70jb\n3VzShZJWpNeFkjavE++r1U+SRkj6jKRHJT0nabGkiSnmr/RZ70ZJpzUbqw0vTgQ2FPsDWwDfb7DM\ny8DpwJi0/KHA3/ZZ5v1AD7A3WUniL/tuJCIOTB+npr+8ryL793sJ8FayksiLwNcHCjoiVgB3AEfX\nzP4zYE5EbAA+D9wKbEf2V/bXBtpmTkcC7wKmAscCf5jmC/gSsCOwBzAROLvZjUvaGjgBuCfN+iyw\nH7Bn2ue+wFk5NvVx4HjgCGAk2fV4AZgNHF+TqMeQXc8rmo3VhhcnAhuK7YGnImJjvQUiYnFE3BkR\nGyPiceDfgN/vs9h5EfFMRDxBVs10fJ6dR8TTEXFNRLwQEc8B/9jPtuu5vHc/6S/z49I8gA1kyWXH\niHgpIm7vfxOveiqVeNZJ+kSD5c6NiHXpOOeT/UATEcsiYm5E/Doi1gDnN3EcAJ+QtA5YBmwN/Hma\nfwLwuYhYnbZ7DnBiju2dDJwVET+NzJJ0ru8CniX78YfsnC2IiFVNxGrDkBOBDcXTwBhJm9RbQNKu\nqcrml5LWk7UljOmz2JM1n39O9pfxgCRtJenfJP08bfs2YJSkETlWnwPsL2lH4EAggP9J332K7K/0\nu9LdNG8oofQxJiJGpdeXGyz3y5rPL5D9aCNpnKQrJf0iHcd3eeM5auTLad9viYijIuLRNH9HsvPZ\nK++5nQg8Wue72cAH0+cPApc2EacNU04ENhR3AC8B0xss8y3gYWBKRIwEPkP2I1trYs3nScCKnPs/\nA9gN+N207d7qo77bf4OIWEdW/XMsWbXQFZG64o2IX0bEhyNiR+CvgG8WfLfSl8gS0TvTcXyQHMeQ\nwwqykk2vvOf2SeBtdb77LjBN0lSyaqzrhhShDQtOBDZoEfEs8PfANyRNT3+hbyrpfZL+KS22DbAe\neD7d1vg3/Wzqk6nhdyJwKlCvcXYVsEvN9DZk7QLrUsPvPzR5CJcDHyJrK+itFkLSB2oandeS/Ui/\n3OS2m7EN8DzZcewEfLJF270COEvS2FSf//dkP+QDuQj4vKQpyrxT0vYAEbEcuJusJHBNRLzYolit\njZwIbEgi4nyyxsWzgDVkf01+lNf+UvwE2V/czwH/Tv8/8tcDi4F7gZuBi+vs7mxgdqqLP5asPWFL\n4CngTuCWJsO/AZgCrIqIJTXz3wUslPR8WubUiPhZk9tuxjlkDeXPkh3/tS3a7heARcB9wP3Aj9O8\ngZwPfI+sxLSe7HpsWfP9bOB3cLVQ15AHprF2khRk1UbL2h2L5SPpQLKSxeSIeKXd8djQuURgZrlJ\n2pSs+u4iJ4Hu4URgZrlI2gNYB4wnq5azLuGqITOzinOJwMys4uo+CDScjBkzJiZPntzuMMzMOsri\nxYufioixAy3XEYlg8uTJLFq0qN1hmJl1FEk/H3gpVw2ZmVWeE4GZWcU5EZiZVZwTgZlZxTkRmJlV\nnBOBmVnFFXr7qKTHyXqdfBnYGBE9qbvgq4DJwOPAsRGxtsg4zMysvjJKBAdHxJ4R0ZOmzwTmRcQU\nYF6aNjOzNmlH1dA0sv7MSe+NRrcyM7OCFZ0IArhV0mJJM9O8HSJiJUB6H9ffipJmSlokadGaNWsK\nDtNs8C6Y+wgXzH2k3WGYDVrRXUwcEBErJI0D5kp6OO+KETELmAXQ09PjLlLNzApSaIkgIlak99XA\n94F9gVWSxgOk99VFxmBmZo0VlggkvVnSNr2fgT8AHiAbA3ZGWmwG2Xi1ZmbWJkVWDe0AfF9S734u\nj4hbJN0NfE/SScATwAcKjMGs5WrbA04/bNc2RmLWGoUlgoh4DJjaz/yngUOL2q+ZmTXHTxabmVWc\nE4GZWcV1xAhlZu3m5wSsm7lEYGZWcU4EZmYV50RgZlZxbiMwq8PtAlYVLhGYmVWcE4GZWcU5EZiZ\nVZzbCMxquF3AqsglAjOzinMiMDOrOCcCM7OKcyIwM6s4JwIzs4pzIjAzqzgnAjOzinMiMDOrOD9Q\nZpXkAejNXuMSgZlZxTkRmJlVnBOBmVnFORGYmVWcE4GZWcU5EZiZVZwTgZlZxfk5Autqfl7AbGAu\nEZiZVZwTgZlZxTkRmJlVnBOBmVnFFZ4IJI2QdI+km9L0zpIWSloq6SpJmxUdg5mZ1VdGieBU4KGa\n6fOACyJiCrAWOKmEGMzMrI5CE4GkCcAfARelaQGHAHPSIrOB6UXGYGZmjRVdIrgQ+BTwSpreHlgX\nERvT9HJgp/5WlDRT0iJJi9asWVNwmGZm1VVYIpB0JLA6IhbXzu5n0ehv/YiYFRE9EdEzduzYQmI0\nM7Ninyw+ADhK0hHAFsBIshLCKEmbpFLBBGBFgTGYmdkACisRRMSnI2JCREwGjgN+GBEnAPOBY9Ji\nM4Dri4rBzMwG1o7nCP4O+LikZWRtBhe3IQYzM0tK6XQuIhYAC9Lnx4B9y9ivmZkNzE8Wm5lVnBOB\nmVnFORFYR7lg7iOvG2PAzIbOicDMrOK
cCMzMKs6JwMys4hrePippC+BI4PeAHYEXgQeAmyPiweLD\nMzOzotVNBJLOBv6Y7P7/hcBqsq4idgXOTUnijIi4r/gwzcysKI1KBHdHxNl1vjtf0jhgUutDMjOz\nMtVNBBFxc+20pDdHxK9qvl9NVkowM7MONmBjsaR3S/oJaZQxSVMlfbPwyMzMrBR57hq6APhD4GmA\niFgCHFhkUGZmVp5ct49GxJN9Zr1cQCxmZtYGeXoffVLSu4GQtBlwCq8fjN7MzDpYnhLBXwMfIRtb\neDmwZ5o2M7MukKdE8EoaWexVknYmtRmYmVlny1MiuFHSyN4JSXsANxYXkpmZlSlPIvgiWTLYWtI+\nwBzgg8WGZWZmZRmwaigibpa0KXArsA0wPSKWFh6ZmZmVolFfQ18DombWSOAx4GOSiIhTig7OzMyK\n16hEsKjP9OIiAzEzs/Zo1NfQ7DIDMTOz9hiwjUDSFOBLwNvJuqEGICJ2KTAuMzMrSZ67hi4BvgVs\nBA4G/gO4tMigzMysPHkSwZYRMQ9QRPw8jVFwSLFhmZlZWfI8WfySpDcBSyV9FPgFMK7YsMzMrCx5\nSgSnAVuRdTa3D9nDZB8qMigzMytPnkQwOSKej4jlEfEXEXE0HqLSzKxr5EkEn845z8zMOlCjJ4vf\nBxwB7CTpqzVfjSS7g8jMzLpAo8biFWRPFx/F658qfg44vcigzMysPI2eLF4CLJF0eURsAJC0HTAx\nItaWFaCZmRUrTxvBXEkjJY0GlgCXSDp/oJUkbSHpLklLJD0o6Zw0f2dJCyUtlXRVGv7SzMzaJE8i\n2DYi1gN/AlwSEfsA782x3q+BQyJiKtnwlodL2g84D7ggIqYAa4GTBhe6mZm1Qp5EsImk8cCxwE15\nNxyZ59PkpukVZE8lz0nzZwPT84drZmatlicRfA74AbAsIu6WtAuQa2AaSSMk3QusBuYCjwLrIqL3\nrqPlwE7Nh21mZq2SZ4Syq4Gra6YfA47Os/GIeBnYU9Io4PvAHv0t1t+6kmYCMwEmTfLza2ZmRalb\nIpB0Vmogrvf9IZKOzLOTiFgHLAD2A0ZJ6k1AE8huU+1vnVkR0RMRPWPHjs2zGzMzG4RGJYL7yQat\nfwn4MbCGbDyCKWSNv/9NNrB9vySNBTZExDpJW5I1MJ8HzAeOAa4EZgDXt+A4zMxskBo9R3A9cH0a\nmOYAYDywHvguMDMiXhxg2+OB2ZJGkJU8vhcRN0n6CXClpC8A9wAXt+A4zMxskPK0ESwlZ+Nwn/Xu\nA/bqZ/5jwL7Nbs/MzIqR564hMzPrYk4EZmYVN2AiaHTnkJmZdb48JYKFkq6WdIQkFR6RmZmVKk8i\n2BWYBZwILJP0RUm7FhuWmZmVZcBEkPoMmhsRxwMnk937f5ekH0nav/AIzcysUAPePippe7IB608E\nVgEfA24ge6jsamDnIgM0M7NiDZgIgDuAS4HpEbG8Zv4iSf9aTFhmZlaWPIlgt4jot2O4iDivxfGY\nmVnJ8jQW35p6DwWy4Sol/aDAmMzMrER5EsHY1HsoAGm84nHFhWRmZmXKkwhelvTqgACS3kqdMQTM\nzKzz5Gkj+Cxwu6QfpekDSQPGmJlZ58vT++gtkvYmG1RGwOkR8VThkZmZWSkajVC2e3rfG5hENpLY\nL4BJaZ6ZmXWBRiWCM4APA1/p57sADikkImurC+Y+8urn0w9zTyLWer3/xvzva/hoNELZh9P7weWF\nY2ZmZaubCCT9SaMVI+La1odjZmZla1Q19McNvgvAicDMrAs0qhr6izIDsfapbRcwK4LbBYa3PCOU\nbSvpfEmL0usrkrYtIzgzMytenieLvw08BxybXuuBS4oMyszMypPnyeK3RcTRNdPnSLq3qIDMzKxc\neUoEL0p6T++EpAOAF4sLyczMypSnRPA3wOzULiDgGbLhKq2DuYHYiuYG4s6Rp6+he4Gpkkam6fWF\nR2VmZqXJc9fQ9pK+CiwA5kv6lzSOsZmZdYE8bQRXAmuAo4Fj0uerigzKzMzKk6eNYHREfL5m+guS\nphcVkJl1LrcLdKY8JYL5ko6T9Kb0Oha4uejAzMysHHkSwV8BlwO/Tq8rgY9Lek6SG47NzDpcnruG\ntikjEDMza488bQTWwTzQjBXN7QKdL0/V0KBImihpvqSHJD0o6dQ0f7SkuZKWpvftiorBzMwGVlgi\nADYCZ0TEHmQD339E0tuBM4F5ETEFmJemzcysTRqNUDa60YoR8cwA368EVqbPz0l6CNgJmAYclBab\nTfag2t/ljtjMzFqqURvBYrKRyNTPdwHskncnkiYDewELgR1SkiAiVkoaV2edmcBMgEmTJuXdleE6\nWzNrTqMRynZuxQ4kbQ1cA5wWEeul/vJKv/ufBcwC6OnpiVbEYmZmb5TrrqHUoDsF2KJ3XkTclmO9\nTcmSwGU1g92vkjQ+lQbGA6ubD9vMzFolT6dzJwO3AT8AzknvZ+dYT8DFwEMRcX7NVzfwWjfWM4Dr\nmwvZzMxaKU+J4FTgXcCdEXGwpN3JEsJADgBOBO6vGdHsM8C5wPcknQQ8AXyg+bDNrEx+HqW75UkE\nL0XES5KQtHlEPCxpt4FWiojb6b+hGeDQpqI0M7PC5EkEyyWNAq4D5kpaC6woNiwzMytLnr6G3p8+\nni1pPrAtcEuhUZmZWWny3jU0AtgB+Fma9Ray+n1rIz8vYGatMGAikPQx4B+AVcAraXYA7ywwLjMz\nK0neu4Z2i4iniw7GzMzKl6fTuSeBZ4sOxMzM2iNPieAxYIGkm8lGKAOgz0NiVhK3C1iR/LxANeVJ\nBE+k12bpZWZmXSTP7aN5niI2M7MO1Wg8ggsj4jRJN5LdJfQ6EXFUoZGZmVkpGpUILk3vXy4jEDMz\na49GiWANQET8qKRYrA43EJtZkRrdPnpd7wdJ15QQi5mZtUGjRFDbc2juYSnNzKyzNEoEUeezmZl1\nkUZtBFMlrScrGWyZPpOmIyJGFh5dhbldwMzK0mjw+hFlBmJmZu2Rp68hMzPrYk4EZmYV50RgZlZx\nTgRmZhXnRGBmVnFOBGZmFZdr8Hoz60weaMbycInAzKzinAjMzCrOicDMrOLcRtAGtf0IuU+hgfkc\nNae2XcAGVu98Venfm0sEZmYV50RgZlZxTgRmZhXnNoKS5Knndl34a3y+mpOnXcDPFLzG5+v1CisR\nSPq2pNWSHqiZN1rSXElL0/t2Re3fzMzyKbJq6DvA4X3mnQnMi4gpwLw0bWZmbVRYIoiI24Bn+sye\nBsxOn2cD04vav5mZ5VN2G8EOEbESICJWShpXb0FJM4GZAJMmTSopvNZyHXZzhnK+qniuW3G+Brt+\nJxrq8xXd/G9s2N41FBGzIqInInrGjh3b7nDMzLpW2YlglaTxAOl9dcn7NzOzPspOBDcAM9LnGcD1\nJe/fzMz6KPL20SuAO4DdJC2XdBJwLnCYpKXAYWnazMzaqLDG4og4vs5Xhxa1z3arYgPcUBXRANfN
\nna75fDWnqGPrtobjYdtYbGZm5XAiMDOrOCcCM7OKc6dzQ9TN9atF6bb61aL5fDWn7P+T3XB9XCIw\nM6s4JwIzs4pzIjAzqzi3EQyC2wWaNxzqUTvpuvl8NWe4xDocrttguERgZlZxTgRmZhXnRGBmVnFu\nI8hpuNRB1jMc6yaHY0zDmc9Xc/x/snVcIjAzqzgnAjOzinMiMDOrOLcRNDDc6yCHo06qF+3Vzuvc\niecL2he3z1cxXCIwM6s4JwIzs4pzIjAzqzi3EfThdoHmeJzm5g33+uLhptvO13A8HpcIzMwqzonA\nzKzinAjMzCrObQRUp12gVcdZlfPVSsOxXng4q8r5Gi7H6RKBmVnFORGYmVWcE4GZWcU5EZiZVVxl\nG4vd4Nkcn6/mDZeGwE5R9fPVzuN3icDMrOKcCMzMKs6JwMys4irVRuB67ub4fDWv6vXczfL56l/Z\n56UtJQJJh0v6qaRlks5sRwxmZpYpPRFIGgF8A3gf8HbgeElvLzsOMzPLtKNEsC+wLCIei4jfAFcC\n09oQh5mZAYqIcncoHQMcHhEnp+kTgd+NiI/2WW4mMDNN7gb8dAi7HQM8NYT1O5GPuRp8zNUw2GN+\na0SMHWihdjQWq595b8hGETELmNWSHUqLIqKnFdvqFD7mavAxV0PRx9yOqqHlwMSa6QnAijbEYWZm\ntCcR3A1MkbSzpM2A44Ab2hCHmZnRhqqhiNgo6aPAD4ARwLcj4sGCd9uSKqYO42OuBh9zNRR6zKU3\nFpuZ2fDiLibMzCrOicDMrOK6PhFUoTsLSRMlzZf0kKQHJZ2a5o+WNFfS0vS+XbtjbSVJIyTdI+mm\nNL2zpIXpeK9KNyN0FUmjJM2R9HC63vtX4Dqfnv5dPyDpCklbdNu1lvRtSaslPVAzr9/rqsxX02/a\nfZL2Hur+uzoRVKg7i43AGRGxB7Af8JF0nGcC8yJiCjAvTXeTU4GHaqbPAy5Ix7sWOKktURXrX4Bb\nImJ3YCrZ8XftdZa0E3AK0BMR7yC7weQ4uu9afwc4vM+8etf1fcCU9JoJfGuoO+/qREBFurOIiJUR\n8eP0+TmyH4edyI51dlpsNjC9PRG2nqQJwB8BF6VpAYcAc9IiXXW8AJJGAgcCFwNExG8iYh1dfJ2T\nTYAtJW0CbAWspMuudUTcBjzTZ3a96zoN+I/I3AmMkjR+KPvv9kSwE/BkzfTyNK9rSZoM7AUsBHaI\niJWQJQtgXPsia7kLgU8Br6Tp7YF1EbExTXfjtd4FWANckqrELpL0Zrr4OkfEL4AvA0+QJYBngcV0\n/7WG+te15b9r3Z4IcnVn0S0kbQ1cA5wWEevbHU9RJB0JrI6IxbWz+1m02671JsDewLciYi/gV3RR\nNVB/Ur34NGBnYEfgzWRVI31127VupOX/1rs9EVSmOwtJm5Ilgcsi4to0e1VvkTG9r25XfC12AHCU\npMfJqvsOISshjErVB9Cd13o5sDwiFqbpOWSJoVuvM8B7gZ9FxJqI2ABcC7yb7r/WUP+6tvx3rdsT\nQSW6s0j14xcDD0XE+TVf3QDMSJ9nANeXHVsRIuLTETEhIiaTXdMfRsQJwHzgmLRY1xxvr4j4JfCk\npN3SrEOBn9Cl1zl5AthP0lbp33nvMXf1tU7qXdcbgA+lu4f2A57trUIatIjo6hdwBPAI8Cjw2XbH\nU9AxvoesaHgfcG96HUFWbz4PWJreR7c71gKO/SDgpvR5F+AuYBlwNbB5u+Mr4Hj3BBala30dsF23\nX2fgHOBh4AHgUmDzbrvWwBVkbSAbyP7iP6nedSWrGvpG+k27n+yOqiHt311MmJlVXLdXDZmZ2QCc\nCMzMKs6JwMys4pwIzMwqzonAzKzinAis0iS9RdKVkh6V9BNJ/ylp10Fs56LeDg0lfSbnOo9LGtPs\nvsxazbePWmWlB5T+D5gdEf+a5u0JbBMR/zOE7T4fEVvnWO5xsnvAnxrsvsxawSUCq7KDgQ29SQAg\nIu4F7pE0T9KPJd0vaRpkHfqlcQBmp37g50jaKn23QFKPpHPJesq8V9Jl6bvrJC1OferPbMNxmjXk\nRGBV9g6yniz7egl4f0TsTZYsvpJKDwC7AbMi4p3AeuBva1eMiDOBFyNiz8i6vQD4y4jYB+gBTpG0\nfQHHYjZoTgRmbyTgi5LuA/6brIvfHdJ3T0bE/6bP3yXr3mMgp0haAtxJ1lnYlBbHazYkmwy8iFnX\nepDXOi6rdQIwFtgnIjakuvwt0nd9G9UaNrJJOoisB839I+IFSQtqtmU2LLhEYFX2Q2BzSR/unSHp\nXcBbycY72CDp4DTda5Kk/dPn44Hb+9nuhtQtOMC2wNqUBHYnG0rUbFhxIrDKiuyWufcDh6XbRx8E\nzgb+E+iRtIisdPBwzWoPATNStdFo+h8vdhZwX2osvgXYJC3/ebLqIbNhxbePmuWUhgG9KbJB1M26\nhksEZmYV5xKBmVnFuURgZlZxTgRmZhXnRGBmVnFOBGZmFedEYGZWcf8PGuEWwOrW2QgAAAAASUVO\nRK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting Capital vs Final Policy\n", + "\n", + "# x axis values\n", + "x = range(100)\n", + "# corresponding y axis values\n", + "y = policy\n", + " \n", + "# plotting the bars\n", + "plt.bar(x, y, align='center', alpha=0.5)\n", + " \n", + "# naming the x axis\n", + "plt.xlabel('Capital')\n", + "# naming the y axis\n", + "plt.ylabel('Final policy (stake)')\n", + " \n", + "# giving a title to the graph\n", + "plt.title('Capital vs Final Policy')\n", + " \n", + "# function to show the plot\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/DP/Gamblers Problem.ipynb b/DP/Gamblers Problem.ipynb new file mode 100644 index 000000000..0ed86294d --- /dev/null +++ b/DP/Gamblers Problem.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", + "\n", + "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", + "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", + "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", + "or loses by running out of money. \n", + "\n", + "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", + "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", + "\n", + "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", + "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", + "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", + "\n", + "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "if \"../\" not in sys.path:\n", + " sys.path.append(\"../\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "\n", + "### Exercise 4.9 (programming)\n", + "\n", + "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", + " \"\"\"\n", + " Args:\n", + " p_h: Probability of the coin coming up heads\n", + " \"\"\"\n", + " \n", + " def one_step_lookahead(s, V, rewards):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " s: The gambler’s capital. Integer.\n", + " V: The vector that contains values at each state. \n", + " rewards: The reward vector.\n", + " \n", + " Returns:\n", + " A vector containing the expected value of each action. 
\n", + " Its length equals to the number of actions.\n", + " \"\"\"\n", + " \n", + " # Implement!\n", + " \n", + " return A\n", + " \n", + " # Implement!\n", + " \n", + " return policy, V" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "policy, v = value_iteration_for_gamblers(0.25)\n", + "\n", + "print(\"Optimized Policy:\")\n", + "print(policy)\n", + "print(\"\")\n", + "\n", + "print(\"Optimized Value Function:\")\n", + "print(v)\n", + "print(\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Plotting Final Policy (action stake) vs State (Capital)\n", + "\n", + "# Implement!" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Plotting Capital vs Final Policy\n", + "\n", + "# Implement!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/DP/Policy Evaluation Solution.ipynb b/DP/Policy Evaluation Solution.ipynb index a8b949367..0b06f87e7 100644 --- a/DP/Policy Evaluation Solution.ipynb +++ b/DP/Policy Evaluation Solution.ipynb @@ -2,12 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ + "from IPython.core.debugger import set_trace\n", "import numpy as np\n", "import pprint\n", "import sys\n", @@ -18,10 +17,8 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": true - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +27,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", @@ -43,9 +38,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -61,7 +58,7 @@ " for a, action_prob in enumerate(policy[s]):\n", " # For each action, look at the possible next states...\n", " for prob, next_state, reward, done in env.P[s][a]:\n", - " # Calculate the expected value\n", + " # Calculate the expected value. Ref: Sutton book eq. 
4.6.\n", " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", " # How much our value function changed (across any states)\n", " delta = max(delta, np.abs(v - V[s]))\n", @@ -74,10 +71,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", @@ -86,10 +81,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -98,7 +91,8 @@ "Value Function:\n", "[ 0. -13.99993529 -19.99990698 -21.99989761 -13.99993529\n", " -17.9999206 -19.99991379 -19.99991477 -19.99990698 -19.99991379\n", - " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569 0. ]\n", + " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569\n", + " 0. ]\n", "\n", "Reshaped Grid Value Function:\n", "[[ 0. -13.99993529 -19.99990698 -21.99989761]\n", @@ -121,10 +115,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "# Test: Make sure the evaluated policy is what we expected\n", @@ -135,9 +127,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -158,9 +148,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Evaluation.ipynb b/DP/Policy Evaluation.ipynb index e401c7759..381a58260 100644 --- a/DP/Policy Evaluation.ipynb +++ b/DP/Policy Evaluation.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 23, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -30,7 +30,7 @@ "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -41,9 +41,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", " theta: We stop evaluation once our value function change is less than theta for all states.\n", - " discount_factor: gamma discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -60,7 +62,7 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -71,9 +73,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -121,9 +121,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Iteration Solution.ipynb b/DP/Policy Iteration Solution.ipynb index cfb68a2e5..73009f000 100644 --- a/DP/Policy Iteration Solution.ipynb +++ b/DP/Policy Iteration Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -19,9 +17,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +26,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "# Taken from Policy Evaluation Exercise!\n", @@ -45,9 +39,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", - " theta: We stop evaluation one our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -76,10 +72,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", @@ -88,10 +82,10 @@ " until an optimal policy is found.\n", " \n", " Args:\n", - " env: The OpenAI envrionment.\n", + " env: The OpenAI environment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V). 
\n", @@ -100,6 +94,24 @@ " V is the value function for the optimal policy.\n", " \n", " \"\"\"\n", + "\n", + " def one_step_lookahead(state, V):\n", + " \"\"\"\n", + " Helper function to calculate the value for all action in a given state.\n", + " \n", + " Args:\n", + " state: The state to consider (int)\n", + " V: The value to use as an estimator, Vector of length env.nS\n", + " \n", + " Returns:\n", + " A vector of length env.nA containing the expected value of each action.\n", + " \"\"\"\n", + " A = np.zeros(env.nA)\n", + " for a in range(env.nA):\n", + " for prob, next_state, reward, done in env.P[state][a]:\n", + " A[a] += prob * (reward + discount_factor * V[next_state])\n", + " return A\n", + " \n", " # Start with a random policy\n", " policy = np.ones([env.nS, env.nA]) / env.nA\n", " \n", @@ -112,15 +124,12 @@ " \n", " # For each state...\n", " for s in range(env.nS):\n", - " # The best action we would take under the currect policy\n", + " # The best action we would take under the current policy\n", " chosen_a = np.argmax(policy[s])\n", " \n", " # Find the best action by one-step lookahead\n", " # Ties are resolved arbitarily\n", - " action_values = np.zeros(env.nA)\n", - " for a in range(env.nA):\n", - " for prob, next_state, reward, done in env.P[s][a]:\n", - " action_values[a] += prob * (reward + discount_factor * V[next_state])\n", + " action_values = one_step_lookahead(s, V)\n", " best_a = np.argmax(action_values)\n", " \n", " # Greedily update the policy\n", @@ -135,32 +144,30 @@ }, { "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Policy Probability Distribution:\n", - "[[ 1. 0. 0. 0.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 1. 0. 0. 0.]]\n", + "[[1. 0. 0. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [1. 0. 0. 
0.]]\n", "\n", "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", "[[0 3 3 2]\n", @@ -202,10 +209,8 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "# Test the value function\n", @@ -216,9 +221,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -239,9 +242,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Policy Iteration.ipynb b/DP/Policy Iteration.ipynb index d67b22505..fc87f291b 100644 --- a/DP/Policy Iteration.ipynb +++ b/DP/Policy Iteration.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -45,9 +45,11 @@ " Args:\n", " policy: [S, A] shaped matrix representing the policy.\n", " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", - " env.P[s][a] is a (prob, next_state, reward, done) tuple.\n", - " theta: We stop evaluation one our value function change is less than theta for all states.\n", - " discount_factor: lambda discount factor.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " Vector of length env.nS representing the value function.\n", @@ -78,7 +80,7 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -91,7 +93,7 @@ " env: The OpenAI envrionment.\n", " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", " policy, env, discount_factor.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V). 
\n", @@ -113,9 +115,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -180,9 +180,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -230,9 +228,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/README.md b/DP/README.md index bdb6fd086..a6dabe88c 100644 --- a/DP/README.md +++ b/DP/README.md @@ -28,19 +28,23 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 4: Dynamic Programming +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 4: Dynamic Programming ### Exercises - Implement Policy Evaluation in Python (Gridworld) - - [Exercise](Policy Evaluation.ipynb) - - [Solution](Policy Evaluation Solution.ipynb) + - [Exercise](Policy%20Evaluation.ipynb) + - [Solution](Policy%20Evaluation%20Solution.ipynb) - Implement Policy Iteration in Python (Gridworld) - - [Exercise](Policy Iteration.ipynb) - - [Solution](Policy Iteration Solution.ipynb) + - [Exercise](Policy%20Iteration.ipynb) + - [Solution](Policy%20Iteration%20Solution.ipynb) - Implement Value Iteration in Python (Gridworld) - - [Exercise](Value Iteration.ipynb) - - [Solution](Value Iteration Solution.ipynb) + - [Exercise](Value%20Iteration.ipynb) + - [Solution](Value%20Iteration%20Solution.ipynb) + +- Implement Gambler's Problem + - [Exercise](Gamblers%20Problem.ipynb) + - [Solution](Gamblers%20Problem%20Solution.ipynb) \ No newline at end of file diff --git a/DP/Value Iteration Solution.ipynb b/DP/Value Iteration Solution.ipynb index 7bc985d15..90ec96a17 100644 --- a/DP/Value Iteration Solution.ipynb +++ b/DP/Value Iteration Solution.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -18,10 +16,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", @@ -30,10 +26,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", @@ -41,10 +35,12 @@ " Value Iteration Algorithm.\n", " \n", " Args:\n", - " env: OpenAI environment. env.P represents the transition probabilities of the environment.\n", - " theta: Stopping threshold. If the value of all states changes less than theta\n", - " in one iteration we are done.\n", - " discount_factor: lambda time discount factor.\n", + " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of states in the environment. 
\n", + " env.nA is a number of actions in the environment.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function.\n", @@ -78,7 +74,7 @@ " best_action_value = np.max(A)\n", " # Calculate delta across all states seen so far\n", " delta = max(delta, np.abs(best_action_value - V[s]))\n", - " # Update the value function\n", + " # Update the value function. Ref: Sutton book eq. 4.10. \n", " V[s] = best_action_value \n", " # Check if we can stop \n", " if delta < theta:\n", @@ -98,32 +94,30 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, + "execution_count": 4, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Policy Probability Distribution:\n", - "[[ 1. 0. 0. 0.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 0. 1.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 0. 1. 0.]\n", - " [ 1. 0. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 0. 1. 0. 0.]\n", - " [ 1. 0. 0. 0.]]\n", + "[[1. 0. 0. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 0. 1. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [0. 1. 0. 0.]\n", + " [1. 0. 0. 0.]]\n", "\n", "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", "[[0 3 3 2]\n", @@ -163,12 +157,21 @@ "print(\"\")" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Test the value function\n", + "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", + "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -176,9 +179,9 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [Root]", + "display_name": "Python 3", "language": "python", - "name": "Python [Root]" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -190,9 +193,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DP/Value Iteration.ipynb b/DP/Value Iteration.ipynb index 6329d12f7..ff4bf15dd 100644 --- a/DP/Value Iteration.ipynb +++ b/DP/Value Iteration.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -20,7 +20,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -32,7 +32,7 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -41,10 +41,12 @@ " Value Iteration Algorithm.\n", " \n", " Args:\n", - " env: OpenAI environment. env.P represents the transition probabilities of the environment.\n", - " theta: Stopping threshold. 
If the value of all states changes less than theta\n", - " in one iteration we are done.\n", - " discount_factor: lambda time discount factor.\n", + " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", + " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", + " env.nS is a number of states in the environment. \n", + " env.nA is a number of actions in the environment.\n", + " theta: We stop evaluation once our value function change is less than theta for all states.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (policy, V) of the optimal policy and the optimal value function. \n", @@ -61,9 +63,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -128,9 +128,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "ename": "AssertionError", @@ -169,9 +167,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/Breakout Playground.ipynb b/DQN/Breakout Playground.ipynb index 5ff6a9e99..4e1a48ed8 100644 --- a/DQN/Breakout Playground.ipynb +++ b/DQN/Breakout Playground.ipynb @@ -73,7 +73,7 @@ ], "source": [ "print(\"Action space size: {}\".format(env.action_space.n))\n", - "print(env.get_action_meanings())\n", + "print(env.get_action_meanings()) # env.unwrapped.get_action_meanings() for gym 0.8.0 or later\n", "\n", "observation = env.reset()\n", "print(\"Observation space shape: {}\".format(observation.shape))\n", diff --git a/DQN/Deep Q Learning Solution.ipynb b/DQN/Deep Q Learning Solution.ipynb index 7dd832212..90881ea07 100644 --- a/DQN/Deep Q Learning Solution.ipynb +++ b/DQN/Deep Q Learning Solution.ipynb @@ -17,6 +17,7 @@ "import os\n", "import random\n", "import sys\n", + "import psutil\n", "import tensorflow as tf\n", "\n", "if \"../\" not in sys.path:\n", @@ -30,7 +31,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -41,7 +42,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -59,7 +60,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -78,7 +79,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] @@ -87,7 +88,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -116,7 +117,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -143,7 +144,7 @@ " gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl\n", " self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)\n", "\n", - " # Calcualte the loss\n", + " # Calculate the loss\n", " self.losses = tf.squared_difference(self.y_pl, self.action_predictions)\n", " self.loss = tf.reduce_mean(self.losses)\n", "\n", @@ -165,7 +166,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -179,7 +180,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", @@ -199,7 +200,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -234,30 +235,39 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ - "def copy_model_parameters(sess, estimator1, estimator2):\n", + "class ModelParametersCopier():\n", " \"\"\"\n", - " Copies the model parameters of one estimator to another.\n", - "\n", - " Args:\n", - " sess: Tensorflow session instance\n", - " estimator1: Estimator to copy the paramters from\n", - " estimator2: Estimator to copy the parameters to\n", + " Copy model parameters of one estimator to another.\n", " \"\"\"\n", - " e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]\n", - " e1_params = sorted(e1_params, key=lambda v: v.name)\n", - " e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]\n", - " e2_params = sorted(e2_params, key=lambda v: v.name)\n", - "\n", - " update_ops = []\n", - " for e1_v, e2_v in zip(e1_params, e2_params):\n", - " op = e2_v.assign(e1_v)\n", - " update_ops.append(op)\n", - "\n", - " sess.run(update_ops)" + " \n", + " def __init__(self, estimator1, estimator2):\n", + " \"\"\"\n", + " Defines copy-work operation graph. 
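The `gather_indices` construction a few lines above selects, for each row of the `[batch_size, num_actions]` prediction matrix, the Q-value of the action that was actually taken. A small NumPy illustration of the same index arithmetic, with made-up values, just to show the equivalence:

```python
import numpy as np

batch_size, num_actions = 3, 4
predictions = np.arange(batch_size * num_actions, dtype=np.float32).reshape(batch_size, num_actions)
actions = np.array([2, 0, 3])  # action taken in each batch row

# Same arithmetic as in the TensorFlow graph: row offset * row width + column index
gather_indices = np.arange(batch_size) * num_actions + actions
action_predictions = predictions.reshape(-1)[gather_indices]

# Equivalent to picking predictions[i, actions[i]] for every row i
assert np.allclose(action_predictions, predictions[np.arange(batch_size), actions])
```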
\n", + " Args:\n", + " estimator1: Estimator to copy the paramters from\n", + " estimator2: Estimator to copy the parameters to\n", + " \"\"\"\n", + " e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]\n", + " e1_params = sorted(e1_params, key=lambda v: v.name)\n", + " e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]\n", + " e2_params = sorted(e2_params, key=lambda v: v.name)\n", + "\n", + " self.update_ops = []\n", + " for e1_v, e2_v in zip(e1_params, e2_params):\n", + " op = e2_v.assign(e1_v)\n", + " self.update_ops.append(op)\n", + " \n", + " def make(self, sess):\n", + " \"\"\"\n", + " Makes copy.\n", + " Args:\n", + " sess: Tensorflow session instance\n", + " \"\"\"\n", + " sess.run(self.update_ops)" ] }, { @@ -294,7 +304,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -315,7 +325,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -331,7 +341,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -347,11 +357,17 @@ "\n", " # The replay memory\n", " replay_memory = []\n", + " \n", + " # Make model copier object\n", + " estimator_copy = ModelParametersCopier(q_estimator, target_estimator)\n", "\n", " # Keeps track of useful statistics\n", " stats = plotting.EpisodeStats(\n", " episode_lengths=np.zeros(num_episodes),\n", " episode_rewards=np.zeros(num_episodes))\n", + " \n", + " # For 'system/' summaries, usefull to check if currrent process looks healthy\n", + " current_process = psutil.Process()\n", "\n", " # Create directories for checkpoints and summaries\n", " checkpoint_dir = os.path.join(experiment_dir, \"checkpoints\")\n", @@ -422,14 +438,9 @@ " # Epsilon for this time step\n", " epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]\n", "\n", - " # Add epsilon to Tensorboard\n", - " episode_summary = tf.Summary()\n", - " episode_summary.value.add(simple_value=epsilon, tag=\"epsilon\")\n", - " q_estimator.summary_writer.add_summary(episode_summary, total_t)\n", - "\n", " # Maybe update the target estimator\n", " if total_t % update_target_estimator_every == 0:\n", - " copy_model_parameters(sess, q_estimator, target_estimator)\n", + " estimator_copy.make(sess)\n", " print(\"\\nCopied model parameters to target network.\")\n", "\n", " # Print out which step we're on, useful for debugging.\n", @@ -475,11 +486,14 @@ "\n", " # Add summaries to tensorboard\n", " episode_summary = tf.Summary()\n", - " episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name=\"episode_reward\", tag=\"episode_reward\")\n", - " episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name=\"episode_length\", tag=\"episode_length\")\n", - " q_estimator.summary_writer.add_summary(episode_summary, 
total_t)\n", + " episode_summary.value.add(simple_value=epsilon, tag=\"episode/epsilon\")\n", + " episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag=\"episode/reward\")\n", + " episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag=\"episode/length\")\n", + " episode_summary.value.add(simple_value=current_process.cpu_percent(), tag=\"system/cpu_usage_percent\")\n", + " episode_summary.value.add(simple_value=current_process.memory_percent(memtype=\"vms\"), tag=\"system/v_memeory_usage_percent\")\n", + " q_estimator.summary_writer.add_summary(episode_summary, i_episode)\n", " q_estimator.summary_writer.flush()\n", - "\n", + " \n", " yield total_t, plotting.EpisodeStats(\n", " episode_lengths=stats.episode_lengths[:i_episode+1],\n", " episode_rewards=stats.episode_rewards[:i_episode+1])\n", @@ -491,7 +505,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -504,7 +518,7 @@ "global_step = tf.Variable(0, name='global_step', trainable=False)\n", " \n", "# Create estimators\n", - "q_estimator = Estimator(scope=\"q\", summaries_dir=experiment_dir)\n", + "q_estimator = Estimator(scope=\"q_estimator\", summaries_dir=experiment_dir)\n", "target_estimator = Estimator(scope=\"target_q\")\n", "\n", "# State processor\n", @@ -531,6 +545,24 @@ "\n", " print(\"\\nEpisode Reward: {}\".format(stats.episode_rewards[-1]))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { @@ -549,9 +581,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/Deep Q Learning.ipynb b/DQN/Deep Q Learning.ipynb index d2a295cf1..2b77605c8 100644 --- a/DQN/Deep Q Learning.ipynb +++ b/DQN/Deep Q Learning.ipynb @@ -11,6 +11,7 @@ "%matplotlib inline\n", "\n", "import gym\n", + "from gym.wrappers import Monitor\n", "import itertools\n", "import numpy as np\n", "import os\n", @@ -28,9 +29,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -39,9 +38,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -58,7 +55,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -67,7 +64,7 @@ " self.output = tf.image.rgb_to_grayscale(self.input_state)\n", " self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)\n", " self.output = tf.image.resize_images(\n", - " self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", + " self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", " self.output = tf.squeeze(self.output)\n", "\n", " def process(self, sess, state):\n", @@ -77,7 +74,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] @@ -85,9 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -107,7 +102,7 @@ " summary_dir = os.path.join(summaries_dir, \"summaries_{}\".format(scope))\n", " if not os.path.exists(summary_dir):\n", " os.makedirs(summary_dir)\n", - " self.summary_writer = tf.train.SummaryWriter(summary_dir)\n", + " self.summary_writer = tf.summary.FileWriter(summary_dir)\n", "\n", " def _build_model(self):\n", " \"\"\"\n", @@ -115,7 +110,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -142,7 +137,7 @@ " gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl\n", " self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)\n", "\n", - " # Calcualte the loss\n", + " # Calculate the loss\n", " self.losses = tf.squared_difference(self.y_pl, self.action_predictions)\n", " self.loss = tf.reduce_mean(self.losses)\n", "\n", @@ -151,11 +146,11 @@ " self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())\n", "\n", " # Summaries for Tensorboard\n", - " self.summaries = tf.merge_summary([\n", - " tf.scalar_summary(\"loss\", self.loss),\n", - " tf.histogram_summary(\"loss_hist\", self.losses),\n", - " tf.histogram_summary(\"q_values_hist\", self.predictions),\n", - " tf.scalar_summary(\"max_q_value\", tf.reduce_max(self.predictions))\n", + " self.summaries = tf.summary.merge([\n", + " tf.summary.scalar(\"loss\", self.loss),\n", + " tf.summary.histogram(\"loss_hist\", self.losses),\n", + " tf.summary.histogram(\"q_values_hist\", self.predictions),\n", + " tf.summary.scalar(\"max_q_value\", tf.reduce_max(self.predictions))\n", " ])\n", "\n", "\n", @@ -165,7 +160,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -179,7 +174,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", @@ -198,9 +193,7 
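Several docstrings in this file are corrected to say that the network input is a stack of four 84x84 grayscale frames rather than raw RGB frames. One common way to maintain such a stack between environment steps is sketched below; these helpers are illustrative and not part of the diff:

```python
import numpy as np

def initial_stack(frame):
    """Start an episode by repeating the first processed 84x84 frame 4 times -> shape (84, 84, 4)."""
    return np.stack([frame] * 4, axis=2)

def push_frame(stacked_state, new_frame):
    """Drop the oldest frame and append the newest one along the channel axis."""
    return np.append(stacked_state[:, :, 1:], np.expand_dims(new_frame, axis=2), axis=2)

frame = np.zeros((84, 84), dtype=np.uint8)
state = initial_stack(frame)
state = push_frame(state, frame)
assert state.shape == (84, 84, 4)
```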
@@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -212,7 +205,7 @@ "sp = StateProcessor()\n", "\n", "with tf.Session() as sess:\n", - " sess.run(tf.initialize_all_variables())\n", + " sess.run(tf.global_variables_initializer())\n", " \n", " # Example observation batch\n", " observation = env.reset()\n", @@ -233,9 +226,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def copy_model_parameters(sess, estimator1, estimator2):\n", @@ -293,9 +284,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -315,7 +304,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -331,7 +320,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -391,9 +380,10 @@ " pass\n", "\n", " # Record videos\n", - " env.monitor.start(monitor_path,\n", - " resume=True,\n", - " video_callable=lambda count: count % record_video_every == 0)\n", + " env= Monitor(env,\n", + " directory=monitor_path,\n", + " resume=True,\n", + " video_callable=lambda count: count % record_video_every == 0)\n", "\n", " for i_episode in range(num_episodes):\n", "\n", @@ -467,9 +457,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -526,9 +514,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/Double DQN Solution.ipynb b/DQN/Double DQN Solution.ipynb index 22bb9ebc9..f53ca59a6 100644 --- a/DQN/Double DQN Solution.ipynb +++ b/DQN/Double DQN Solution.ipynb @@ -28,9 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = gym.envs.make(\"Breakout-v0\")" @@ -39,9 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions\n", @@ -58,7 +54,7 @@ "source": [ "class StateProcessor():\n", " \"\"\"\n", - " Processes a raw Atari iamges. Resizes it and converts it to grayscale.\n", + " Processes a raw Atari images. 
Resizes it and converts it to grayscale.\n", " \"\"\"\n", " def __init__(self):\n", " # Build the Tensorflow graph\n", @@ -77,7 +73,7 @@ " state: A [210, 160, 3] Atari RGB State\n", "\n", " Returns:\n", - " A processed [84, 84, 1] state representing grayscale values.\n", + " A processed [84, 84] state representing grayscale values.\n", " \"\"\"\n", " return sess.run(self.output, { self.input_state: state })" ] @@ -85,9 +81,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -115,7 +109,7 @@ " \"\"\"\n", "\n", " # Placeholders for our input\n", - " # Our input are 4 RGB frames of shape 160, 160 each\n", + " # Our input are 4 grayscale frames of shape 84, 84 each\n", " self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name=\"X\")\n", " # The TD target value\n", " self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name=\"y\")\n", @@ -142,7 +136,7 @@ "\n", " Args:\n", " sess: Tensorflow session\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", "\n", " Returns:\n", " Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated \n", @@ -156,7 +150,7 @@ "\n", " Args:\n", " sess: Tensorflow session object\n", - " s: State input of shape [batch_size, 4, 160, 160, 3]\n", + " s: State input of shape [batch_size, 4, 84, 84, 1]\n", " a: Chosen actions of shape [batch_size]\n", " y: Targets of shape [batch_size]\n", "\n", @@ -175,9 +169,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# For Testing....\n", @@ -210,9 +202,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def copy_model_parameters(sess, estimator1, estimator2):\n", @@ -270,9 +260,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def deep_q_learning(sess,\n", @@ -292,7 +280,7 @@ " batch_size=32,\n", " record_video_every=50):\n", " \"\"\"\n", - " Q-Learning algorithm for fff-policy TD control using Function Approximation.\n", + " Q-Learning algorithm for off-policy TD control using Function Approximation.\n", " Finds the optimal greedy policy while following an epsilon-greedy policy.\n", "\n", " Args:\n", @@ -308,7 +296,7 @@ " the reply memory.\n", " update_target_estimator_every: Copy parameters from the Q estimator to the \n", " target estimator every N steps\n", - " discount_factor: Lambda time discount factor\n", + " discount_factor: Gamma discount factor\n", " epsilon_start: Chance to sample a random action when taking an action.\n", " Epsilon is decayed over time and this is the start value\n", " epsilon_end: The final minimum value of epsilon after decaying is done\n", @@ -472,9 +460,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()\n", @@ -531,9 +517,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/DQN/README.md b/DQN/README.md index 1528b3d0a..07c887bbc 100644 --- a/DQN/README.md +++ b/DQN/README.md @@ -23,7 +23,7 @@ **Required:** - [Human-Level Control through Deep 
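As the README notes below, Double-Q Learning is a minimal change to the DQN above: only the bootstrap target changes, with the online network choosing the argmax action and the target network evaluating it. A NumPy sketch of the standard distinction between the two targets (variable names are illustrative; done flags omitted for brevity):

```python
import numpy as np

def vanilla_dqn_target(reward, q_next_target, gamma=0.99):
    # Max over the target network's own estimates
    return reward + gamma * np.amax(q_next_target, axis=1)

def double_dqn_target(reward, q_next_online, q_next_target, gamma=0.99):
    # Online network picks the action, target network evaluates it
    best_actions = np.argmax(q_next_online, axis=1)
    return reward + gamma * q_next_target[np.arange(len(reward)), best_actions]

r = np.array([0.0, 1.0])
q_online = np.array([[2.0, 1.0], [0.1, 0.5]])
q_target = np.array([[0.3, 0.7], [0.9, 0.2]])
print(vanilla_dqn_target(r, q_target))            # uses the max of q_target
print(double_dqn_target(r, q_online, q_target))   # uses q_target at the argmax of q_online
```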
Reinforcement Learning](http://www.readcube.com/articles/10.1038/nature14236) -- [Demystifying Deep Reinforcement Learning](https://www.nervanasys.com/demystifying-deep-reinforcement-learning/) +- [Demystifying Deep Reinforcement Learning](https://ai.intel.com/demystifying-deep-reinforcement-learning/) - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) **Optional:** @@ -39,11 +39,11 @@ ### Exercises -- [OpenAI Gym Atari Environment Playground](Breakout Playground.ipynb) +- Get familiar with the [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) - Deep-Q Learning for Atari Games - - [Exercise](Deep Q Learning.ipynb) - - [Solution](Deep Q Learning Solution.ipynb) + - [Exercise](Deep%20Q%20Learning.ipynb) + - [Solution](Deep%20Q%20Learning%20Solution.ipynb) - Double-Q Learning - This is a minimal change to Q-Learning so use the same exercise as above - - [Solution](Double DQN Solution.ipynb) + - [Solution](Double%20DQN%20Solution.ipynb) - Prioritized Experience Replay (WIP) diff --git a/DQN/dqn.py b/DQN/dqn.py index 7b459240d..9d6532a8a 100755 --- a/DQN/dqn.py +++ b/DQN/dqn.py @@ -1,4 +1,5 @@ import gym +from gym.wrappers import Monitor import itertools import numpy as np import os @@ -19,7 +20,7 @@ class StateProcessor(): """ - Processes a raw Atari iamges. Resizes it and converts it to grayscale. + Processes a raw Atari images. Resizes it and converts it to grayscale. """ def __init__(self): # Build the Tensorflow graph @@ -28,7 +29,7 @@ def __init__(self): self.output = tf.image.rgb_to_grayscale(self.input_state) self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) self.output = tf.image.resize_images( - self.output, 84, 84, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) self.output = tf.squeeze(self.output) def process(self, sess, state): @@ -38,7 +39,7 @@ def process(self, sess, state): state: A [210, 160, 3] Atari RGB State Returns: - A processed [84, 84, 1] state representing grayscale values. + A processed [84, 84] state representing grayscale values. 
""" return sess.run(self.output, { self.input_state: state }) @@ -59,7 +60,7 @@ def __init__(self, scope="estimator", summaries_dir=None): summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope)) if not os.path.exists(summary_dir): os.makedirs(summary_dir) - self.summary_writer = tf.train.SummaryWriter(summary_dir) + self.summary_writer = tf.summary.FileWriter(summary_dir) def _build_model(self): """ @@ -94,7 +95,7 @@ def _build_model(self): gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices) - # Calcualte the loss + # Calculate the loss self.losses = tf.squared_difference(self.y_pl, self.action_predictions) self.loss = tf.reduce_mean(self.losses) @@ -103,11 +104,11 @@ def _build_model(self): self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) # Summaries for Tensorboard - self.summaries = tf.merge_summary([ - tf.scalar_summary("loss", self.loss), - tf.histogram_summary("loss_hist", self.losses), - tf.histogram_summary("q_values_hist", self.predictions), - tf.scalar_summary("max_q_value", tf.reduce_max(self.predictions)) + self.summaries = tf.summary.merge([ + tf.summary.scalar("loss", self.loss), + tf.summary.histogram("loss_hist", self.losses), + tf.summary.histogram("q_values_hist", self.predictions), + tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions)) ]) @@ -207,7 +208,7 @@ def deep_q_learning(sess, batch_size=32, record_video_every=50): """ - Q-Learning algorithm for fff-policy TD control using Function Approximation. + Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: @@ -223,7 +224,7 @@ def deep_q_learning(sess, the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps - discount_factor: Lambda time discount factor + discount_factor: Gamma discount factor epsilon_start: Chance to sample a random action when taking an action. 
Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done @@ -292,9 +293,11 @@ def deep_q_learning(sess, state = next_state # Record videos - env.monitor.start(monitor_path, - resume=True, - video_callable=lambda count: count % record_video_every == 0) + # Use the gym env Monitor wrapper + env = Monitor(env, + directory=monitor_path, + resume=True, + video_callable=lambda count: count % record_video_every ==0) for i_episode in range(num_episodes): @@ -398,7 +401,7 @@ def deep_q_learning(sess, state_processor = StateProcessor() with tf.Session() as sess: - sess.run(tf.initialize_all_variables()) + sess.run(tf.global_variables_initializer()) for t, stats in deep_q_learning(sess, env, q_estimator=q_estimator, diff --git a/FA/MountainCar Playground.ipynb b/FA/MountainCar Playground.ipynb index 9b4fe3a36..914f7a5a0 100644 --- a/FA/MountainCar Playground.ipynb +++ b/FA/MountainCar Playground.ipynb @@ -71,7 +71,7 @@ "plt.figure()\n", "plt.imshow(env.render(mode='rgb_array'))\n", "\n", - "env.render(close=True)" + "env.close()" ] }, { diff --git a/FA/Q-Learning with Value Function Approximation Solution.ipynb b/FA/Q-Learning with Value Function Approximation Solution.ipynb index a271d6a63..49c62ca37 100644 --- a/FA/Q-Learning with Value Function Approximation Solution.ipynb +++ b/FA/Q-Learning with Value Function Approximation Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -31,9 +29,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -50,9 +46,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -74,7 +68,7 @@ "scaler = sklearn.preprocessing.StandardScaler()\n", "scaler.fit(observation_examples)\n", "\n", - "# Used to converte a state to a featurizes represenation.\n", + "# Used to convert a state to a featurizes represenation.\n", "# We use RBF kernels with different variances to cover different parts of the space\n", "featurizer = sklearn.pipeline.FeatureUnion([\n", " (\"rbf1\", RBFSampler(gamma=5.0, n_components=100)),\n", @@ -88,9 +82,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class Estimator():\n", @@ -151,9 +143,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def make_epsilon_greedy_policy(estimator, epsilon, nA):\n", @@ -182,9 +172,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):\n", @@ -196,7 +184,7 @@ " env: OpenAI environment.\n", " estimator: Action-Value function estimator\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
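The featurizer shown above turns a raw MountainCar observation into a fixed-length RBF feature vector that a linear model can learn from. A standalone sketch of the same scikit-learn pipeline, fitted on synthetic observations rather than the gym environment so it runs on its own (the extra `rbf2`–`rbf4` bandwidths and the `featurize_state` helper are illustrative choices, not quoted from the notebook):

```python
import numpy as np
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

# Synthetic 2-D observations standing in for (position, velocity) samples
observation_examples = np.random.randn(10000, 2)

scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Several RBF kernels with different bandwidths to cover different parts of the space
featurizer = sklearn.pipeline.FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
    ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
    ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
    ("rbf4", RBFSampler(gamma=0.5, n_components=100)),
])
featurizer.fit(scaler.transform(observation_examples))

def featurize_state(state):
    """Map a single raw observation to its RBF feature vector."""
    scaled = scaler.transform([state])
    return featurizer.transform(scaled)[0]

print(featurize_state(np.array([0.1, -0.2])).shape)  # (400,)
```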
Float betwen 0 and 1.\n", " epsilon_decay: Each episode, epsilon is decayed by this factor\n", " \n", @@ -283,9 +271,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -305,9 +291,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -384,9 +368,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/FA/Q-Learning with Value Function Approximation.ipynb b/FA/Q-Learning with Value Function Approximation.ipynb index e83b6bbb0..442605562 100644 --- a/FA/Q-Learning with Value Function Approximation.ipynb +++ b/FA/Q-Learning with Value Function Approximation.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -50,9 +48,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -89,7 +85,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -149,7 +145,7 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -180,7 +176,7 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -193,7 +189,7 @@ " env: OpenAI environment.\n", " estimator: Action-Value function estimator\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
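The epsilon parameter documented above feeds `make_epsilon_greedy_policy`, which wraps a Q-value estimator in a stochastic policy. A minimal sketch of that standard construction, assuming a hypothetical estimator with a `predict(observation)` method that returns one value per action:

```python
import numpy as np

def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """Return a policy function mapping an observation to action probabilities.

    With probability epsilon a random action is chosen; otherwise the action with
    the highest estimated Q-value receives the remaining probability mass.
    """
    def policy_fn(observation):
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(observation)
        best_action = np.argmax(q_values)
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn

class DummyEstimator:
    """Stand-in estimator for illustration only."""
    def predict(self, observation):
        return np.array([0.1, 0.5, 0.2])

policy = make_epsilon_greedy_policy(DummyEstimator(), epsilon=0.1, nA=3)
probs = policy(observation=None)
action = np.random.choice(len(probs), p=probs)  # sample an action from the probabilities
```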
Float betwen 0 and 1.\n", " epsilon_decay: Each episode, epsilon is decayed by this factor\n", " \n", @@ -237,9 +233,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -259,9 +253,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -326,9 +318,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/FA/README.md b/FA/README.md index 9eb97101f..a8456622d 100644 --- a/FA/README.md +++ b/FA/README.md @@ -25,8 +25,8 @@ **Required:** - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 9: On-policy Prediction with Approximation -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 10: On-policy Control with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 9: On-policy Prediction with Approximation +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 10: On-policy Control with Approximation **Optional:** @@ -35,6 +35,8 @@ ### Exercises +- Get familiar with the [Mountain Car Playground](MountainCar%20Playground.ipynb) + - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation - - [Exercise](Q-Learning with Value Function Approximation.ipynb) - - [Solution](Q-Learning with Value Function Approximation Solution.ipynb) + - [Exercise](Q-Learning%20with%20Value%20Function%20Approximation.ipynb) + - [Solution](Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) diff --git a/Introduction/README.md b/Introduction/README.md index f476fabb9..ca8897826 100644 --- a/Introduction/README.md +++ b/Introduction/README.md @@ -17,7 +17,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 1: The Reinforcement Learning Problem +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 1: The Reinforcement Learning Problem - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) - [OpenAI Gym Tutorial](https://gym.openai.com/docs) diff --git a/MC/Blackjack Playground.ipynb b/MC/Blackjack Playground.ipynb index cbb6c40c8..412322175 100644 --- a/MC/Blackjack Playground.ipynb +++ b/MC/Blackjack Playground.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 419, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -17,10 +15,8 @@ }, { "cell_type": "code", - "execution_count": 420, - "metadata": { - "collapsed": false - }, + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "env = BlackjackEnv()" @@ -28,151 +24,139 @@ }, { "cell_type": "code", - "execution_count": 422, - "metadata": { - "collapsed": false - }, + 
"execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Player Score: 17 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 19 (Usable Ace: False), Dealer Score: 5\n", "Taking action: Hit\n", - "Player Score: 18 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 27 (Usable Ace: False), Dealer Score: 5\n", "Game end. Reward: -1.0\n", "\n", - "Player Score: 6 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 26 (Usable Ace: False), Dealer Score: 9\n", - "Game end. Reward: -1.0\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Game end. Reward: 0.0\n", "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 6\n", - "Taking action: Hit\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 6\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", "Game end. Reward: 1.0\n", "\n", - "Player Score: 17 (Usable Ace: True), Dealer Score: 8\n", + "Player Score: 14 (Usable Ace: True), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", + "Player Score: 19 (Usable Ace: True), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 22 (Usable Ace: False), Dealer Score: 8\n", - "Game end. Reward: -1.0\n", - "\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 8\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 27 (Usable Ace: False), Dealer Score: 8\n", - "Game end. Reward: -1.0\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", "\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Taking action: Stick\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", + "Game end. Reward: 1.0\n", + "\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 6\n", "Taking action: Hit\n", - "Player Score: 28 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 27 (Usable Ace: False), Dealer Score: 6\n", "Game end. Reward: -1.0\n", "\n", - "Player Score: 13 (Usable Ace: False), Dealer Score: 7\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 3\n", "Taking action: Hit\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 3\n", "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 7\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 3\n", "Game end. Reward: -1.0\n", "\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 25 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", "Game end. 
Reward: -1.0\n", "\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", + "Taking action: Hit\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", "Game end. Reward: 1.0\n", "\n", - "Player Score: 12 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: True), Dealer Score: 10\n", - "Game end. Reward: 0.0\n", + "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", + "Game end. Reward: 1.0\n", "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", "Game end. Reward: -1.0\n", "\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", + "Player Score: 14 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 22 (Usable Ace: False), Dealer Score: 4\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", "Game end. Reward: -1.0\n", "\n", - "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Stick\n", - "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: 0.0\n", - "\n", - "Player Score: 4 (Usable Ace: False), Dealer Score: 3\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 3\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 24 (Usable Ace: False), Dealer Score: 3\n", + "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", "Game end. Reward: -1.0\n", "\n", - "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Player Score: 16 (Usable Ace: True), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: True), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 18 (Usable Ace: False), Dealer Score: 8\n", + "Taking action: Hit\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", "Game end. Reward: 1.0\n", "\n", - "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", - "Taking action: Hit\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", - "Taking action: Hit\n", "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Stick\n", "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", - "Game end. Reward: 1.0\n", + "Game end. 
Reward: -1.0\n", "\n", - "Player Score: 9 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", "Taking action: Hit\n", - "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", "Game end. Reward: -1.0\n", "\n", - "Player Score: 12 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 12 (Usable Ace: False), Dealer Score: 4\n", "Taking action: Hit\n", - "Player Score: 15 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 16 (Usable Ace: False), Dealer Score: 4\n", "Taking action: Hit\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 24 (Usable Ace: False), Dealer Score: 4\n", + "Game end. Reward: -1.0\n", + "\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", "Taking action: Stick\n", - "Player Score: 21 (Usable Ace: False), Dealer Score: 5\n", + "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", "Game end. Reward: 1.0\n", "\n", - "Player Score: 11 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 13 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 17 (Usable Ace: False), Dealer Score: 9\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 7\n", "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 9\n", - "Taking action: Hit\n", - "Player Score: 29 (Usable Ace: False), Dealer Score: 9\n", - "Game end. Reward: -1.0\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", + "Taking action: Stick\n", + "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", + "Game end. Reward: 1.0\n", "\n", - "Player Score: 14 (Usable Ace: False), Dealer Score: 7\n", - "Taking action: Hit\n", - "Player Score: 19 (Usable Ace: False), Dealer Score: 7\n", + "Player Score: 15 (Usable Ace: False), Dealer Score: 8\n", "Taking action: Hit\n", - "Player Score: 29 (Usable Ace: False), Dealer Score: 7\n", + "Player Score: 23 (Usable Ace: False), Dealer Score: 8\n", "Game end. Reward: -1.0\n", "\n" ] @@ -201,6 +185,13 @@ " print(\"Game end. Reward: {}\\n\".format(float(reward)))\n", " break" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -219,9 +210,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb index 4484eb2f6..40af11f40 100644 --- a/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies Solution.ipynb @@ -28,7 +28,7 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -81,14 +81,14 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", " Q is a dictionary mapping state -> action values.\n", - " policy is a function taht takes an observation as an argument and returns\n", + " policy is a function that takes an observation as an argument and returns\n", " action probabilities\n", " \"\"\"\n", " \n", @@ -139,7 +139,7 @@ " returns_count[sa_pair] += 1.0\n", " Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]\n", " \n", - " # The policy is improved implicitly by changing the Q dictionar\n", + " # The policy is improved implicitly by changing the Q dictionary\n", " \n", " return Q, policy" ] @@ -147,9 +147,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +164,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -231,5 +227,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Control with Epsilon-Greedy Policies.ipynb b/MC/MC Control with Epsilon-Greedy Policies.ipynb index dab7af2ac..257a84b44 100644 --- a/MC/MC Control with Epsilon-Greedy Policies.ipynb +++ b/MC/MC Control with Epsilon-Greedy Policies.ipynb @@ -28,7 +28,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -39,7 +39,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -79,14 +79,14 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " epsilon: Chance the sample a random action. 
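The Monte Carlo update above maintains running sums and counts of sampled returns for each (state, action) pair. The same estimate can be written as an incremental mean, Q <- Q + (G - Q) / N, which is how it is usually presented in Sutton & Barto. A tiny sketch showing the equivalence (the return values are made up):

```python
import numpy as np

returns = [1.0, -1.0, 0.0, 1.0]  # sampled first-visit returns G for one (state, action) pair

# Running-sum form, as in the notebook
returns_sum, returns_count = 0.0, 0.0
for G in returns:
    returns_sum += G
    returns_count += 1.0
q_sum_form = returns_sum / returns_count

# Incremental-mean form: Q <- Q + (G - Q) / N
q_incremental, n = 0.0, 0
for G in returns:
    n += 1
    q_incremental += (G - q_incremental) / n

assert np.isclose(q_sum_form, q_incremental)
```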
Float betwen 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", " Q is a dictionary mapping state -> action values.\n", - " policy is a function taht takes an observation as an argument and returns\n", + " policy is a function that takes an observation as an argument and returns\n", " action probabilities\n", " \"\"\"\n", " \n", @@ -112,7 +112,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -123,7 +123,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -148,23 +148,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Prediction Solution.ipynb b/MC/MC Prediction Solution.ipynb index b6cc24f95..25da5f3ca 100644 --- a/MC/MC Prediction Solution.ipynb +++ b/MC/MC Prediction Solution.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -29,7 +29,7 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -40,7 +40,7 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -52,8 +52,8 @@ " Args:\n", " policy: A function that maps an observation to action probabilities.\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A dictionary that maps from state -> value.\n", @@ -107,7 +107,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -123,7 +123,6 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [ @@ -220,5 +219,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/MC Prediction.ipynb b/MC/MC Prediction.ipynb index 17c8cf64f..13b3da809 100644 --- a/MC/MC Prediction.ipynb +++ b/MC/MC Prediction.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -29,7 +29,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -40,7 +40,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -52,8 +52,8 @@ " Args:\n", " policy: A function that maps an observation to action probabilities.\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", - " discount_factor: Lambda discount factor.\n", + " num_episodes: Number of episodes to sample.\n", + " discount_factor: Gamma discount factor.\n", " \n", " 
Returns:\n", " A dictionary that maps from state -> value.\n", @@ -78,7 +78,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -87,14 +87,14 @@ " A policy that sticks if the player score is > 20 and hits otherwise.\n", " \"\"\"\n", " score, dealer_score, usable_ace = observation\n", - " return np.array([1.0, 0.0]) if score >= 20 else np.array([0.0, 1.0])" + " return 0 if score >= 20 else 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, + "collapsed": true, "scrolled": false }, "outputs": [], @@ -118,23 +118,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb index 2baf04377..41dad0fe6 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -28,7 +28,7 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -92,7 +92,7 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -103,10 +103,10 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", + " num_episodes: Number of episodes to sample.\n", " behavior_policy: The behavior to follow while generating episodes.\n", " A function that given an observation returns a vector of probabilities for each action.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", @@ -171,9 +171,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -191,9 +189,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -252,9 +248,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.0" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb index 5cd2b408c..b93408711 100644 --- a/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb +++ b/MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb @@ -28,7 +28,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -64,7 +64,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + 
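The off-policy notebooks above estimate action values for a greedy target policy from episodes generated by a separate behavior policy. For orientation, a sketch of the standard weighted importance sampling update these exercises are built around (Sutton & Barto, Chapter 5); the episode format and `behavior_probs` callable are illustrative, not quoted from the solution:

```python
from collections import defaultdict
import numpy as np

nA = 2
Q = defaultdict(lambda: np.zeros(nA))   # action-value estimates
C = defaultdict(lambda: np.zeros(nA))   # cumulative importance-sampling weights

def update_from_episode(episode, behavior_probs, discount_factor=1.0):
    """episode: list of (state, action, reward); behavior_probs(state) -> action probabilities."""
    G, W = 0.0, 1.0
    # Work backwards through the episode, accumulating the return
    for state, action, reward in reversed(episode):
        G = discount_factor * G + reward
        C[state][action] += W
        Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
        # The target policy is greedy w.r.t. Q; once the behavior action disagrees
        # with it, the importance weight for all earlier steps is zero.
        if action != np.argmax(Q[state]):
            break
        W *= 1.0 / behavior_probs(state)[action]

update_from_episode([("s0", 1, 0.0), ("s1", 0, 1.0)],
                    behavior_probs=lambda s: np.ones(nA) / nA)
```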
"collapsed": true }, "outputs": [], "source": [ @@ -90,7 +90,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -101,10 +101,10 @@ " \n", " Args:\n", " env: OpenAI gym environment.\n", - " num_episodes: Nubmer of episodes to sample.\n", + " num_episodes: Number of episodes to sample.\n", " behavior_policy: The behavior to follow while generating episodes.\n", " A function that given an observation returns a vector of probabilities for each action.\n", - " discount_factor: Lambda discount factor.\n", + " discount_factor: Gamma discount factor.\n", " \n", " Returns:\n", " A tuple (Q, policy).\n", @@ -129,7 +129,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -141,7 +141,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ @@ -180,9 +180,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.0" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/MC/README.md b/MC/README.md index 5ed660915..8f246c38d 100644 --- a/MC/README.md +++ b/MC/README.md @@ -26,7 +26,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 5: Monte Carlo Methods +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 5: Monte Carlo Methods **Optional:** @@ -37,13 +37,13 @@ ### Exercises -- [Get familiar with the Blackjack environment (Blackjack-v0)](Blackjack Playground.ipynb) +- Get familiar with the [Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) - Implement the Monte Carlo Prediction to estimate state-action values - - [Exercise](MC Prediction.ipynb) - - [Solution](MC Prediction Solution.ipynb) + - [Exercise](MC%20Prediction.ipynb) + - [Solution](MC%20Prediction%20Solution.ipynb) - Implement the on-policy first-visit Monte Carlo Control algorithm - - [Exercise](MC Control with Epsilon-Greedy Policies.ipynb) - - [Solution](MC Control with Epsilon-Greedy Policies Solution.ipynb) + - [Exercise](MC%20Control%20with%20Epsilon-Greedy%20Policies.ipynb) + - [Solution](MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) - Implement the off-policy every-visit Monte Carlo Control using Weighted Important Sampling algorithm - - [Exercise](Off-Policy MC Control with Weighted Importance Sampling.ipynb) - - [Solution](Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb) \ No newline at end of file + - [Exercise](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling.ipynb) + - [Solution](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) diff --git a/MDP/README.md b/MDP/README.md index 404cb141b..08e73d072 100644 --- a/MDP/README.md +++ b/MDP/README.md @@ -25,7 +25,7 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 3: Finite Markov Decision Processes +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 3: Finite Markov Decision Processes - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), 
[slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) diff --git a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb index 0e952a07c..0a8fb509e 100644 --- a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb +++ b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -29,9 +27,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -88,9 +84,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class ValueEstimator():\n", @@ -145,7 +139,7 @@ " Args:\n", " env: OpenAI environment.\n", " estimator_policy: Policy Function to be optimized \n", - " estimator_value: Value function approximator, used as a baseline\n", + " estimator_value: Value function approximator, used as a critic\n", " num_episodes: Number of episodes to run for\n", " discount_factor: Time-discount factor\n", " \n", @@ -209,9 +203,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -238,9 +230,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -306,9 +296,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.0" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb index 4291d5551..fb7707846 100644 --- a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb +++ b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb @@ -161,7 +161,7 @@ " Transition = collections.namedtuple(\"Transition\", [\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n", " \n", " for i_episode in range(num_episodes):\n", - " # Reset the environment and pick the fisrst action\n", + " # Reset the environment and pick the first action\n", " state = env.reset()\n", " \n", " episode = []\n", @@ -196,11 +196,11 @@ " for t, transition in enumerate(episode):\n", " # The return after this timestep\n", " total_return = sum(discount_factor**i * t.reward for i, t in enumerate(episode[t:]))\n", - " # Update our value estimator\n", - " estimator_value.update(transition.state, total_return)\n", " # Calculate baseline/advantage\n", " baseline_value = estimator_value.predict(transition.state) \n", " advantage = total_return - baseline_value\n", + " # Update our value estimator\n", + " estimator_value.update(transition.state, total_return)\n", " # Update our policy estimator\n", " estimator_policy.update(transition.state, advantage, transition.action)\n", " \n", diff --git a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb index 4cbc43d27..6b34a0b62 100644 --- a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb +++ b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb @@ -233,7 +233,7 @@ " Args:\n", " env: OpenAI environment.\n", " estimator_policy: Policy Function to be optimized 
\n", - " estimator_value: Value function approximator, used as a baseline\n", + " estimator_value: Value function approximator, used as a critic\n", " num_episodes: Number of episodes to run for\n", " discount_factor: Time-discount factor\n", " \n", @@ -343,7 +343,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "plotting.plot_episode_stats(stats, smoothing_window=10)" @@ -384,7 +386,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/PolicyGradient/README.md b/PolicyGradient/README.md index 4921e0cd6..e8e793b77 100644 --- a/PolicyGradient/README.md +++ b/PolicyGradient/README.md @@ -36,13 +36,13 @@ **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 11: Policy Gradient Methods (Under Construction) +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 13: Policy Gradient Methods - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) - [Deep Deterministic Policy Gradients in TensorFlow](http://pemami4911.github.io/blog_posts/2016/08/21/ddpg-rl.html) - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) -- [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](https://gym.openai.com/docs/rl#policy-gradients) +- [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](http://web.archive.org/web/20161029135055/https://gym.openai.com/docs/rl#id16) @@ -50,13 +50,13 @@ - REINFORCE with Baseline - Exercise - - [Solution](CliffWalk REINFORCE with Baseline Solution.ipynb) + - [Solution](CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) - Actor-Critic with Baseline - Exercise - - [Solution](CliffWalk Actor-Critic Solution.ipynb) + - [Solution](CliffWalk%20Actor%20Critic%20Solution.ipynb) - Actor-Critic with Baseline for Continuous Action Spaces - Exercise - - [Solution](Continuous MountainCar Actor-Critic Solution.ipynb) + - [Solution](Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (WIP) - Asynchronous Advantage Actor-Critic (A3C) diff --git a/PolicyGradient/a3c/train.py b/PolicyGradient/a3c/train.py index 28064e506..286ca7a9d 100755 --- a/PolicyGradient/a3c/train.py +++ b/PolicyGradient/a3c/train.py @@ -125,7 +125,7 @@ def make_env(wrap=True): # Start worker threads worker_threads = [] for worker in workers: - worker_fn = lambda: worker.run(sess, coord, FLAGS.t_max) + worker_fn = lambda worker=worker: worker.run(sess, coord, FLAGS.t_max) t = threading.Thread(target=worker_fn) t.start() worker_threads.append(t) diff --git a/PolicyGradient/a3c/worker.py b/PolicyGradient/a3c/worker.py index 5f310ac3c..6371558f2 100644 --- a/PolicyGradient/a3c/worker.py +++ b/PolicyGradient/a3c/worker.py @@ -85,7 +85,7 @@ def __init__(self, name, env, policy_net, value_net, global_counter, discount_fa # Op to copy params from global policy/valuenets self.copy_params_op = make_copy_params_op( tf.contrib.slim.get_variables(scope="global", 
collection=tf.GraphKeys.TRAINABLE_VARIABLES), - tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES)) + tf.contrib.slim.get_variables(scope=self.name+'/', collection=tf.GraphKeys.TRAINABLE_VARIABLES)) self.vnet_train_op = make_train_op(self.value_net, self.global_value_net) self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net) diff --git a/README.md b/README.md index fd2e42323..8a89bd765 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. @@ -26,21 +26,21 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https ### List of Implemented Algorithms -- [Dynamic Programming Policy Evaluation](DP/Policy Evaluation Solution.ipynb) -- [Dynamic Programming Policy Iteration](DP/Policy Iteration Solution.ipynb) -- [Dynamic Programming Value Iteration](DP/Value Iteration Solution.ipynb) -- [Monte Carlo Prediction](MC/MC Prediction Solution.ipynb) -- [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC Control with Epsilon-Greedy Policies Solution.ipynb) -- [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy MC Control with Weighted Importance Sampling Solution.ipynb) -- [SARSA (On Policy TD Learning)](TD/SARSA Solution.ipynb) -- [Q-Learning (Off Policy TD Learning)](TD/Q-Learning Solution.ipynb) -- [Q-Learning with Linear Function Approximation](FA/Q-Learning with Value Function Approximation Solution.ipynb) -- [Deep Q-Learning for Atari Games](DQN/Deep Q Learning Solution.ipynb) -- [Double Deep-Q Learning for Atari Games](DQN/Double DQN Solution.ipynb) +- [Dynamic Programming Policy Evaluation](DP/Policy%20Evaluation%20Solution.ipynb) +- [Dynamic Programming Policy Iteration](DP/Policy%20Iteration%20Solution.ipynb) +- [Dynamic Programming Value Iteration](DP/Value%20Iteration%20Solution.ipynb) +- [Monte Carlo Prediction](MC/MC%20Prediction%20Solution.ipynb) +- [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) +- [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) +- [SARSA (On Policy TD Learning)](TD/SARSA%20Solution.ipynb) +- [Q-Learning (Off Policy TD Learning)](TD/Q-Learning%20Solution.ipynb) +- [Q-Learning with Linear Function Approximation](FA/Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) +- [Deep Q-Learning for Atari Games](DQN/Deep%20Q%20Learning%20Solution.ipynb) +- [Double Deep-Q Learning for Atari Games](DQN/Double%20DQN%20Solution.ipynb) - Deep Q-Learning with Prioritized Experience Replay (WIP) -- [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb) -- [Policy Gradient: Actor Critic 
with Baseline](PolicyGradient/CliffWalk Actor Critic Solution.ipynb) -- [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb) +- [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) +- [Policy Gradient: Actor Critic with Baseline](PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb) +- [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) - Deterministic Policy Gradients for Continuous Action Spaces (WIP) - Deep Deterministic Policy Gradients (DDPG) (WIP) - [Asynchronous Advantage Actor Critic (A3C)](PolicyGradient/a3c) @@ -50,13 +50,15 @@ All code is written in Python 3 and uses RL environments from [OpenAI Gym](https Textbooks: -- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) +- [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) Classes: - [David Silver's Reinforcement Learning Course (UCL, 2015)](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) - [CS294 - Deep Reinforcement Learning (Berkeley, Fall 2015)](http://rll.berkeley.edu/deeprlcourse/) - [CS 8803 - Reinforcement Learning (Georgia Tech)](https://www.udacity.com/course/reinforcement-learning--ud600) +- [CS885 - Reinforcement Learning (UWaterloo), Spring 2018](https://cs.uwaterloo.ca/~ppoupart/teaching/cs885-spring18/) +- [CS294-112 - Deep Reinforcement Learning (UC Berkeley)](http://rail.eecs.berkeley.edu/deeprlcourse/) Talks/Tutorials: @@ -66,6 +68,9 @@ Talks/Tutorials: - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) - [John Schulman - Deep Reinforcement Learning (4 Lectures)](https://www.youtube.com/playlist?list=PLjKEIQlKCTZYN3CYBlj8r58SbNorobqcp) - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) +- [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) +- [Advanced Deep Learning & Reinforcement Learning (UCL 2018, DeepMind)](https://www.youtube.com/playlist?list=PLqYmG7hTraZDNJre23vqCGIVpfZ_K2RZs) +- [Deep RL Bootcamp](https://sites.google.com/view/deep-rl-bootcamp/lectures) Other Projects: diff --git a/TD/Cliff Environment Playground.ipynb b/TD/Cliff Environment Playground.ipynb index d50da42b6..414cf811d 100644 --- a/TD/Cliff Environment Playground.ipynb +++ b/TD/Cliff Environment Playground.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import gym\n", @@ -21,9 +19,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -80,6 +76,13 @@ "print(env.step(2))\n", "env.render()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -98,9 +101,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/Q-Learning Solution.ipynb b/TD/Q-Learning Solution.ipynb index 5794e20de..f2da32351 100644 --- a/TD/Q-Learning Solution.ipynb +++ 
b/TD/Q-Learning Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -31,9 +29,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -54,7 +50,7 @@ " Args:\n", " Q: A dictionary that maps from state -> action-values.\n", " Each value is a numpy array of length nA (see below)\n", - " epsilon: The probability to select a random action . float between 0 and 1.\n", + " epsilon: The probability to select a random action. Float between 0 and 1.\n", " nA: Number of actions in the environment.\n", " \n", " Returns:\n", @@ -73,9 +69,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -86,9 +80,9 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", - " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", + " epsilon: Chance to sample a random action. Float between 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, episode_lengths).\n", @@ -147,9 +141,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -166,9 +158,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -231,9 +221,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/Q-Learning.ipynb b/TD/Q-Learning.ipynb index 724d682ad..ddd33c756 100644 --- a/TD/Q-Learning.ipynb +++ b/TD/Q-Learning.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = CliffWalkingEnv()" @@ -53,7 +49,7 @@ " Args:\n", " Q: A dictionary that maps from state -> action-values.\n", " Each value is a numpy array of length nA (see below)\n", - " epsilon: The probability to select a random action . float between 0 and 1.\n", + " epsilon: The probability to select a random action. Float between 0 and 1.\n", " nA: Number of actions in the environment.\n", " \n", " Returns:\n", @@ -72,9 +68,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -85,9 +79,9 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", - " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", + " epsilon: Chance to sample a random action. 
Float between 0 and 1.\n", " \n", " Returns:\n", " A tuple (Q, episode_lengths).\n", @@ -121,9 +115,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -205,9 +195,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/README.md b/TD/README.md index 9044704d1..9b34caecc 100644 --- a/TD/README.md +++ b/TD/README.md @@ -28,23 +28,23 @@ **Required:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 6: Temporal-Difference Learning +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 6: Temporal-Difference Learning - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) **Optional:** -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 7: Multi-Step Bootstrapping -- [Reinforcement Learning: An Introduction](http://incompleteideas.net/sutton/book/bookdraft2017june.pdf) - Chapter 12: Eligibility Traces +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 7: Multi-Step Bootstrapping +- [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 12: Eligibility Traces ### Exercises -- [Windy Gridworld Playground](Windy Gridworld Playground.ipynb) +- Get familiar with the [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) - Implement SARSA - [Exercise](SARSA.ipynb) - - [Solution](SARSA Solution.ipynb) -- [Cliff Environment Playground](Cliff Environment Playground.ipynb) + - [Solution](SARSA%20Solution.ipynb) +- Get familiar with the [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) - Implement Q-Learning in Python - [Exercise](Q-Learning.ipynb) - - [Solution](Q-Learning Solution.ipynb) \ No newline at end of file + - [Solution](Q-Learning%20Solution.ipynb) diff --git a/TD/SARSA Solution.ipynb b/TD/SARSA Solution.ipynb index feab3db02..df647f193 100644 --- a/TD/SARSA Solution.ipynb +++ b/TD/SARSA Solution.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -39,9 +37,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = WindyGridworldEnv()" @@ -81,9 +77,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -93,7 +87,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma 
discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", " \n", @@ -156,9 +150,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -175,9 +167,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -217,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [] } @@ -240,9 +228,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/SARSA.ipynb b/TD/SARSA.ipynb index 799915352..8a0344410 100644 --- a/TD/SARSA.ipynb +++ b/TD/SARSA.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "env = WindyGridworldEnv()" @@ -72,9 +68,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):\n", @@ -84,7 +78,7 @@ " Args:\n", " env: OpenAI environment.\n", " num_episodes: Number of episodes to run for.\n", - " discount_factor: Lambda time discount factor.\n", + " discount_factor: Gamma discount factor.\n", " alpha: TD learning rate.\n", " epsilon: Chance the sample a random action. 
Float betwen 0 and 1.\n", " \n", @@ -121,9 +115,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -140,9 +132,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -182,9 +172,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [] } @@ -205,9 +193,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/TD/Windy Gridworld Playground.ipynb b/TD/Windy Gridworld Playground.ipynb index 7c37d7857..0572c0d86 100644 --- a/TD/Windy Gridworld Playground.ipynb +++ b/TD/Windy Gridworld Playground.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "import gym\n", @@ -20,10 +18,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 2, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -119,6 +115,13 @@ "print(env.step(1))\n", "env.render()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -137,9 +140,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.4" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/lib/envs/blackjack.py b/lib/envs/blackjack.py index 158c49709..9052b4677 100644 --- a/lib/envs/blackjack.py +++ b/lib/envs/blackjack.py @@ -79,6 +79,12 @@ def __init__(self, natural=False): self._reset() # Number of self.nA = 2 + def reset(self): + return self._reset() + + def step(self, action): + return self._step(action) + def _seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] @@ -113,4 +119,4 @@ def _reset(self): while sum_hand(self.player) < 12: self.player.append(draw_card(self.np_random)) - return self._get_obs() \ No newline at end of file + return self._get_obs() diff --git a/lib/envs/cliff_walking.py b/lib/envs/cliff_walking.py index 37516ad1b..2c677662c 100644 --- a/lib/envs/cliff_walking.py +++ b/lib/envs/cliff_walking.py @@ -1,7 +1,8 @@ +import io import numpy as np import sys -from gym.envs.toy_text import discrete +from . 
import discrete UP = 0 RIGHT = 1 @@ -53,11 +54,14 @@ def __init__(self): super(CliffWalkingEnv, self).__init__(nS, nA, P, isd) + def render(self, mode='human', close=False): + self._render(mode, close) + def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) @@ -78,4 +82,4 @@ def _render(self, mode='human', close=False): output += "\n" outfile.write(output) - outfile.write("\n") \ No newline at end of file + outfile.write("\n") diff --git a/lib/envs/discrete.py b/lib/envs/discrete.py new file mode 100644 index 000000000..64455fc00 --- /dev/null +++ b/lib/envs/discrete.py @@ -0,0 +1,51 @@ +import numpy as np + +from gym import Env, spaces +from gym.utils import seeding +from gym.envs.toy_text.utils import categorical_sample + +class DiscreteEnv(Env): + + """ + Has the following members + - nS: number of states + - nA: number of actions + - P: transitions (*) + - isd: initial state distribution (**) + + (*) dictionary of lists, where + P[s][a] == [(probability, nextstate, reward, done), ...] + (**) list or array of length nS + + + """ + + def __init__(self, nS, nA, P, isd): + self.P = P + self.isd = isd + self.lastaction = None # for rendering + self.nS = nS + self.nA = nA + + self.action_space = spaces.Discrete(self.nA) + self.observation_space = spaces.Discrete(self.nS) + + self.seed() + self.s = categorical_sample(self.isd, self.np_random) + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def reset(self): + self.s = categorical_sample(self.isd, self.np_random) + self.lastaction = None + return int(self.s) + + def step(self, a): + transitions = self.P[self.s][a] + i = categorical_sample([t[0] for t in transitions], self.np_random) + p, s, r, d = transitions[i] + self.s = s + self.lastaction = a + return (int(s), r, d, {"prob": p}) diff --git a/lib/envs/gridworld.py b/lib/envs/gridworld.py index ea96ddbb0..64a5be602 100644 --- a/lib/envs/gridworld.py +++ b/lib/envs/gridworld.py @@ -1,6 +1,8 @@ +import io import numpy as np import sys -from gym.envs.toy_text import discrete + +from . import discrete UP = 0 RIGHT = 1 @@ -49,6 +51,7 @@ def __init__(self, shape=[4,4]): s = it.iterindex y, x = it.multi_index + # P[s][a] = (prob, next_state, reward, is_done) P[s] = {a : [] for a in range(nA)} is_done = lambda s: s == 0 or s == (nS - 1) @@ -83,10 +86,19 @@ def __init__(self, shape=[4,4]): super(GridworldEnv, self).__init__(nS, nA, P, isd) def _render(self, mode='human', close=False): + """ Renders the current gridworld layout + + For example, a 4x4 grid with the mode="human" looks like: + T o o o + o x o o + o o o o + o o o T + where x is your position and T are the two terminal states. 
+ """ if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout grid = np.arange(self.nS).reshape(self.shape) it = np.nditer(grid, flags=['multi_index']) @@ -102,7 +114,7 @@ def _render(self, mode='human', close=False): output = " o " if x == 0: - output = output.lstrip() + output = output.lstrip() if x == self.shape[1] - 1: output = output.rstrip() @@ -111,4 +123,4 @@ def _render(self, mode='human', close=False): if x == self.shape[1] - 1: outfile.write("\n") - it.iternext() \ No newline at end of file + it.iternext() diff --git a/lib/envs/windy_gridworld.py b/lib/envs/windy_gridworld.py index 7524dbd58..6ac49cab3 100644 --- a/lib/envs/windy_gridworld.py +++ b/lib/envs/windy_gridworld.py @@ -1,7 +1,9 @@ +import io import gym import numpy as np import sys -from gym.envs.toy_text import discrete + +from . import discrete UP = 0 RIGHT = 1 @@ -53,11 +55,14 @@ def __init__(self): super(WindyGridworldEnv, self).__init__(nS, nA, P, isd) + def render(self, mode='human', close=False): + self._render(mode, close) + def _render(self, mode='human', close=False): if close: return - outfile = StringIO() if mode == 'ansi' else sys.stdout + outfile = io.StringIO() if mode == 'ansi' else sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) @@ -76,4 +81,4 @@ def _render(self, mode='human', close=False): output += "\n" outfile.write(output) - outfile.write("\n") \ No newline at end of file + outfile.write("\n")
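A note on the closure change in PolicyGradient/a3c/train.py: binding worker=worker as a default argument is what makes each thread run its own worker. Python closures look up free variables when the lambda is called, not when it is defined, so without the default every thread would call run() on whichever worker the loop variable pointed to last. Below is a minimal sketch of that behaviour, using a hypothetical Worker class purely for illustration (it is not the class used in the a3c code):

class Worker:
    def __init__(self, name):
        self.name = name

    def run(self):
        return self.name

workers = [Worker("w0"), Worker("w1"), Worker("w2")]

# Late binding: every lambda closes over the same loop variable,
# which by call time refers to the last worker.
late_bound = [lambda: w.run() for w in workers]
print([fn() for fn in late_bound])      # ['w2', 'w2', 'w2']

# A default argument freezes the current worker at definition time,
# mirroring the lambda worker=worker fix in train.py.
default_bound = [lambda w=w: w.run() for w in workers]
print([fn() for fn in default_bound])   # ['w0', 'w1', 'w2']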