diff --git a/Notes/RL/05Nov22.pdf b/Notes/RL/05Nov22.pdf
new file mode 100644
index 0000000..819cea1
Binary files /dev/null and b/Notes/RL/05Nov22.pdf differ
diff --git a/Notes/RL/08Oct22.pdf b/Notes/RL/08Oct22.pdf
new file mode 100644
index 0000000..c26fd8c
Binary files /dev/null and b/Notes/RL/08Oct22.pdf differ
diff --git a/Notes/RL/10Sep22.pdf b/Notes/RL/10Sep22.pdf
new file mode 100644
index 0000000..6e1e0be
Binary files /dev/null and b/Notes/RL/10Sep22.pdf differ
diff --git a/Notes/RL/12Nov22.pdf b/Notes/RL/12Nov22.pdf
new file mode 100644
index 0000000..1f7ae03
Binary files /dev/null and b/Notes/RL/12Nov22.pdf differ
diff --git a/Notes/RL/15Oct22.pdf b/Notes/RL/15Oct22.pdf
new file mode 100644
index 0000000..4e45900
Binary files /dev/null and b/Notes/RL/15Oct22.pdf differ
diff --git a/Notes/RL/17Sep22.pdf b/Notes/RL/17Sep22.pdf
new file mode 100644
index 0000000..c6c5afc
Binary files /dev/null and b/Notes/RL/17Sep22.pdf differ
diff --git a/Notes/RL/19Nov22.pdf b/Notes/RL/19Nov22.pdf
new file mode 100644
index 0000000..7722721
Binary files /dev/null and b/Notes/RL/19Nov22.pdf differ
diff --git a/Notes/RL/22Oct22.pdf b/Notes/RL/22Oct22.pdf
new file mode 100644
index 0000000..69a8b09
Binary files /dev/null and b/Notes/RL/22Oct22.pdf differ
diff --git a/Notes/RL/24Sep22.pdf b/Notes/RL/24Sep22.pdf
new file mode 100644
index 0000000..1132cae
Binary files /dev/null and b/Notes/RL/24Sep22.pdf differ
diff --git a/Notes/RL/Final exam Solutions.pdf b/Notes/RL/Final exam Solutions.pdf
new file mode 100644
index 0000000..6cf445c
Binary files /dev/null and b/Notes/RL/Final exam Solutions.pdf differ
diff --git a/Notes/RL/Gamblers Problem.ipynb b/Notes/RL/Gamblers Problem.ipynb
new file mode 100644
index 0000000..4927352
--- /dev/null
+++ b/Notes/RL/Gamblers Problem.ipynb
@@ -0,0 +1,351 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Gambler's Problem: the state is the current capital, a stake of a\n",
+    "# moves it to s+a with probability pf and to s-a otherwise\n",
+    "import numpy as np\n",
+    "states = 20\n",
+    "pf = 0.4  # probability of winning a single flip\n",
+    "V = np.zeros(states+1)\n",
+    "V[states] = 1  # reaching the goal is worth 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Value iteration: sweep over the non-terminal states\n",
+    "for i in range(10000):\n",
+    "    for j in range(1,states):\n",
+    "        actions = range(1,min(j,states-j)+1)\n",
+    "        max_val = -np.inf\n",
+    "        for a in actions:\n",
+    "            temp = pf*V[j+a] + (1-pf)*V[j-a]\n",
+    "            if temp >= max_val:\n",
+    "                max_val = temp\n",
+    "        V[j] = max_val"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "V"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Greedy policy w.r.t. V (ties broken towards the larger stake)\n",
+    "policy = -1*np.ones(states+1)\n",
+    "for j in range(1,states):\n",
+    "    actions = range(1,min(j,states-j)+1)\n",
+    "    max_val = -np.inf\n",
+    "    for a in actions:\n",
+    "        temp = pf*V[j+a] + (1-pf)*V[j-a]\n",
+    "        print(\"(\",j,\",\",a,\")\",\":\",temp)\n",
+    "        if temp >= max_val:\n",
+    "            max_val = temp\n",
+    "            policy[j] = a\n",
+    "    print(\"*******************\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "plt.plot(range(1,states),V[1:states])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Monte-Carlo control: epsilon-greedy exploration, every-visit return averaging\n",
+    "Q = np.random.random((states+1,states+1))\n",
+    "returns = {}\n",
+    "for i in range(10000):  # number of episodes\n",
+    "    done = False\n",
+    "    episode = []\n",
+    "    s = np.random.randint(9) + 1  # random starting capital in 1..9\n",
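+    "    # generate one episode with the current epsilon-greedy policy\n",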
+    "    while not done:\n",
+    "        if np.random.random() < 0.1:\n",
+    "            a = np.random.randint(min(s,states-s))+1\n",
+    "        else:\n",
+    "            actions = range(1,min(s,states-s)+1)\n",
+    "            a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
+    "            a = actions[a]\n",
+    "\n",
+    "        if np.random.random() < pf:\n",
+    "            ns = s+a\n",
+    "        else:\n",
+    "            ns = s-a\n",
+    "\n",
+    "        if ns == states:\n",
+    "            reward = 1\n",
+    "            done = True\n",
+    "        elif ns == 0:\n",
+    "            reward = 0\n",
+    "            done = True\n",
+    "        else:\n",
+    "            reward = 0\n",
+    "            done = False\n",
+    "\n",
+    "        episode.append((s,a,reward,ns))\n",
+    "        s = ns\n",
+    "\n",
+    "    # work backwards through the episode accumulating the return\n",
+    "    total_reward = 0\n",
+    "    for step in reversed(episode):\n",
+    "        index = (step[0],step[1])\n",
+    "        if index not in returns:\n",
+    "            returns[index] = []\n",
+    "        total_reward += step[2]\n",
+    "        returns[index].append(total_reward)\n",
+    "\n",
+    "    # Q(s,a) = average of all returns observed for (s,a)\n",
+    "    for key in returns:\n",
+    "        Q[key[0],key[1]] = np.average(returns[key])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "policy_MC = -1*np.ones(states+1)\n",
+    "for s in range(1,states):\n",
+    "    actions = range(1,min(s,states-s)+1)\n",
+    "    a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
+    "    policy_MC[s] = actions[a]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "policy_MC[1:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Off-policy Monte-Carlo control: uniform-random behaviour policy,\n",
+    "# greedy target policy, weighted importance sampling\n",
+    "Q = np.zeros((states+1,states+1))\n",
+    "C = np.zeros((states+1,states+1))\n",
+    "for i in range(100000):  # number of episodes\n",
+    "    done = False\n",
+    "    episode = []\n",
+    "    s = np.random.randint(9) + 1\n",
+    "    while not done:\n",
+    "        a = np.random.randint(min(s,states-s))+1  # uniform over legal stakes\n",
+    "        if np.random.random() < pf:\n",
+    "            ns = s+a\n",
+    "        else:\n",
+    "            ns = s-a\n",
+    "\n",
+    "        if ns == states:\n",
+    "            reward = 1\n",
+    "            done = True\n",
+    "        elif ns == 0:\n",
+    "            reward = 0\n",
+    "            done = True\n",
+    "        else:\n",
+    "            reward = 0\n",
+    "            done = False\n",
+    "\n",
+    "        episode.append((s,a,reward,ns))\n",
+    "        s = ns\n",
+    "\n",
+    "    total_reward = 0\n",
+    "    W = 1\n",
+    "    for step in reversed(episode):\n",
+    "        total_reward += step[2]\n",
+    "        C[step[0],step[1]] += W\n",
+    "        Q[step[0],step[1]] += (W/C[step[0],step[1]])*(total_reward - Q[step[0],step[1]])\n",
+    "\n",
+    "        actions = range(1,min(step[0],states-step[0])+1)\n",
+    "        a = np.argmax(Q[step[0],1:min(step[0],states-step[0])+1])\n",
+    "\n",
+    "        # once the behaviour action deviates from the greedy target,\n",
+    "        # the importance weight of the rest of the episode is zero\n",
+    "        if actions[a] != step[1]:\n",
+    "            break\n",
+    "        W = W*len(actions)  # ratio: target prob 1 / behaviour prob 1/len(actions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "policy_MC_off = -1*np.ones(states+1)\n",
+    "for s in range(1,states):\n",
+    "    actions = range(1,min(s,states-s)+1)\n",
+    "    a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
+    "    policy_MC_off[s] = actions[a]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "policy_MC_off[1:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# environment: the stake is won with probability pf, lost otherwise\n",
+    "def get_next_state(s,a):\n",
+    "    if np.random.random() < pf:\n",
+    "        ns = s+a\n",
+    "    else:\n",
+    "        ns = s-a\n",
+    "    return ns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Q-Learning: epsilon-greedy exploration, step size 1/N(s,a)\n",
+    "Q = np.zeros((states+1,states+1))\n",
+    "count = np.zeros((states+1,states+1))\n",
+    "for i in range(10000):  # number of episodes\n",
+    "    done = False\n",
+    "    s = np.random.randint(9) + 1\n",
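+    "    # run one episode, updating Q online after every transition\n",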
+ " while not done:\n", + " if np.random.random() < 0.2:\n", + " a = np.random.randint(min(s,states-s))+1\n", + " else:\n", + " actions = range(1,min(s,states-s)+1)\n", + " a = np.argmax(Q[s,1:min(s,states-s)+1])\n", + " a = actions[a]\n", + " \n", + " ns = get_next_state(s,a)\n", + " \n", + " if ns == states:\n", + " reward = 1\n", + " done = True\n", + " elif ns == 0:\n", + " reward = 0\n", + " done = True\n", + " else:\n", + " reward = 0\n", + " done = False\n", + " \n", + " if ns == 0 or ns == states:\n", + " next_reward = 0\n", + " else:\n", + " next_reward = np.max(Q[ns,1:min(ns,states-ns)+1])\n", + " \n", + " count[s,a] +=1\n", + " alpha = 1/count[s,a]\n", + " \n", + " Q[s,a] = (1-alpha)*Q[s,a] + alpha*(reward + next_reward)\n", + " \n", + " s = ns\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#environment \n", + "def get_next_state(s,a):\n", + " if np.random.random() < pf:\n", + " ns = s+a\n", + " else:\n", + " ns = s-a\n", + " return ns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "policy_Q = -1*np.ones(states+1)\n", + "for s in range(1,states):\n", + " actions = range(1,min(s,states-s)+1)\n", + " a = np.argmax(Q[s,1:min(s,states-s)+1])\n", + " policy_Q[s] = actions[a]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "policy_Q[1:states]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Notes/RL/Midterm-Solutions.pdf b/Notes/RL/Midterm-Solutions.pdf new file mode 100644 index 0000000..c1ae8e8 Binary files /dev/null and b/Notes/RL/Midterm-Solutions.pdf differ diff --git a/Notes/RL/Note 06-Dec-2022.pdf b/Notes/RL/Note 06-Dec-2022.pdf new file mode 100644 index 0000000..f6b83dd Binary files /dev/null and b/Notes/RL/Note 06-Dec-2022.pdf differ diff --git a/Notes/RL/OnlineMtech_RL_FinalExam_2022.pdf b/Notes/RL/OnlineMtech_RL_FinalExam_2022.pdf new file mode 100644 index 0000000..dfc3f96 Binary files /dev/null and b/Notes/RL/OnlineMtech_RL_FinalExam_2022.pdf differ diff --git a/Notes/RL/OnlineMtech_RL_Practice.pdf b/Notes/RL/OnlineMtech_RL_Practice.pdf new file mode 100644 index 0000000..b9c7672 Binary files /dev/null and b/Notes/RL/OnlineMtech_RL_Practice.pdf differ diff --git a/Notes/RL/Online_RL_Practice_Questions.pdf b/Notes/RL/Online_RL_Practice_Questions.pdf new file mode 100644 index 0000000..e524285 Binary files /dev/null and b/Notes/RL/Online_RL_Practice_Questions.pdf differ diff --git a/Notes/RL/Practice Solutions 1.pdf b/Notes/RL/Practice Solutions 1.pdf new file mode 100644 index 0000000..12ef0cb Binary files /dev/null and b/Notes/RL/Practice Solutions 1.pdf differ diff --git a/Notes/RL/Practice Solutions 2.pdf b/Notes/RL/Practice Solutions 2.pdf new file mode 100644 index 0000000..7359df8 Binary files /dev/null and 
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Notes/RL/Midterm-Solutions.pdf b/Notes/RL/Midterm-Solutions.pdf
new file mode 100644
index 0000000..c1ae8e8
Binary files /dev/null and b/Notes/RL/Midterm-Solutions.pdf differ
diff --git a/Notes/RL/Note 06-Dec-2022.pdf b/Notes/RL/Note 06-Dec-2022.pdf
new file mode 100644
index 0000000..f6b83dd
Binary files /dev/null and b/Notes/RL/Note 06-Dec-2022.pdf differ
diff --git a/Notes/RL/OnlineMtech_RL_FinalExam_2022.pdf b/Notes/RL/OnlineMtech_RL_FinalExam_2022.pdf
new file mode 100644
index 0000000..dfc3f96
Binary files /dev/null and b/Notes/RL/OnlineMtech_RL_FinalExam_2022.pdf differ
diff --git a/Notes/RL/OnlineMtech_RL_Practice.pdf b/Notes/RL/OnlineMtech_RL_Practice.pdf
new file mode 100644
index 0000000..b9c7672
Binary files /dev/null and b/Notes/RL/OnlineMtech_RL_Practice.pdf differ
diff --git a/Notes/RL/Online_RL_Practice_Questions.pdf b/Notes/RL/Online_RL_Practice_Questions.pdf
new file mode 100644
index 0000000..e524285
Binary files /dev/null and b/Notes/RL/Online_RL_Practice_Questions.pdf differ
diff --git a/Notes/RL/Practice Solutions 1.pdf b/Notes/RL/Practice Solutions 1.pdf
new file mode 100644
index 0000000..12ef0cb
Binary files /dev/null and b/Notes/RL/Practice Solutions 1.pdf differ
diff --git a/Notes/RL/Practice Solutions 2.pdf b/Notes/RL/Practice Solutions 2.pdf
new file mode 100644
index 0000000..7359df8
Binary files /dev/null and b/Notes/RL/Practice Solutions 2.pdf differ
diff --git a/Notes/RL/Practice Solutions.pdf b/Notes/RL/Practice Solutions.pdf
new file mode 100644
index 0000000..4c28484
Binary files /dev/null and b/Notes/RL/Practice Solutions.pdf differ
diff --git a/Notes/RL/Quiz solutions .pdf b/Notes/RL/Quiz solutions .pdf
new file mode 100644
index 0000000..d8e83bf
Binary files /dev/null and b/Notes/RL/Quiz solutions .pdf differ