
Commit

RL Files
RonakDedhiya authored Sep 30, 2024
1 parent b6982df commit 575e1d6
Showing 20 changed files with 351 additions and 0 deletions.
Binary file added Notes/RL/05Nov22.pdf
Binary file added Notes/RL/08Oct22.pdf
Binary file added Notes/RL/10Sep22.pdf
Binary file added Notes/RL/12Nov22.pdf
Binary file added Notes/RL/15Oct22.pdf
Binary file added Notes/RL/17Sep22.pdf
Binary file added Notes/RL/19Nov22.pdf
Binary file added Notes/RL/22Oct22.pdf
Binary file added Notes/RL/24Sep22.pdf
Binary file added Notes/RL/Final exam Solutions.pdf
351 changes: 351 additions & 0 deletions Notes/RL/Gamblers Problem.ipynb
@@ -0,0 +1,351 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"states = 20\n",
"pf = 0.4\n",
"V = np.zeros(states+1)\n",
"V[states] = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in range(10000):\n",
" for j in range(1,states):\n",
" actions = range(1,min(j,states-j)+1)\n",
" max_val = -1*np.inf\n",
" for a in actions:\n",
" temp = pf*V[j+a] + (1-pf)*V[j-a]\n",
" if temp >= max_val:\n",
" max_val = temp\n",
" V[j] = max_val"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"V"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"policy = -1*np.ones(states+1)\n",
"for j in range(1,states):\n",
" actions = range(1,min(j,states-j)+1)\n",
" max_val = -1*np.inf\n",
" for a in actions:\n",
" temp = pf*V[j+a] + (1-pf)*V[j-a]\n",
" print(\"(\",j,\",\",a,\")\",\":\",temp)\n",
" if temp >= max_val:\n",
" max_val = temp\n",
" policy[j] = a\n",
" print(\"*******************\")\n",
"# V[j] = max_val"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"plt.plot(range(1,states),V[1:states])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Monte-Carlo Control\n",
"Q = np.random.random((states+1,states+1))\n",
"returns = {}\n",
"for i in range(10000): #Number of episodes\n",
" done = False\n",
" episode = []\n",
" s = np.random.randint(9) + 1\n",
" while not done:\n",
" if np.random.random() < 0.1:\n",
" a = np.random.randint(min(s,states-s))+1\n",
" else:\n",
" actions = range(1,min(s,states-s)+1)\n",
" a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
" a = actions[a]\n",
" \n",
" if np.random.random() < pf:\n",
" ns = s+a\n",
" else:\n",
" ns = s-a\n",
" \n",
" if ns == states:\n",
" reward = 1\n",
" done = True\n",
" elif ns == 0:\n",
" reward = 0\n",
" done = True\n",
" else:\n",
" reward = 0\n",
" done = False\n",
" \n",
" episode.append((s,a,reward,ns))\n",
" s = ns\n",
" \n",
" total_reward = 0 \n",
" for l in reversed(episode):\n",
"# print(l)\n",
" index = (l[0],l[1])\n",
" if index not in returns.keys():\n",
" returns[index] = []\n",
" total_reward += l[2]\n",
" returns[index].append(total_reward)\n",
" \n",
" for i in list(returns.keys()):\n",
"# print(i[0],i[1])\n",
" Q[i[0],i[1]] = np.average(returns[i])\n",
"# print(Q[i[0],i[1]])\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"policy_MC = -1*np.ones(states+1)\n",
"for s in range(1,states):\n",
" actions = range(1,min(s,states-s)+1)\n",
" a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
" policy_MC[s] = actions[a]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"policy_MC[1:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Off-Policy Monte-Carlo Control\n",
"Q = np.zeros((states+1,states+1))\n",
"C = np.zeros((states+1,states+1))\n",
"returns = {}\n",
"for i in range(100000): #Number of episodes\n",
" done = False\n",
" episode = []\n",
" s = np.random.randint(9) + 1\n",
" while not done:\n",
" a = np.random.randint(min(s,states-s))+1 \n",
" if np.random.random() < pf:\n",
" ns = s+a\n",
" else:\n",
" ns = s-a\n",
" \n",
" if ns == states:\n",
" reward = 1\n",
" done = True\n",
" elif ns == 0:\n",
" reward = 0\n",
" done = True\n",
" else:\n",
" reward = 0\n",
" done = False\n",
" \n",
" episode.append((s,a,reward,ns))\n",
" s = ns\n",
" \n",
" total_reward = 0 \n",
" W = 1\n",
" for l in reversed(episode):\n",
"# print(l)\n",
" total_reward += l[2]\n",
" C[l[0],l[1]] += W\n",
" Q[l[0],l[1]] += (W/C[l[0],l[1]])*(total_reward - Q[l[0],l[1]])\n",
" \n",
" actions = range(1,min(l[0],states-l[0])+1)\n",
" a = np.argmax(Q[l[0],1:min(l[0],states-l[0])+1])\n",
" \n",
" if actions[a] != l[1]:\n",
" break\n",
" W = W*(1/(1/len(actions)))\n",
"# print(\"******************\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"policy_MC_off = -1*np.ones(states+1)\n",
"for s in range(1,states):\n",
" actions = range(1,min(s,states-s)+1)\n",
" a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
" policy_MC_off[s] = actions[a]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"policy_MC_off[1:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Q-Learning\n",
"Q = np.zeros((states+1,states+1))\n",
"count = np.zeros((states+1,states+1))\n",
"for i in range(10000): #Number of episodes\n",
" done = False\n",
" s = np.random.randint(9) + 1\n",
" while not done:\n",
" if np.random.random() < 0.2:\n",
" a = np.random.randint(min(s,states-s))+1\n",
" else:\n",
" actions = range(1,min(s,states-s)+1)\n",
" a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
" a = actions[a]\n",
" \n",
" ns = get_next_state(s,a)\n",
" \n",
" if ns == states:\n",
" reward = 1\n",
" done = True\n",
" elif ns == 0:\n",
" reward = 0\n",
" done = True\n",
" else:\n",
" reward = 0\n",
" done = False\n",
" \n",
" if ns == 0 or ns == states:\n",
" next_reward = 0\n",
" else:\n",
" next_reward = np.max(Q[ns,1:min(ns,states-ns)+1])\n",
" \n",
" count[s,a] +=1\n",
" alpha = 1/count[s,a]\n",
" \n",
" Q[s,a] = (1-alpha)*Q[s,a] + alpha*(reward + next_reward)\n",
" \n",
" s = ns\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#environment \n",
"def get_next_state(s,a):\n",
" if np.random.random() < pf:\n",
" ns = s+a\n",
" else:\n",
" ns = s-a\n",
" return ns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"policy_Q = -1*np.ones(states+1)\n",
"for s in range(1,states):\n",
" actions = range(1,min(s,states-s)+1)\n",
" a = np.argmax(Q[s,1:min(s,states-s)+1])\n",
" policy_Q[s] = actions[a]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"policy_Q[1:states]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Binary file added Notes/RL/Midterm-Solutions.pdf
Binary file added Notes/RL/Note 06-Dec-2022.pdf
Binary file added Notes/RL/OnlineMtech_RL_FinalExam_2022.pdf
Binary file added Notes/RL/OnlineMtech_RL_Practice.pdf
Binary file added Notes/RL/Online_RL_Practice_Questions.pdf
Binary file added Notes/RL/Practice Solutions 1.pdf
Binary file added Notes/RL/Practice Solutions 2.pdf
Binary file added Notes/RL/Practice Solutions.pdf
Binary file added Notes/RL/Quiz solutions .pdf
