Practical Reinforcement Learning Week4
jiadaizhao committed Jan 7, 2019
1 parent 95c4c02 commit e47da9e
Showing 15 changed files with 1,761 additions and 0 deletions.
3 binary files changed (contents not shown).
59 changes: 59 additions & 0 deletions Practical Reinforcement Learning/Week4_approx/atari_util.py
@@ -0,0 +1,59 @@
"""Auxilary files for those who wanted to solve breakout with CEM or policy gradient"""
import numpy as np
import gym
from scipy.misc import imresize
from gym.core import Wrapper
from gym.spaces.box import Box

class PreprocessAtari(Wrapper):
def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img,
n_frames=4, dim_order='theano', reward_scale=1,):
"""A gym wrapper that reshapes, crops and scales image into the desired shapes"""
super(PreprocessAtari, self).__init__(env)
assert dim_order in ('theano', 'tensorflow')
self.img_size = (height, width)
self.crop=crop
self.color=color
self.dim_order = dim_order
self.reward_scale = reward_scale

n_channels = (3 * n_frames) if color else n_frames
obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels]
self.observation_space = Box(0.0, 1.0, obs_shape)
self.framebuffer = np.zeros(obs_shape, 'float32')

def reset(self):
"""resets breakout, returns initial frames"""
self.framebuffer = np.zeros_like(self.framebuffer)
self.update_buffer(self.env.reset())
return self.framebuffer

def step(self,action):
"""plays breakout for 1 step, returns frame buffer"""
new_img, reward, done, info = self.env.step(action)
self.update_buffer(new_img)
return self.framebuffer, reward * self.reward_scale, done, info

### image processing ###

def update_buffer(self,img):
img = self.preproc_image(img)
offset = 3 if self.color else 1
if self.dim_order == 'theano':
axis = 0
cropped_framebuffer = self.framebuffer[:-offset]
else:
axis = -1
cropped_framebuffer = self.framebuffer[:,:,:-offset]
self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis)

def preproc_image(self, img):
"""what happens to the observation"""
img = self.crop(img)
img = imresize(img, self.img_size)
if not self.color:
img = img.mean(-1, keepdims=True)
if self.dim_order == 'theano':
img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w]
img = img.astype('float32') / 255.
return img
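
A minimal usage sketch, not part of this commit: how PreprocessAtari might wrap a Breakout environment. The env id and crop function below are illustrative assumptions.

# Hypothetical usage sketch; the env id and crop are illustrative assumptions.
import gym

def make_env():
    env = gym.make("BreakoutDeterministic-v4")
    env = PreprocessAtari(env, height=42, width=42, color=False,
                          crop=lambda img: img[60:-30, 5:-5],  # assumed crop of the playing field
                          n_frames=4, dim_order='tensorflow')
    return env

env = make_env()
obs = env.reset()
print(obs.shape)  # (42, 42, 4) with dim_order='tensorflow' and color=False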
1,034 changes: 1,034 additions & 0 deletions Practical Reinforcement Learning/Week4_approx/dqn_atari.ipynb

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions Practical Reinforcement Learning/Week4_approx/framebuffer.py
@@ -0,0 +1,41 @@
import numpy as np
from gym.spaces.box import Box
from gym.core import Wrapper


class FrameBuffer(Wrapper):
    def __init__(self, env, n_frames=4, dim_order='tensorflow'):
        """A gym wrapper that stacks the last n_frames observations along the channel axis."""
        super(FrameBuffer, self).__init__(env)
        self.dim_order = dim_order
        if dim_order == 'tensorflow':
            height, width, n_channels = env.observation_space.shape
            obs_shape = [height, width, n_channels * n_frames]
        elif dim_order == 'pytorch':
            n_channels, height, width = env.observation_space.shape
            obs_shape = [n_channels * n_frames, height, width]
        else:
            raise ValueError('dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order))
        self.observation_space = Box(0.0, 1.0, obs_shape)
        self.framebuffer = np.zeros(obs_shape, 'float32')

    def reset(self):
        """Resets the environment, returns the initial frame buffer."""
        self.framebuffer = np.zeros_like(self.framebuffer)
        self.update_buffer(self.env.reset())
        return self.framebuffer

    def step(self, action):
        """Plays the environment for 1 step, returns the updated frame buffer."""
        new_img, reward, done, info = self.env.step(action)
        self.update_buffer(new_img)
        return self.framebuffer, reward, done, info

    def update_buffer(self, img):
        # Prepend the newest frame and drop the oldest one along the channel axis.
        if self.dim_order == 'tensorflow':
            offset = self.env.observation_space.shape[-1]
            axis = -1
            cropped_framebuffer = self.framebuffer[:, :, :-offset]
        elif self.dim_order == 'pytorch':
            offset = self.env.observation_space.shape[0]
            axis = 0
            cropped_framebuffer = self.framebuffer[:-offset]
        self.framebuffer = np.concatenate([img, cropped_framebuffer], axis=axis)
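
For reference, a brief sketch (assumed, not part of the commit) of what FrameBuffer does to observation shapes; the env id is illustrative.

# Hypothetical sketch: FrameBuffer stacks the last n_frames observations
# along the channel axis, with the newest frame first.
import gym

raw_env = gym.make("BreakoutDeterministic-v4")   # illustrative env id
env = FrameBuffer(raw_env, n_frames=4, dim_order='tensorflow')

print(raw_env.observation_space.shape)  # (210, 160, 3): one raw RGB frame
print(env.observation_space.shape)      # (210, 160, 12): 4 stacked frames
obs = env.reset()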
63 changes: 63 additions & 0 deletions Practical Reinforcement Learning/Week4_approx/replay_buffer.py
@@ -0,0 +1,63 @@
# This code is shamelessly stolen from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
import numpy as np
import random


class ReplayBuffer(object):
    def __init__(self, size):
        """Create Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            data = self._storage[i]
            obs_t, action, reward, obs_tp1, done = data
            obses_t.append(np.array(obs_t, copy=False))
            actions.append(np.array(action, copy=False))
            rewards.append(reward)
            obses_tp1.append(np.array(obs_tp1, copy=False))
            dones.append(done)
        return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)

    def sample(self, batch_size):
        """Sample a batch of experiences.
        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        """
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        return self._encode_sample(idxes)
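
A small sketch (assumed, not part of the commit) of the intended replay loop: store transitions while acting, then sample a random minibatch for the Q-learning update. The env id is illustrative.

# Hypothetical usage sketch of ReplayBuffer with a random policy.
import gym

env = gym.make("CartPole-v0")        # illustrative env id
exp_replay = ReplayBuffer(size=10**4)

obs = env.reset()
for _ in range(1000):
    action = env.action_space.sample()
    next_obs, reward, done, _ = env.step(action)
    exp_replay.add(obs, action, reward, next_obs, done)
    obs = env.reset() if done else next_obs

# Sample a minibatch for a (hypothetical) TD update.
obs_b, act_b, rew_b, next_obs_b, done_b = exp_replay.sample(batch_size=32)
print(obs_b.shape, act_b.shape, done_b.shape)  # (32, 4) (32,) (32,)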
21 changes: 21 additions & 0 deletions Practical Reinforcement Learning/Week4_approx/submit.py
@@ -0,0 +1,21 @@
import sys
import numpy as np
sys.path.append("..")
import grading


def submit_cartpole(generate_session, email, token):
    sessions = [generate_session() for _ in range(100)]
    #session_rewards, _, _ = map(np.array, zip(*sessions))
    session_rewards = np.array(sessions)
    grader = grading.Grader("RDofv-QXEeeaGw6kpIOf3g")
    grader.set_answer("NRNkl", int(np.mean(session_rewards)))
    grader.submit(email, token)


def submit_breakout(agent, env, evaluate, email, token):
    sessions = [evaluate(env, agent, n_games=1) for _ in range(100)]
    session_rewards = np.array(sessions)
    grader = grading.Grader("WTOZHCn1EeiNwAoZNi-Hrg")
    grader.set_answer("VFM7Z", int(np.mean(session_rewards)))
    grader.submit(email, token)
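
An assumed example call from the notebook (email and token are placeholders; generate_session is expected to return one episode's total reward, as the grader averages 100 such values).

# Hypothetical call with placeholder credentials.
submit_cartpole(generate_session, "student@example.com", "YOUR_TOKEN")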