Practical Reinforcement Learning Week2

umairnsr87 · Dec 30, 2018 · 53ae2cf · 53ae2cf
1 parent 80d2875
commit 53ae2cf
Show file tree

Hide file tree

Showing 6 changed files with 1,585 additions and 0 deletions.
diff --git a/Practical Reinforcement Learning/Week2_model_based/QUIZ Optimality in RL.pdf b/Practical Reinforcement Learning/Week2_model_based/QUIZ Optimality in RL.pdf
diff --git a/Practical Reinforcement Learning/Week2_model_based/QUIZ Policy Iteration.pdf b/Practical Reinforcement Learning/Week2_model_based/QUIZ Policy Iteration.pdf
diff --git a/Practical Reinforcement Learning/Week2_model_based/QUIZ Reward design.pdf b/Practical Reinforcement Learning/Week2_model_based/QUIZ Reward design.pdf
diff --git a/Practical Reinforcement Learning/Week2_model_based/mdp.py b/Practical Reinforcement Learning/Week2_model_based/mdp.py
@@ -0,0 +1,239 @@
+# most of this code was politely stolen from https://github.com/berkeleydeeprlcourse/homework/
+# all credit goes to https://github.com/abhishekunique (if I got the author right)
+import sys
+import random
+import numpy as np
+def weighted_choice(v, p):
+    """
+    Pick one element of v at random, with chances proportional to the weights in p.
+
+    :param v: iterable of candidate values
+    :param p: iterable of non-negative weights, one per value in v (they need not sum to 1)
+    :returns: a single element of v; values with zero weight are never returned
+    :raises ValueError: if v and p differ in length, if a weight is negative,
+        or if the weights do not add up to a positive number
+    """
+    values, weights = list(v), list(p)
+    if len(values) != len(weights):
+        raise ValueError("weighted_choice needs exactly one weight per value")
+    if any(w < 0 for w in weights):
+        raise ValueError("weights must be non-negative")
+    total = sum(weights)
+    if not total > 0:
+        raise ValueError("weights must add up to a positive number")
+
+    r = random.uniform(0, total)
+    upto = 0
+    last_positive = None
+    for c, w in zip(values, weights):
+        if w > 0:
+            # only positively weighted entries are eligible, so a draw that lands exactly
+            # on 0 (or on a running total) cannot select a zero-probability value
+            if upto + w >= r:
+                return c
+            last_positive = c
+        upto += w
+    # guard against floating-point rounding: r <= total, so use the last eligible value
+    return last_positive
+
+class MDP:
+    """
+    Markov decision process stored as nested dictionaries, exposing reset/step/render methods.
+    """
+
+    def __init__(self, transition_probs, rewards, initial_state=None):
+        """
+        Defines an MDP. Compatible with gym Env.
+        :param transition_probs: transition_probs[s][a][s_next] = P(s_next | s, a)
+            A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> prob]
+            For each state and action, probabilities of next states should sum to 1
+            If a state has no actions available, it is considered terminal
+        :param rewards: rewards[s][a][s_next] = r(s,a,s')
+            A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> reward]
+            The reward for anything not mentioned here is zero.
+        :param initial_state: a state where agent starts or a callable() -> state
+            By default, picks initial state at random.
+        :raises AssertionError: if transition_probs or rewards are malformed
+        :raises ValueError: if initial_state is neither None, a known state nor a callable
+
+        States and actions can be anything you can use as dict keys, but we recommend that you use strings or integers
+
+        Here's an example from MDP depicted on http://bit.ly/2jrNHNr
+        transition_probs = {
+              's0':{
+                'a0': {'s0': 0.5, 's2': 0.5},
+                'a1': {'s2': 1}
+              },
+              's1':{
+                'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},
+                'a1': {'s1': 0.95, 's2': 0.05}
+              },
+              's2':{
+                'a0': {'s0': 0.4, 's1': 0.6},
+                'a1': {'s0': 0.3, 's1': 0.3, 's2':0.4}
+              }
+            }
+        rewards = {
+            's1': {'a0': {'s0': +5}},
+            's2': {'a1': {'s0': -1}}
+        }
+        """
+        self._check_param_consistency(transition_probs, rewards)
+        self._transition_probs = transition_probs
+        self._rewards = rewards
+        self._initial_state = initial_state
+        self.n_states = len(transition_probs)
+        self.reset()
+
+    def get_all_states(self):
+        """ return a tuple of all possible states """
+        return tuple(self._transition_probs.keys())
+
+    def get_possible_actions(self, state):
+        """ return a tuple of possible actions in a given state (empty for unknown or terminal states) """
+        return tuple(self._transition_probs.get(state, {}).keys())
+
+    def is_terminal(self, state):
+        """ return True if state is terminal (has no actions) or False if it isn't """
+        return len(self.get_possible_actions(state)) == 0
+
+    def get_next_states(self, state, action):
+        """ return a dictionary of {next_state1 : P(next_state1 | state, action), next_state2: ...} """
+        assert action in self.get_possible_actions(state), "cannot do action %s from state %s" % (action, state)
+        return self._transition_probs[state][action]
+
+    def get_transition_prob(self, state, action, next_state):
+        """ return P(next_state | state, action), 0.0 for next states that are not listed """
+        return self.get_next_states(state, action).get(next_state, 0.0)
+
+    def get_reward(self, state, action, next_state):
+        """ return the reward you get for taking action in state and landing on next_state (0.0 if not listed) """
+        assert action in self.get_possible_actions(state), "cannot do action %s from state %s" % (action, state)
+        return self._rewards.get(state, {}).get(action, {}).get(next_state, 0.0)
+
+    def reset(self):
+        """
+        reset the game, return the initial state
+
+        :raises ValueError: if the configured initial state is neither None, a known state nor a callable
+        """
+        if self._initial_state is None:
+            self._current_state = random.choice(tuple(self._transition_probs.keys()))
+        elif self._initial_state in self._transition_probs:
+            self._current_state = self._initial_state
+        elif callable(self._initial_state):
+            self._current_state = self._initial_state()
+        else:
+            # wrap in a 1-tuple so that tuple-valued states are formatted instead of unpacked by %
+            raise ValueError(
+                "initial state %s should be either a state or a function() -> state" % (self._initial_state,))
+        return self._current_state
+
+    def step(self, action):
+        """ take action, return next_state, reward, is_done, empty_info """
+        possible_states, probs = zip(*self.get_next_states(self._current_state, action).items())
+        next_state = weighted_choice(possible_states, p=probs)
+        reward = self.get_reward(self._current_state, action, next_state)
+        is_done = self.is_terminal(next_state)
+        self._current_state = next_state
+        return next_state, reward, is_done, {}
+
+    def render(self):
+        """ print the current state """
+        # wrap in a 1-tuple so that tuple-valued states are formatted instead of unpacked by %
+        print("Currently at %s" % (self._current_state,))
+
+    def _check_param_consistency(self, transition_probs, rewards):
+        """
+        Validate the nested-dictionary layout of transition_probs and rewards.
+
+        :raises AssertionError: if a level that should be a dict is not one, if a (state, action)
+            pair has no next states, if next-state probabilities do not sum to 1 (within 1e-10),
+            or if None is used as a top-level key
+        """
+        for state in transition_probs:
+            actions = transition_probs[state]
+            assert isinstance(actions, dict), \
+                "transition_probs for %s should be a dictionary but is instead %s" % (state, type(actions))
+            for action in actions:
+                next_state_probs = actions[action]
+                assert isinstance(next_state_probs, dict), \
+                    "transition_probs for %s, %s should be a dictionary but is instead %s" % (
+                        state, action, type(next_state_probs))
+                assert len(next_state_probs) != 0, "from state %s action %s leads to no next states" % (state, action)
+                sum_probs = sum(next_state_probs.values())
+                assert abs(sum_probs - 1) <= 1e-10, \
+                    "next state probabilities for state %s action %s add up to %f (should be 1)" % (
+                        state, action, sum_probs)
+        for state in rewards:
+            action_rewards = rewards[state]
+            # report the type of the offending rewards entry itself; the state may not even
+            # be present in transition_probs
+            assert isinstance(action_rewards, dict), \
+                "rewards for %s should be a dictionary but is instead %s" % (state, type(action_rewards))
+            for action in action_rewards:
+                assert isinstance(action_rewards[action], dict), \
+                    "rewards for %s, %s should be a dictionary but is instead %s" % (
+                        state, action, type(action_rewards[action]))
+        msg = "The Enrichment Center once again reminds you that Android Hell is a real place where" \
+              " you will be sent at the first sign of defiance. "
+        # top-level keys of both dictionaries are states
+        assert None not in transition_probs, "please do not use None as a state identifier. " + msg
+        assert None not in rewards, "please do not use None as a state identifier. " + msg
+
+class FrozenLakeEnv(MDP):
+    """
+    Winter is here. You and your friends were tossing around a frisbee at the park
+    when you made a wild throw that left the frisbee out in the middle of the lake.
+    The water is mostly frozen, but there are a few holes where the ice has melted.
+    If you step into one of those holes, you'll fall into the freezing water.
+    At this time, there's an international frisbee shortage, so it's absolutely imperative that
+    you navigate across the lake and retrieve the disc.
+    However, the ice is slippery, so you won't always move in the direction you intend.
+    The surface is described using a grid like the following
+
+        SFFF
+        FHFH
+        FFFH
+        HFFG
+
+    S : starting point, safe
+    F : frozen surface, safe
+    H : hole, fall to your doom
+    G : goal, where the frisbee is located
+
+    The episode ends when you reach the goal or fall in a hole.
+    You receive a reward of 1 if you reach the goal, and zero otherwise.
+
+    States are (row, col) tuples; actions are "left", "down", "right" and "up".
+    """
+
+    MAPS = {
+        "4x4": [
+            "SFFF",
+            "FHFH",
+            "FFFH",
+            "HFFG"
+        ],
+        "8x8": [
+            "SFFFFFFF",
+            "FFFFFFFF",
+            "FFFHFFFF",
+            "FFFFFHFF",
+            "FFFHFFFF",
+            "FHHFFFHF",
+            "FHFFHFHF",
+            "FFFHFFFG"
+        ],
+    }
+
+    def __init__(self, desc=None, map_name="4x4", slip_chance=0.2):
+        """
+        Build the frozen-lake MDP from a grid description.
+
+        :param desc: list of equal-length strings made of the characters S, F, H and G;
+            when None, the built-in map named by map_name is used
+        :param map_name: key into MAPS ("4x4" or "8x8"); only used when desc is None
+        :param slip_chance: total probability of slipping; it is split evenly between the two
+            directions perpendicular to the intended one
+        :raises ValueError: if both desc and map_name are None
+        :raises AssertionError: if the grid does not contain exactly one S or contains other characters
+        """
+        if desc is None and map_name is None:
+            raise ValueError('Must provide either desc or map_name')
+        elif desc is None:
+            desc = self.MAPS[map_name]
+        assert ''.join(desc).count('S') == 1, "this implementation supports having exactly one initial state"
+        assert all(c in "SFHG" for c in ''.join(desc)), "all cells must be either of S, F, H or G"
+
+        self.desc = desc = np.asarray(list(map(list, desc)), dtype='str')
+        self.lastaction = None
+
+        nrow, ncol = desc.shape
+        states = [(i, j) for i in range(nrow) for j in range(ncol)]
+        actions = ["left", "down", "right", "up"]
+
+        # desc holds unicode strings, so compare against the str 'S' (a bytes literal never matches);
+        # states is in row-major order, the same order ravel() uses
+        initial_state = states[int(np.argmax((desc == 'S').ravel()))]
+
+        # (row, col) offset of every movement
+        deltas = {"left": (0, -1), "down": (1, 0), "right": (0, 1), "up": (-1, 0)}
+
+        def move(row, col, movement):
+            """ return the cell reached from (row, col); moves off the grid leave you in place """
+            if movement not in deltas:
+                raise ValueError("invalid action")
+            d_row, d_col = deltas[movement]
+            return (min(max(row + d_row, 0), nrow - 1), min(max(col + d_col, 0), ncol - 1))
+
+        transition_probs = {s: {} for s in states}
+        rewards = {s: {} for s in states}
+        for (row, col) in states:
+            if desc[row, col] in "GH":
+                # goal and holes get no actions, which makes them terminal states
+                continue
+            for action_i, action in enumerate(actions):
+                next_state_probs = transition_probs[row, col][action] = {}
+                action_rewards = rewards[row, col][action] = {}
+                # neighbours in the left/down/right/up cycle are the two perpendicular directions
+                for movement_i in ((action_i - 1) % len(actions), action_i, (action_i + 1) % len(actions)):
+                    movement = actions[movement_i]
+                    prob = (1. - slip_chance) if movement == action else (slip_chance / 2.)
+                    if prob == 0:
+                        continue
+                    new_state = move(row, col, movement)
+                    # different movements may end in the same cell (e.g. at walls): add them up
+                    next_state_probs[new_state] = next_state_probs.get(new_state, 0.0) + prob
+                    if desc[new_state] == 'G':
+                        action_rewards[new_state] = 1.0
+
+        MDP.__init__(self, transition_probs, rewards, initial_state)
+
+    def render(self):
+        """ print the grid with the agent's current cell marked by '*' """
+        desc_copy = np.copy(self.desc)
+        desc_copy[self._current_state] = '*'
+        print('\n'.join(map(''.join, desc_copy)), end='\n\n')
+
+