Commit: Practical Reinforcement Learning Week3

jiadaizhao committed Jan 1, 2019
1 parent 53ae2cf commit 95c4c02
Showing 6 changed files with 1,591 additions and 0 deletions.
Binary file not shown.

Large diffs are not rendered by default.

662 changes: 662 additions & 0 deletions Practical Reinforcement Learning/Week3_model_free/qlearning.ipynb

Large diffs are not rendered by default.

114 changes: 114 additions & 0 deletions Practical Reinforcement Learning/Week3_model_free/qlearning.py
@@ -0,0 +1,114 @@
from collections import defaultdict
import random, math
import numpy as np

class QLearningAgent:
    def __init__(self, alpha, epsilon, discount, get_legal_actions):
        """
        Q-Learning Agent
        based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html
        Instance variables you have access to
          - self.epsilon (exploration prob)
          - self.alpha (learning rate)
          - self.discount (discount rate aka gamma)
        Functions you should use
          - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}
            which returns legal actions for a state
          - self.get_qvalue(state, action)
            which returns Q(state,action)
          - self.set_qvalue(state, action, value)
            which sets Q(state,action) := value
        !!!Important!!!
        Note: please avoid using self._qvalues directly.
        There's a special self.get_qvalue/set_qvalue for that.
        """

        self.get_legal_actions = get_legal_actions
        self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def get_qvalue(self, state, action):
        """ Returns Q(state,action) """
        return self._qvalues[state][action]

    def set_qvalue(self, state, action, value):
        """ Sets the Q-value for [state,action] to the given value """
        self._qvalues[state][action] = value

    #---------------------START OF YOUR CODE---------------------#

    def get_value(self, state):
        """
        Compute your agent's estimate of V(s) using current q-values
        V(s) = max_over_action Q(state,action) over possible actions.
        Note: please take into account that q-values can be negative.
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        value = max(self.get_qvalue(state, action) for action in possible_actions)

        return value

    def update(self, state, action, reward, next_state):
        """
        You should do your Q-Value update here:
        Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        """

        # agent parameters
        gamma = self.discount
        learning_rate = self.alpha

        Qsa = (1 - learning_rate) * self.get_qvalue(state, action) \
            + learning_rate * (reward + gamma * self.get_value(next_state))

        self.set_qvalue(state, action, Qsa)
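        # Worked example with illustrative numbers (not from the assignment):
        # alpha = 0.5, gamma = 0.9, Q(s,a) = 1.0, r = 2.0, V(s') = 3.0 gives
        # 0.5 * 1.0 + 0.5 * (2.0 + 0.9 * 3.0) = 2.85 as the new Q(s,a).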


    def get_best_action(self, state):
        """
        Compute the best action to take in a state (using current q-values).
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        Q_values = {action: self.get_qvalue(state, action) for action in possible_actions}
        best_action = max(Q_values, key=lambda action: Q_values[action])

        return best_action

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, we should take a random action;
        otherwise, the best policy action (self.get_best_action).
        Note: To pick randomly from a list, use random.choice(list).
        To pick True or False with a given probability, generate a uniform number in [0, 1]
        and compare it with your probability.
        """

        # Pick Action
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        # agent parameters:
        epsilon = self.epsilon

        # Explore with probability epsilon, otherwise exploit the greedy action
        chosen_action = self.get_best_action(state) if random.random() > epsilon \
            else random.choice(possible_actions)

        return chosen_action
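
Since the qlearning.ipynb diff above is not rendered, the following is a minimal sketch of how an agent like this is typically trained on a discrete Gym environment. The Taxi-v2 environment, the hyperparameters, and the play_and_train helper are illustrative assumptions rather than content taken from the notebook.

# Minimal training sketch (illustrative assumptions, not from qlearning.ipynb).
import gym

env = gym.make("Taxi-v2")
n_actions = env.action_space.n

agent = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                       get_legal_actions=lambda s: range(n_actions))

def play_and_train(env, agent, t_max=10**4):
    """Run one episode, updating the agent after each transition; return total reward."""
    total_reward = 0.0
    s = env.reset()
    for t in range(t_max):
        a = agent.get_action(s)           # epsilon-greedy action
        next_s, r, done, _ = env.step(a)
        agent.update(s, a, r, next_s)     # Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma*V(s'))
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward

for i in range(1000):
    play_and_train(env, agent)
    agent.epsilon *= 0.99                 # gradually shift from exploring to exploiting
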
378 changes: 378 additions & 0 deletions Practical Reinforcement Learning/Week3_model_free/sarsa.ipynb

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions Practical Reinforcement Learning/Week3_model_free/submit.py
@@ -0,0 +1,60 @@
import sys
import numpy as np
sys.path.append("..")
import grading


def submit_experience_replay(rewards_replay, rewards_baseline, email, token):
    flag1 = np.mean(rewards_replay[:100]) - np.mean(rewards_baseline[:100])
    flag2 = np.mean(rewards_replay[-100:])
    flag3 = np.mean(rewards_baseline[-100:])

    grader = grading.Grader("XUt-8d7yEee8nwq8KJgXXg")
    grader.set_answer("iEQwT", flag1)
    grader.set_answer("8N1Wm", flag2)
    grader.set_answer("F0Am8", flag3)

    grader.submit(email, token)


def submit_qlearning1(rewards, email, token):
    flag1 = np.mean(rewards[-10:])

    grader = grading.Grader("XbjcGd7xEeeDzRKutDCmyA")
    grader.set_answer("5NB4z", flag1)

    grader.submit(email, token)


def submit_qlearning2(rewards, email, token):
    flag1 = np.mean(rewards[-10:])

    grader = grading.Grader("XbjcGd7xEeeDzRKutDCmyA")
    grader.set_answer("CkyJ4", flag1)

    grader.submit(email, token)


def submit_qlearning_all(rewards_q1, rewards_q2, email, token):
    grader = grading.Grader("XbjcGd7xEeeDzRKutDCmyA")

    flag1 = np.mean(rewards_q1[-10:])
    grader.set_answer("5NB4z", flag1)

    flag2 = np.mean(rewards_q2[-10:])
    grader.set_answer("CkyJ4", flag2)

    grader.submit(email, token)


def submit_sarsa(rewards_ql, rewards_sarsa, email, token):
    flag1 = np.mean(rewards_ql[-100:])
    flag2 = np.mean(rewards_sarsa[-100:])
    flag3 = np.mean(rewards_sarsa[-100:]) - np.mean(rewards_ql[-100:])

    grader = grading.Grader("pazQX97xEee_JA6t1Myltg")
    grader.set_answer("ZarWJ", flag1)
    grader.set_answer("izJi4", flag2)
    grader.set_answer("frgbU", flag3)

    grader.submit(email, token)
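
These helpers are meant to be called from the corresponding notebooks once training has produced lists of per-episode rewards. A hedged usage sketch follows; the reward variables, email, and token are placeholders, not values from this repository.

from submit import submit_sarsa

# rewards_ql and rewards_sarsa are per-episode reward lists collected during training.
submit_sarsa(rewards_ql, rewards_sarsa, "student@example.com", "YourCourseraToken")
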
