Commit 95c4c02 (1 parent: 53ae2cf)
Practical Reinforcement Learning Week3

Showing 6 changed files with 1,591 additions and 0 deletions.
Binary file added (+582 KB): Practical Reinforcement Learning/Week3_model_free/QUIZ Model-free reinforcement learning.pdf
377 additions, 0 deletions: Practical Reinforcement Learning/Week3_model_free/experience_replay.ipynb (large diff not rendered)
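Since the experience_replay.ipynb diff is not rendered above, here is a rough sketch, purely for orientation, of the kind of uniform replay buffer such a notebook typically implements; this is an assumption about its contents, not code from this commit.

import random

class ReplayBuffer:
    """Fixed-size buffer of (s, a, r, s', done) transitions with uniform sampling."""

    def __init__(self, size):
        self._storage = []
        self._maxsize = size

    def add(self, s, a, r, next_s, done):
        # Drop the oldest transition once the buffer is full
        if len(self._storage) >= self._maxsize:
            self._storage.pop(0)
        self._storage.append((s, a, r, next_s, done))

    def sample(self, batch_size):
        # Sample uniformly with replacement, as simple implementations usually do
        return [random.choice(self._storage) for _ in range(batch_size)]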
662 additions, 0 deletions: Practical Reinforcement Learning/Week3_model_free/qlearning.ipynb (large diff not rendered)
114 additions, 0 deletions: Practical Reinforcement Learning/Week3_model_free/qlearning.py
from collections import defaultdict
import random
import math
import numpy as np


class QLearningAgent:
    def __init__(self, alpha, epsilon, discount, get_legal_actions):
        """
        Q-Learning Agent
        based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html

        Instance variables you have access to
          - self.epsilon (exploration prob)
          - self.alpha (learning rate)
          - self.discount (discount rate aka gamma)

        Functions you should use
          - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}
            which returns legal actions for a state
          - self.get_qvalue(state, action)
            which returns Q(state, action)
          - self.set_qvalue(state, action, value)
            which sets Q(state, action) := value

        !!!Important!!!
        Note: please avoid using self._qvalues directly.
        Use the dedicated self.get_qvalue/set_qvalue methods instead.
        """
        self.get_legal_actions = get_legal_actions
        self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def get_qvalue(self, state, action):
        """ Returns Q(state, action) """
        return self._qvalues[state][action]

    def set_qvalue(self, state, action, value):
        """ Sets the Q-value for [state, action] to the given value """
        self._qvalues[state][action] = value

    # ---------------------START OF YOUR CODE---------------------#

    def get_value(self, state):
        """
        Compute your agent's estimate of V(s) using current q-values:
        V(s) = max_over_action Q(state, action) over possible actions.
        Note: please take into account that q-values can be negative.
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        value = max(self.get_qvalue(state, action) for action in possible_actions)

        return value

    def update(self, state, action, reward, next_state):
        """
        Q-value update:
        Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        """
        # agent parameters
        gamma = self.discount
        learning_rate = self.alpha

        Qsa = ((1 - learning_rate) * self.get_qvalue(state, action)
               + learning_rate * (reward + gamma * self.get_value(next_state)))

        self.set_qvalue(state, action, Qsa)

    def get_best_action(self, state):
        """
        Compute the best action to take in a state (using current q-values).
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        q_values = {action: self.get_qvalue(state, action) for action in possible_actions}
        best_action = max(q_values, key=lambda action: q_values[action])

        return best_action

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, take a random action;
        otherwise take the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probability, generate a uniform
              number in [0, 1] and compare it with your probability.
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        # (check this before sampling from a possibly empty action list)
        if len(possible_actions) == 0:
            return None

        # agent parameters
        epsilon = self.epsilon

        # Epsilon-greedy: explore with probability epsilon, exploit otherwise
        if random.random() > epsilon:
            chosen_action = self.get_best_action(state)
        else:
            chosen_action = random.choice(possible_actions)

        return chosen_action
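For context, this agent is usually exercised with a small training loop like the sketch below. It is an illustration only: it assumes an OpenAI Gym environment (e.g. Taxi-v3), the pre-0.26 Gym reset/step API, and the helper name play_and_train, none of which appear in this diff.

import gym

env = gym.make("Taxi-v3")
n_actions = env.action_space.n

agent = QLearningAgent(
    alpha=0.5, epsilon=0.25, discount=0.99,
    get_legal_actions=lambda s: range(n_actions))

def play_and_train(env, agent, t_max=10**4):
    """Run one episode, updating the agent after every step; return the total reward."""
    total_reward = 0.0
    s = env.reset()  # older Gym API: reset() returns the observation directly
    for t in range(t_max):
        a = agent.get_action(s)          # epsilon-greedy action
        next_s, r, done, _ = env.step(a)
        agent.update(s, a, r, next_s)    # Q-learning update
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward

rewards = [play_and_train(env, agent) for _ in range(1000)]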
378 additions, 0 deletions: Practical Reinforcement Learning/Week3_model_free/sarsa.ipynb (large diff not rendered)
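The sarsa.ipynb diff is likewise not rendered. As an assumed sketch only (the class name EVSarsaAgent and its body are not taken from this commit), an Expected Value SARSA agent is commonly written as a QLearningAgent subclass that replaces the max in V(s) with an expectation under the epsilon-greedy policy:

class EVSarsaAgent(QLearningAgent):
    """Expected Value SARSA: V(s) = E_{a ~ policy}[Q(s, a)] instead of max_a Q(s, a)."""

    def get_value(self, state):
        possible_actions = self.get_legal_actions(state)
        if len(possible_actions) == 0:
            return 0.0

        n = len(possible_actions)
        best_action = max(possible_actions, key=lambda a: self.get_qvalue(state, a))

        # Epsilon-greedy policy: probability epsilon/n for every action,
        # plus (1 - epsilon) extra mass on the greedy action.
        value = 0.0
        for a in possible_actions:
            p = self.epsilon / n + ((1 - self.epsilon) if a == best_action else 0.0)
            value += p * self.get_qvalue(state, a)
        return value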
60 additions, 0 deletions: Practical Reinforcement Learning/Week3_model_free/submit.py
import sys
import numpy as np
sys.path.append("..")
import grading


def submit_experience_replay(rewards_replay, rewards_baseline, email, token):
    # Compare the replay agent with the baseline over the first and last 100 episodes
    flag1 = np.mean(rewards_replay[:100]) - np.mean(rewards_baseline[:100])
    flag2 = np.mean(rewards_replay[-100:])
    flag3 = np.mean(rewards_baseline[-100:])

    grader = grading.Grader("XUt-8d7yEee8nwq8KJgXXg")
    grader.set_answer("iEQwT", flag1)
    grader.set_answer("8N1Wm", flag2)
    grader.set_answer("F0Am8", flag3)

    grader.submit(email, token)


def submit_qlearning1(rewards, email, token):
    # Mean reward over the last 10 episodes for the first Q-learning task
    flag1 = np.mean(rewards[-10:])

    grader = grading.Grader("XbjcGd7xEeeDzRKutDCmyA")
    grader.set_answer("5NB4z", flag1)

    grader.submit(email, token)


def submit_qlearning2(rewards, email, token):
    # Mean reward over the last 10 episodes for the second Q-learning task
    flag1 = np.mean(rewards[-10:])

    grader = grading.Grader("XbjcGd7xEeeDzRKutDCmyA")
    grader.set_answer("CkyJ4", flag1)

    grader.submit(email, token)


def submit_qlearning_all(rewards_q1, rewards_q2, email, token):
    # Submit both Q-learning answers in a single request
    grader = grading.Grader("XbjcGd7xEeeDzRKutDCmyA")

    flag1 = np.mean(rewards_q1[-10:])
    grader.set_answer("5NB4z", flag1)

    flag2 = np.mean(rewards_q2[-10:])
    grader.set_answer("CkyJ4", flag2)

    grader.submit(email, token)


def submit_sarsa(rewards_ql, rewards_sarsa, email, token):
    # Mean rewards over the last 100 episodes for Q-learning and SARSA,
    # plus the SARSA-minus-Q-learning difference
    flag1 = np.mean(rewards_ql[-100:])
    flag2 = np.mean(rewards_sarsa[-100:])
    flag3 = np.mean(rewards_sarsa[-100:]) - np.mean(rewards_ql[-100:])

    grader = grading.Grader("pazQX97xEee_JA6t1Myltg")
    grader.set_answer("ZarWJ", flag1)
    grader.set_answer("izJi4", flag2)
    grader.set_answer("frgbU", flag3)

    grader.submit(email, token)
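For completeness, a notebook typically calls these helpers roughly as shown below; the reward lists, email, and token are placeholders for illustration, not values from this commit.

# Hypothetical usage with placeholder data and credentials
rewards_q1 = [2.0] * 100
rewards_q2 = [5.0] * 100
submit_qlearning_all(rewards_q1, rewards_q2, "learner@example.com", "your-token")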