forked from IntManear/MountainCar-QLearning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQLearning.py
160 lines (126 loc) · 4.89 KB
/
QLearning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
"""QLearning.ipynb
Automatically generated by Colaboratory.
"""
#remove " > /dev/null 2>&1" to see what is going on under the hood
# !pip install gym pyvirtualdisplay > /dev/null 2>&1
# !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# !apt-get update > /dev/null 2>&1
# !apt-get install cmake > /dev/null 2>&1
# !pip install --upgrade setuptools 2>&1
# !pip install ez_setup > /dev/null 2>&1
# !pip install gym[atari] > /dev/null 2>&1
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
# %matplotlib inline
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""
def show_video():
mp4list = glob.glob('video/*.mp4')
if len(mp4list) > 0:
mp4 = mp4list[0]
video = io.open(mp4, 'r+b').read()
encoded = base64.b64encode(video)
ipythondisplay.display(HTML(data='''<video alt="test" autoplay
loop controls style="height: 400px;">
<source src="data:video/mp4;base64,{0}" type="video/mp4" />
</video>'''.format(encoded.decode('ascii'))))
else:
print("Could not find video")
def wrap_env(env):
env = Monitor(env, './video', force=True)
return env
env = wrap_env(gym.make('MountainCar-v0'))
env.reset()
#check out the Mountain Car action space!
print(env.action_space)
def QLearning(env, learning, discount, epsilon, min_eps, episodes):
# Determine size of discretized state space
num_states = (env.observation_space.high - env.observation_space.low)*\
np.array([10, 100])
num_states = np.round(num_states, 0).astype(int) + 1
# Initialize Q table
Q = np.random.uniform(low = -1, high = 1,
size = (num_states[0], num_states[1],
env.action_space.n))
# Initialize variables to track rewards
reward_list = []
ave_reward_list = []
# Calculate episodic reduction in epsilon
reduction = (epsilon - min_eps)/episodes
# Run Q learning algorithm
for i in range(episodes):
# Initialize parameters
done = False
tot_reward, reward = 0,0
state = env.reset()
# Discretize state
state_adj = (state - env.observation_space.low)*np.array([10, 100])
state_adj = np.round(state_adj, 0).astype(int)
while done != True:
# Render environment for last five episodes
if i >= (episodes - 20):
env.render()
# Determine next action - epsilon greedy strategy
if np.random.random() < 1 - epsilon:
action = np.argmax(Q[state_adj[0], state_adj[1]])
else:
action = np.random.randint(0, env.action_space.n)
# Get next state and reward
state2, reward, done, info = env.step(action)
# Discretize state2
state2_adj = (state2 - env.observation_space.low)*np.array([10, 100])
state2_adj = np.round(state2_adj, 0).astype(int)
#Allow for terminal states
if done and state2[0] >= 0.5:
Q[state_adj[0], state_adj[1], action] = reward
# Adjust Q value for current state
else:
delta = learning*(reward +
discount*np.max(Q[state2_adj[0],
state2_adj[1]]) -
Q[state_adj[0], state_adj[1],action])
Q[state_adj[0], state_adj[1],action] += delta
# Update variables
tot_reward += reward
state_adj = state2_adj
# Decay epsilon
if epsilon > min_eps:
epsilon -= reduction
# Track rewards
reward_list.append(tot_reward)
if (i+1) % 100 == 0:
ave_reward = np.mean(reward_list)
ave_reward_list.append(ave_reward)
reward_list = []
if (i+1) % 100 == 0:
print('Episode {} Average Reward: {}'.format(i+1, ave_reward))
env.close()
return ave_reward_list
# Run Q-learning algorithm
rewards = QLearning(env, 0.2, 0.9, 0.8, 0, 10000)
# Plot Rewards
plt.plot(100*(np.arange(len(rewards)) + 1), rewards)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes')
plt.savefig('rewards.jpg')
plt.show()
plt.close()
show_video()