cartpole_dqn.py
import sys
import gym
import matplotlib
matplotlib.use("PS")
from matplotlib import pyplot as plt
import random
import numpy as np
from collections import deque
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 1000 #Maximum number of episodes

#DQN Agent for the CartPole task
#Q-function approximation with a neural network, experience replay, and a target network
class DQNAgent:
    #Constructor for the agent (invoked when DQN is first called in main)
    def __init__(self, state_size, action_size):
        self.check_solve = True #If True, stop once the solution condition is satisfied
        self.render = False #If you want to see CartPole learning, then change to True
        #Get size of state and action
        self.state_size = state_size
        self.action_size = action_size
        ################################################################################
        ################################################################################
        #Set hyperparameters for the DQN. Do not adjust those labeled as Fixed.
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 0.02 #Fixed
        self.batch_size = 32 #Fixed
        self.memory_size = 5000
        self.train_start = 1000 #Fixed
        self.target_update_frequency = 2
        ################################################################################
        ################################################################################
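        #discount_factor: weight on future rewards in the Q-learning target
        #learning_rate: Adam step size for the online network
        #epsilon: exploration probability of the e-greedy policy
        #memory_size: replay buffer capacity (oldest samples are evicted first)
        #train_start: minimum number of stored samples before training begins
        #target_update_frequency: episodes between hard target-network updates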
        #Number of test states for Q value plots
        self.test_state_no = 10000
        #Create memory buffer using deque
        self.memory = deque(maxlen=self.memory_size)
        #Create main network and target network (using build_model defined below)
        self.model = self.build_model()
        self.target_model = self.build_model()
        #Initialize target network
        self.update_target_model()
    #Approximate Q function using Neural Network
    #State is the input and the Q Values are the output.
    ###############################################################################
    ###############################################################################
    #Edit the Neural Network model here
    #Tip: Consult https://keras.io/getting-started/sequential-model-guide/
    def build_model(self):
        model = Sequential()
        model.add(Dense(12, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(12, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model
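    #The model maps a state vector to one Q-value per action: two hidden ReLU
    #layers of 12 units each, He-uniform initialization, and a linear output
    #layer (Q-values are unbounded), trained with an MSE loss against the
    #bootstrapped targets constructed in train_model() below.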
    ###############################################################################
    ###############################################################################
    #After some time interval, update the target model to be the same as the model
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
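        #This is a "hard" update: the target network becomes an exact copy of
        #the online network, which keeps the bootstrapped targets stable between syncs.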
    #Get action from model using epsilon-greedy policy
    def get_action(self, state):
        ###############################################################################
        ###############################################################################
        #Insert your e-greedy policy code here
        #Tip 1: Use the random package to generate a random action.
        #Tip 2: Use keras.model.predict() to compute Q-values from the state.
        if np.random.random() < self.epsilon:
            action = random.randrange(self.action_size)
        else:
            Q = self.model.predict(state)
            action = np.argmax(Q)
        return action
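        #As a formula, with |A| actions the resulting policy is:
        #  pi(a|s) = 1 - epsilon + epsilon/|A|   if a = argmax_a' Q(s, a')
        #  pi(a|s) = epsilon/|A|                 otherwise
        #(the random branch may also draw the greedy action, hence the epsilon/|A| term)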
    ###############################################################################
    ###############################################################################
    #Save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done)) #Add sample to the end of the deque
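        #Once the deque reaches memory_size, appending automatically discards the oldest sample.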
    #Sample <s,a,r,s'> from replay memory
    def train_model(self):
        if len(self.memory) < self.train_start: #Do not train if not enough memory
            return
        batch_size = min(self.batch_size, len(self.memory)) #Train on at most as many samples as you have in memory
        mini_batch = random.sample(self.memory, batch_size) #Uniformly sample the memory buffer
        #Preallocate network and target network input arrays.
        update_input = np.zeros((batch_size, self.state_size)) #batch_size by state_size two-dimensional array (not matrix!)
        update_target = np.zeros((batch_size, self.state_size)) #Same as above, but used for the target network
        action, reward, done = [], [], [] #Empty lists that will grow dynamically
        for i in range(batch_size): #Use batch_size, not self.batch_size, in case fewer samples are available
            update_input[i] = mini_batch[i][0] #Store s(i) in the network input array
            action.append(mini_batch[i][1]) #Store a(i)
            reward.append(mini_batch[i][2]) #Store r(i)
            update_target[i] = mini_batch[i][3] #Store s'(i) in the target network input array
            done.append(mini_batch[i][4]) #Store done(i)
        target = self.model.predict(update_input) #Q-values of the online network at s; overwritten below to form the training targets
        target_val = self.target_model.predict(update_target) #Q-values of the target network at s', used for bootstrapping
        #Q Learning: get maximum Q value at s' from target network
        ###############################################################################
        ###############################################################################
        #Insert your Q-learning code here
        #Tip 1: Observe that the Q-values are stored in the variable target
        #Tip 2: What is the Q-value of the action taken at the last state of the episode?
        for i in range(batch_size): #For every sample in the batch
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor*np.amax(target_val[i,:])
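        #This implements the Q-learning target
        #  y_i = r_i                                                 if s'(i) is terminal
        #  y_i = r_i + discount_factor * max_a' Q_target(s'(i), a')  otherwise
        #Only the entry for the taken action a(i) is overwritten, so the MSE
        #loss is zero for every other action.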
        ###############################################################################
        ###############################################################################
        #Train the inner loop network
        self.model.fit(update_input, target, batch_size=batch_size,
                       epochs=1, verbose=0)
        return
    #Plots the score per episode as well as the maximum Q value per episode, averaged over precollected states.
    def plot_data(self, episodes, scores, max_q_mean):
        plt.figure(0)
        plt.plot(episodes, max_q_mean, 'b')
        plt.xlabel("Episodes")
        plt.ylabel("Average Q Value")
        plt.savefig("qvalues.png")
        np.savetxt('max_q_mean.data', max_q_mean)
        plt.figure(1)
        plt.plot(episodes, scores, 'b')
        plt.xlabel("Episodes")
        plt.ylabel("Score")
        plt.savefig("scores.png")
        np.savetxt('scores.data', scores)

if __name__ == "__main__":
    #For CartPole-v0, the maximum episode length is 200
    env = gym.make('CartPole-v0') #Generate CartPole-v0 environment object from the gym library
    #Get state and action sizes from the environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    #Create agent, see the DQNAgent __init__ method for details
    agent = DQNAgent(state_size, action_size)
    #Collect test states for plotting Q values using a uniform random policy
    test_states = np.zeros((agent.test_state_no, state_size))
    max_q = np.zeros((EPISODES, agent.test_state_no))
    max_q_mean = np.zeros((EPISODES, 1))
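    #max_q[e, i] will hold max_a Q(test_state_i, a) at episode e;
    #max_q_mean[e] is its average over all test states.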
    done = True
    for i in range(agent.test_state_no):
        if done:
            done = False
            state = env.reset()
            state = np.reshape(state, [1, state_size])
            test_states[i] = state
        else:
            action = random.randrange(action_size)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            test_states[i] = state
            state = next_state
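    #These fixed states are reused every episode to track the average maximum
    #Q-value, a less noisy indicator of learning progress than the raw episodic score.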
    scores, episodes = [], [] #Dynamically growing lists of scores and episode indices
    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset() #Initialize/reset the environment
        state = np.reshape(state, [1, state_size]) #Reshape state to a 1 by state_size two-dimensional array, i.e. [x_1, x_2] to [[x_1, x_2]]
        #Compute Q values for plotting
        tmp = agent.model.predict(test_states)
        max_q[e][:] = np.max(tmp, axis=1)
        max_q_mean[e] = np.mean(max_q[e][:])
        while not done:
            if agent.render:
                env.render() #Show cartpole animation
            #Get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size]) #Reshape next_state similarly to state
            #Save sample <s, a, r, s'> to the replay memory
            agent.append_sample(state, action, reward, next_state, done)
            #Training step
            agent.train_model()
            score += reward #Accumulate episodic reward
            state = next_state #Propagate state
            if done:
                #Every target_update_frequency episodes, update the target network
                if e % agent.target_update_frequency == 0:
                    agent.update_target_model()
                #Record the play time for every episode
                scores.append(score)
                episodes.append(e)
                print("episode:", e, " score:", score, " q_value:", max_q_mean[e], " memory length:",
                      len(agent.memory))
                #If the mean score over the last 100 episodes is at least 195
                #(the official "solved" criterion for CartPole-v0), stop training
                if agent.check_solve and len(scores) > 100:
                    if np.mean(scores[-min(100, len(scores)):]) >= 195:
                        print("solved after", e-100, "episodes")
                        agent.plot_data(episodes, scores, max_q_mean[:e+1])
                        sys.exit()
    agent.plot_data(episodes, scores, max_q_mean)
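#Running this script (python cartpole_dqn.py) trains the agent and writes the
#Q-value and score curves to qvalues.png and scores.png in the working directory.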