# tangrai_agent.py
# EXECUTE ENVIRONMENT
import copy
import random
import time
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

import tangrai  # importing the package registers the TangrAI-v0 environment with gym
class Agent():
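    """Deep Q-learning agent for the TangrAI (Tangram) gym environment.

    Each state is a flat vector of ``state_size`` values (reshaped to 20x10 for
    plotting: the first ten rows are the board, the last ten the current piece).
    Actions index the XY position where the next piece is placed. The agent acts
    epsilon-greedily and trains a Keras network on minibatches sampled from a
    replay memory.
    """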
    def __init__(self, env_id, path, episodes, max_env_steps, win_threshold, epsilon_decay,
                 state_size=None, action_size=None, epsilon=1.0, epsilon_min=0.01,
                 gamma=1.0, learning_rate=.001, alpha_decay=.01, batch_size=16, prints=False):
        self.memory = deque(maxlen=100000)  # experience replay buffer
        self.env = gym.make(env_id)

        if state_size is None:
            self.state_size = self.env.observation_space.n
        else:
            self.state_size = state_size

        if action_size is None:
            self.action_size = self.env.action_space.n
        else:
            self.action_size = action_size

        self.episodes = episodes
        self.env._max_episode_steps = max_env_steps
        self.win_threshold = win_threshold
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.gamma = gamma
        self.alpha_decay = alpha_decay
        self.batch_size = batch_size
        self.path = path                    # location where the model weights are saved
        self.prints = prints                # if True, the agent prints its scores
        self.learning_rate = learning_rate

        self.model = self.NN_model()
        self.mse_list = []                  # per-replay MSE history for monitoring
    def NN_model(self):
        """Build the Q-network: state vector in, one score per action out."""
        model = Sequential()
        model.add(Dense(256 * 2, input_dim=self.state_size, activation='relu'))
        model.add(Dense(128 * 2, activation='relu'))
        model.add(Dense(64 * 2, activation='relu'))
        model.add(Dense(32 * 2, activation='relu'))
        model.add(Dense(self.action_size, activation='softmax'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate, decay=self.alpha_decay),
                      metrics=['mse'])
        return model
    def act(self, state):
        # Epsilon-greedy policy: explore with probability epsilon, otherwise
        # pick the action with the highest predicted Q-value.
        if np.random.random() <= self.epsilon:
            print(self.model.predict(state))  # current Q-values for this state
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state))  # returns the XY position of the next piece
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    def replay(self, batch_size):
        # Fit the network on a random minibatch of remembered transitions.
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state)
            # Q-learning target: r for terminal steps, r + gamma * max_a' Q(s', a') otherwise
            y_target[0][action] = reward if done else reward + self.gamma * np.max(self.model.predict(next_state)[0])
            x_batch.append(state[0])
            y_batch.append(y_target[0])
        history = self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0)
        self.mse_list = np.append(self.mse_list, history.history['mse'])
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    def train(self):
        for episode in range(self.episodes):    # number of desired episodes
            state = self.env.reset()            # empty the board and reset the score
            done = False
            counter_steps = 0
            score = 0
            if episode % 10 == 0:               # report progress every 10 episodes
                print('EPISODE', episode)
                print('Mean MSE', np.mean(self.mse_list))
            for _ in range(self.env._max_episode_steps):    # number of movements = pieces = 7
                action_space = self.act(state)               # returns the XY of the next piece
                next_state, reward, done, _ = self.env.step(action_space)   # movement -> reward
                self.remember(state, action_space, reward, next_state, done)  # store the transition
                self.replay(self.batch_size)                 # fit the model on past transitions
                score += reward                              # add up the score
                state = next_state
                # Uncomment this to see the plots
                # board_ = copy.deepcopy(state)
                # board_ = np.reshape(board_, (20, 10))
                # plt.imshow(board_)
                # plt.show()
                # # Board and piece in separate plots
                # print('#' * 10)
                # plt.figure()
                # plt.title('Board')
                # plt.imshow(board_[:10])
                # plt.show()
                # plt.figure()
                # plt.title('Piece')
                # plt.imshow(board_[10:])
                # plt.show()
                if counter_steps == 6:          # all 7 pieces placed -> episode is finished
                    done = True
                    break
                else:
                    done = False
                    counter_steps += 1
            if self.prints:
                print('Episode', episode, 'score:', score)
            self.model.save_weights(self.path)  # checkpoint the weights after every episode
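

# Illustrative helper, not part of the original script: a sketch of how the
# trained policy could be rolled out greedily (epsilon temporarily set to 0) to
# inspect what the agent has learned. The function name and signature are
# assumptions for illustration; it only reuses attributes defined on Agent above.
def greedy_rollout(agent, n_steps=7):
    """Play one episode without exploration and return the total reward."""
    old_epsilon, agent.epsilon = agent.epsilon, 0.0   # disable exploration
    state = agent.env.reset()
    total_reward = 0
    for _ in range(n_steps):                          # n_steps mirrors max_env_steps
        action = agent.act(state)                     # greedy action from the Q-network
        state, reward, done, _ = agent.env.step(action)
        total_reward += reward
        if done:
            break
    agent.epsilon = old_epsilon                       # restore the exploration rate
    return total_reward
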
if __name__ == "__main__":
    agent = Agent(env_id='TangrAI-v0',
                  path='model/model_RL.h5',
                  episodes=5000,
                  max_env_steps=7,
                  win_threshold=None,
                  epsilon_decay=1,       # has a strong effect on the model
                  state_size=200,
                  action_size=None,
                  epsilon=0.8,
                  epsilon_min=0.01,
                  gamma=0.8,
                  learning_rate=.001,
                  alpha_decay=0.1,
                  batch_size=4,
                  prints=True)
    agent.train()
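    # Example use of the illustrative greedy_rollout helper defined above,
    # kept commented out so the original training behaviour is unchanged:
    # print('Greedy rollout score:', greedy_rollout(agent))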