diff --git a/agent.py b/agent.py
new file mode 100644
index 0000000..23dc0a6
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,81 @@
+import time
+import numpy as np
+
+
+class AgentBase:
+    def __init__(self):
+        # sub modules
+        self._env = None
+        self._replay = None
+        # epsilon-greedy: _epsilon is the exploit probability, annealed from _epsilon_min up to _epsilon_max
+        self._epsilon = 0.2
+        self._epsilon_min = 0.2
+        self._epsilon_max = 0.8
+        # global_step
+        self._global_step = 0
+
+    def is_not_used(self):
+        pass
+
+    def _epsilon_greedy(self):
+        self._epsilon = np.tanh(0.02 * self._global_step)
+        self._epsilon = np.maximum(self._epsilon, self._epsilon_min)
+        self._epsilon = np.minimum(self._epsilon, self._epsilon_max)
+        return self._epsilon
+
+    def _train_impl(self, max_step):
+        # prepare for epsilon greedy
+        self._global_step += 1
+        # train
+        state = self._env.reset()
+        for step in range(max_step):
+            # 1. predict
+            action = self._explore(state)
+            # 2. action
+            state_, reward, done, info = self._env.step(action)
+            # 3. perceive
+            self._perceive(state, action, state_, reward, done)
+            # 4. update
+            state = state_
+            if done:
+                break
+
+    def _test_impl(self, max_step, delay, gui):
+        # test
+        total_reward = 0
+        state = self._env.reset()
+        for step in range(max_step):
+            if gui:
+                self._env.render()
+                time.sleep(delay)
+            # 1. predict
+            action = self._optimal_action(state)
+            # 2. action
+            state_, reward, done, info = self._env.step(action)
+            # 3. perceive
+            total_reward += reward
+            # 4. update
+            state = state_
+            if done:
+                break
+        return total_reward
+
+    def _explore(self, state):
+        epsilon = self._epsilon_greedy()
+        if np.random.uniform() > epsilon:
+            action = self._random_action(state)
+        else:
+            action = self._optimal_action(state)
+        return action
+
+    def _random_action(self, state):
+        self.is_not_used()
+        raise RuntimeError('function must be overridden.')
+
+    def _optimal_action(self, state):
+        self.is_not_used()
+        raise RuntimeError('function must be overridden.')
+
+    def _perceive(self, state, action, state_, reward, done):
+        self.is_not_used()
+        raise RuntimeError('function must be overridden.')
\ No newline at end of file
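
For reference, a minimal sketch (not part of this commit) of how a concrete agent plugs into AgentBase: _train_impl and _test_impl act as template methods, and a subclass only overrides the three hooks below. The RandomAgent class name and the CartPole environment id are illustrative assumptions.

import gym
import agent


class RandomAgent(agent.AgentBase):
    """Hypothetical example subclass; not part of the commit."""

    def __init__(self, env_id='CartPole-v0'):
        super().__init__()
        self._env = gym.make(env_id)

    def _random_action(self, state):
        # sample uniformly from the discrete action space
        return self._env.action_space.sample()

    def _optimal_action(self, state):
        # no learned policy yet, so fall back to a random action
        return self._random_action(state)

    def _perceive(self, state, action, state_, reward, done):
        # a learning agent would store the transition (e.g. in a replay buffer) here
        pass


if __name__ == '__main__':
    demo = RandomAgent()
    print(demo._test_impl(max_step=200, delay=0.0, gui=False))
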
diff --git a/dqn.py b/dqn.py
new file mode 100644
index 0000000..a68c889
--- /dev/null
+++ b/dqn.py
@@ -0,0 +1,73 @@
+import time
+import numpy as np
+import tensorflow as tf
+from tqdm import tqdm
+import agent
+import enviroment
+import history
+import layers
+
+
+class DQN(agent.AgentBase):
+    # Modeling
+    ENV_ID = 'CartPole-v0'
+    STATE_SPACE = 4
+    HIDDEN_NEURONS = 20
+    ACTION_SPACE = 2
+    # Training
+    LR = 1e-3  # learning rate
+
+    def __init__(self):
+        super().__init__()
+        # sub modules
+        self._env = enviroment.make(self.ENV_ID)
+        self._replay = history.ReplayBuffer()
+        # build network
+        self._net = self._build_network()
+
+    def _build_network(self):
+        # neural network
+        with tf.name_scope('input'):
+            state = tf.placeholder(dtype=tf.float32, shape=[None, self.STATE_SPACE])
+        with tf.name_scope('hidden'):
+            y1 = layers.fc(state, n_neurons=self.HIDDEN_NEURONS, activation=tf.nn.tanh)
+        with tf.name_scope('output'):
+            q_predict = layers.fc(y1, n_neurons=self.ACTION_SPACE)
+        # process q value
+        with tf.name_scope('q_value'):
+            action_mask = tf.placeholder(tf.float32, [None, self.ACTION_SPACE])
+            q_current = tf.reduce_sum(tf.multiply(q_predict, action_mask), axis=1)
+        # loss
+        with tf.name_scope('loss'):
+            q_target = tf.placeholder(tf.float32, [None])
+            loss = tf.reduce_mean(tf.squared_difference(q_current, q_target))
+            tf.summary.scalar('loss', loss)
+        # tensors needed by the training step
+        return {'state': state,
+                'action_mask': action_mask,
+                'q_current': q_current,
+                'q_target': q_target}
+
+    def train(self, episodes=500, max_step=200):
+        # alternate training episodes with a periodic rendered evaluation
+        # (every 50th episode is evaluated instead of trained)
+        for episode in tqdm(range(episodes)):
+            if episode % 50 == 0:
+                total_reward = self._test_impl(max_step, delay=0.2, gui=True)
+                tqdm.write('current reward: {total_reward}'.format(total_reward=total_reward))
+            else:
+                self._train_impl(max_step)
+
+    def test(self, episodes=1, max_step=200, delay=0.2, gui=True):
+        for episode in range(episodes):
+            total_reward = self._test_impl(max_step, delay, gui)
+            print('current reward: {total_reward}'.format(total_reward=total_reward))
+
+    def _random_action(self, state):
+        return self._env.action_space.sample()
+
+    def _optimal_action(self, state):
+        pass
+
+    def _perceive(self, state, action, state_, reward, done):
+        pass
diff --git a/enviroment.py b/enviroment.py
new file mode 100644
index 0000000..d508d64
--- /dev/null
+++ b/enviroment.py
@@ -0,0 +1,5 @@
+import gym
+
+
+def make(name):
+    return gym.make(name)
\ No newline at end of file
diff --git a/history.py b/history.py
new file mode 100644
index 0000000..252e6c4
--- /dev/null
+++ b/history.py
@@ -0,0 +1,20 @@
+import random
+
+
+class ReplayBuffer:
+    def __init__(self, buffer_size=512):
+        self.buffer_size = buffer_size
+        #
+        self.buffer = list()
+
+    def insert(self, *args):
+        if len(self.buffer) >= self.buffer_size:
+            self.buffer.pop(0)
+        self.buffer.append(args)
+
+    def get_batch(self, batch_size=64):
+        batch = random.sample(self.buffer, batch_size)
+        return zip(*batch)
+
+    def reset(self):
+        self.buffer.clear()
\ No newline at end of file
diff --git a/layers.py b/layers.py
new file mode 100644
index 0000000..9012135
--- /dev/null
+++ b/layers.py
@@ -0,0 +1,22 @@
+import tensorflow as tf
+
+
+def weight_variable(shape):
+    initial = tf.truncated_normal(shape, stddev=0.1)
+    return tf.Variable(initial)
+
+
+def bias_variable(shape):
+    initial = tf.truncated_normal(shape, stddev=0.01)
+    return tf.Variable(initial)
+
+
+def fc(x, n_neurons, activation=None):
+    n_inputs = int(x.get_shape()[1])
+    W = weight_variable(shape=[n_inputs, n_neurons])
+    b = bias_variable(shape=[n_neurons])
+    if activation is None:
+        y = tf.matmul(x, W) + b
+    else:
+        y = activation(tf.matmul(x, W) + b)
+    return y
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f8c046f
--- /dev/null
+++ b/main.py
@@ -0,0 +1,5 @@
+import dqn
+
+
+if __name__ == '__main__':
+    agent = dqn.DQN()
\ No newline at end of file
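
The commit wires up the action_mask and q_target placeholders but not yet the optimizer or the update step. For orientation only, a sketch of how a DQN update could build the feed for those placeholders, under the assumptions that the dict returned by _build_network is later extended with the raw q_predict tensor, that a tf.Session is available, and that a discount factor (GAMMA below) is introduced; none of these names are defined in this commit.

import numpy as np

GAMMA = 0.9  # assumed discount factor; not defined anywhere in the commit


def build_feed(net, sess, states, actions, next_states, rewards, dones, n_actions):
    # one-hot mask selecting Q(s, a) for the action actually taken in each transition
    action_mask = np.eye(n_actions)[np.asarray(actions, dtype=np.int64)]
    # Bellman target: r + GAMMA * max_a Q(s', a), with the bootstrap dropped on terminal steps
    q_next = sess.run(net['q_predict'], feed_dict={net['state']: next_states})
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    q_target = np.asarray(rewards) + GAMMA * np.max(q_next, axis=1) * not_done
    return {net['state']: states,
            net['action_mask']: action_mask,
            net['q_target']: q_target}

A training step would then draw states, actions, next_states, rewards and dones from ReplayBuffer.get_batch() and run the loss/optimizer ops with this feed.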