
Commit

init
ytgui committed Jul 12, 2017
1 parent c2caa3d commit 7b7b31f
Showing 6 changed files with 206 additions and 0 deletions.
81 changes: 81 additions & 0 deletions agent.py
@@ -0,0 +1,81 @@
import time
import numpy as np


class AgentBase:
    def __init__(self):
        # sub modules, set by the concrete agent
        self._env = None
        self._replay = None
        # epsilon greedy
        self._epsilon = 0.2
        self._epsilon_min = 0.2
        self._epsilon_max = 0.8
        # global step
        self._global_step = 0

    def is_not_used(self):
        pass

    def _epsilon_greedy(self):
        # here epsilon is the probability of acting greedily: it grows with
        # the global step via tanh and is clamped to [epsilon_min, epsilon_max]
        self._epsilon = np.tanh(0.02 * self._global_step)
        self._epsilon = np.maximum(self._epsilon, self._epsilon_min)
        self._epsilon = np.minimum(self._epsilon, self._epsilon_max)
        return self._epsilon

    def _train_impl(self, max_step):
        # prepare for epsilon greedy
        self._global_step += 1
        # train
        state = self._env.reset()
        for step in range(max_step):
            # 1. predict
            action = self._explore(state)
            # 2. action
            state_, reward, done, info = self._env.step(action)
            # 3. perceive
            self._perceive(state, action, state_, reward, done)
            # 4. update
            state = state_
            if done:
                break

    def _test_impl(self, max_step, delay, gui):
        # test
        total_reward = 0
        state = self._env.reset()
        for step in range(max_step):
            if gui:
                self._env.render()
                time.sleep(delay)
            # 1. predict
            action = self._optimal_action(state)
            # 2. action
            state_, reward, done, info = self._env.step(action)
            # 3. perceive
            total_reward += reward
            # 4. update
            state = state_
            if done:
                break
        return total_reward

    def _explore(self, state):
        # act randomly with probability (1 - epsilon), greedily otherwise
        epsilon = self._epsilon_greedy()
        if np.random.uniform() > epsilon:
            action = self._random_action(state)
        else:
            action = self._optimal_action(state)
        return action

    def _random_action(self, state):
        self.is_not_used()
        raise RuntimeError('function must be overridden.')

    def _optimal_action(self, state):
        self.is_not_used()
        raise RuntimeError('function must be overridden.')

    def _perceive(self, state, action, state_, reward, done):
        self.is_not_used()
        raise RuntimeError('function must be overridden.')
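
Note: a concrete agent is expected to override the three hooks that raise RuntimeError above. A minimal hypothetical sketch (not part of this commit), assuming the CartPole environment and gym's action_space.sample():

import agent
import enviroment


class RandomAgent(agent.AgentBase):
    def __init__(self):
        super().__init__()
        self._env = enviroment.make('CartPole-v0')

    def _random_action(self, state):
        return self._env.action_space.sample()

    def _optimal_action(self, state):
        # no learned policy in this toy example, so fall back to random
        return self._env.action_space.sample()

    def _perceive(self, state, action, state_, reward, done):
        pass  # nothing to store here
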
73 changes: 73 additions & 0 deletions dqn.py
@@ -0,0 +1,73 @@
import time
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import agent
import enviroment
import history
import layers


class DQN(agent.AgentBase):
    # Modeling
    ENV_ID = 'CartPole-v0'
    STATE_SPACE = 4
    HIDDEN_NEURONS = 20
    ACTION_SPACE = 2
    # Training
    LR = 1e-3  # learning rate

    def __init__(self):
        super().__init__()
        # sub modules
        self._env = enviroment.make(self.ENV_ID)
        self._replay = history.ReplayBuffer()
        # build network
        self._net = self._build_network()

    def _build_network(self):
        # neural network
        with tf.name_scope('input'):
            state = tf.placeholder(dtype=tf.float32, shape=[None, self.STATE_SPACE])
        with tf.name_scope('hidden'):
            y1 = layers.fc(state, n_neurons=self.HIDDEN_NEURONS, activation=tf.nn.tanh)
        with tf.name_scope('output'):
            q_predict = layers.fc(y1, n_neurons=self.ACTION_SPACE)
        # process q value: mask out the Q-value of the action actually taken
        with tf.name_scope('q_value'):
            action_mask = tf.placeholder(tf.float32, [None, self.ACTION_SPACE])
            q_current = tf.reduce_sum(tf.multiply(q_predict, action_mask), axis=1)
        # loss
        with tf.name_scope('loss'):
            q_target = tf.placeholder(tf.float32, [None])
            loss = tf.reduce_mean(tf.squared_difference(q_current, q_target))
            tf.summary.scalar('loss', loss)
        # NOTE: no optimizer / train op is created yet in this commit
        return {'state': state,
                'action_mask': action_mask,
                'q_current': q_current,
                'q_target': q_target}

    def train(self, episodes=500, max_step=200):
        # alternate training with a periodic rendered evaluation
        for episode in tqdm(range(episodes)):
            if episode % 50 == 0:
                total_reward = self._test_impl(max_step, delay=0.2, gui=True)
                tqdm.write('current reward: {total_reward}'.format(total_reward=total_reward))
            else:
                self._train_impl(max_step)

    def test(self, episodes=1, max_step=200, delay=0.2, gui=True):
        for episode in range(episodes):
            total_reward = self._test_impl(max_step, delay, gui)
            print('current reward: {total_reward}'.format(total_reward=total_reward))

    def _random_action(self, state):
        pass  # placeholder, not implemented yet

    def _optimal_action(self, state):
        pass  # placeholder, not implemented yet

    def _perceive(self, state, action, state_, reward, done):
        pass  # placeholder, not implemented yet
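
Note: the dict returned by _build_network only exposes the placeholders and the masked Q-value; no session or optimizer exists in this commit. A hedged sketch of how a batch update could feed these tensors, assuming a hypothetical tf.Session `sess` and a `train_op` built later from the loss (e.g. tf.train.AdamOptimizer(DQN.LR).minimize(loss)):

import numpy as np


def train_step(sess, net, train_op, states, actions, targets, action_space=2):
    # one-hot mask that selects the Q-value of the action actually taken
    mask = np.eye(action_space)[np.asarray(actions)]
    feed = {net['state']: states,
            net['action_mask']: mask,
            net['q_target']: targets}
    _, q_taken = sess.run([train_op, net['q_current']], feed_dict=feed)
    return q_taken
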
5 changes: 5 additions & 0 deletions enviroment.py
@@ -0,0 +1,5 @@
import gym


def make(name):
    return gym.make(name)
20 changes: 20 additions & 0 deletions history.py
@@ -0,0 +1,20 @@
import random


class ReplayBuffer:
    def __init__(self, buffer_size=512):
        self.buffer_size = buffer_size
        self.buffer = list()

    def insert(self, *args):
        # drop the oldest transition once the buffer is full
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
        self.buffer.append(args)

    def get_batch(self, batch_size=64):
        # returns the batch transposed: one tuple per transition field
        batch = random.sample(self.buffer, batch_size)
        return zip(*batch)

    def reset(self):
        self.buffer.clear()
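
Note: a hypothetical usage sketch (not part of this commit). insert() stores one transition per call and get_batch() returns the sampled batch transposed into per-field tuples; random.sample raises ValueError if the buffer holds fewer than batch_size transitions, so the caller must fill it first.

import history

replay = history.ReplayBuffer(buffer_size=512)
for _ in range(128):
    replay.insert([0.0, 0.0, 0.0, 0.0], 0, [0.1, 0.0, 0.0, 0.0], 1.0, False)

states, actions, states_, rewards, dones = replay.get_batch(batch_size=64)
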
22 changes: 22 additions & 0 deletions layers.py
@@ -0,0 +1,22 @@
import tensorflow as tf


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)


def fc(x, n_neurons, activation=None):
    # the input width must be static to build the weight variable;
    # tf.shape(x) is a runtime tensor, so read the static shape instead
    n_inputs = int(x.get_shape()[1])
    W = weight_variable(shape=[n_inputs, n_neurons])
    b = bias_variable(shape=[n_neurons])
    if activation is None:
        y = tf.matmul(x, W) + b
    else:
        y = activation(tf.matmul(x, W) + b)
    return y
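
Note: a hypothetical usage sketch (not part of this commit) mirroring the two-layer head built in dqn.py; fc reads the static input width, so the placeholder's second dimension must be known.

import tensorflow as tf
import layers

state = tf.placeholder(tf.float32, shape=[None, 4])
hidden = layers.fc(state, n_neurons=20, activation=tf.nn.tanh)  # tanh hidden layer
q_values = layers.fc(hidden, n_neurons=2)                       # linear output layer
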
5 changes: 5 additions & 0 deletions main.py
@@ -0,0 +1,5 @@
import dqn


if __name__ == '__main__':
    agent = dqn.DQN()
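
Note: main.py currently only constructs the agent. A hypothetical sketch of driving the existing train()/test() entry points (learning itself is still stubbed out in this commit):

import dqn

if __name__ == '__main__':
    agent = dqn.DQN()
    agent.train(episodes=500, max_step=200)
    agent.test(episodes=1, max_step=200, delay=0.2, gui=True)
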
