
Commit

init
ytgui committed Jul 12, 2017
1 parent c2caa3d commit 7b7b31f
Showing 6 changed files with 206 additions and 0 deletions.
81 changes: 81 additions & 0 deletions agent.py
@@ -0,0 +1,81 @@
import time
import numpy as np


class AgentBase:
    def __init__(self):
        # sub modules, set by the concrete agent
        self._env = None
        self._replay = None
        # epsilon greedy
        self._epsilon = 0.2
        self._epsilon_min = 0.2
        self._epsilon_max = 0.8
        # global step
        self._global_step = 0

    def is_not_used(self):
        pass

    def _epsilon_greedy(self):
        # here epsilon is the probability of acting greedily: it grows with
        # the global step via tanh and is clamped to [epsilon_min, epsilon_max]
        self._epsilon = np.tanh(0.02 * self._global_step)
        self._epsilon = np.maximum(self._epsilon, self._epsilon_min)
        self._epsilon = np.minimum(self._epsilon, self._epsilon_max)
        return self._epsilon

    def _train_impl(self, max_step):
        # prepare for epsilon greedy
        self._global_step += 1
        # train
        state = self._env.reset()
        for step in range(max_step):
            # 1. predict
            action = self._explore(state)
            # 2. action
            state_, reward, done, info = self._env.step(action)
            # 3. perceive
            self._perceive(state, action, state_, reward, done)
            # 4. update
            state = state_
            if done:
                break

    def _test_impl(self, max_step, delay, gui):
        # test
        total_reward = 0
        state = self._env.reset()
        for step in range(max_step):
            if gui:
                self._env.render()
                time.sleep(delay)
            # 1. predict
            action = self._optimal_action(state)
            # 2. action
            state_, reward, done, info = self._env.step(action)
            # 3. perceive
            total_reward += reward
            # 4. update
            state = state_
            if done:
                break
        return total_reward

    def _explore(self, state):
        # act randomly with probability (1 - epsilon), greedily otherwise
        epsilon = self._epsilon_greedy()
        if np.random.uniform() > epsilon:
            action = self._random_action(state)
        else:
            action = self._optimal_action(state)
        return action

    def _random_action(self, state):
        self.is_not_used()
        raise RuntimeError('function must be overridden.')

    def _optimal_action(self, state):
        self.is_not_used()
        raise RuntimeError('function must be overridden.')

    def _perceive(self, state, action, state_, reward, done):
        self.is_not_used()
        raise RuntimeError('function must be overridden.')
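
Note: a concrete agent is expected to override the three hooks that raise RuntimeError above. A minimal hypothetical sketch (not part of this commit), assuming the CartPole environment and gym's action_space.sample():

import agent
import enviroment


class RandomAgent(agent.AgentBase):
    def __init__(self):
        super().__init__()
        self._env = enviroment.make('CartPole-v0')

    def _random_action(self, state):
        return self._env.action_space.sample()

    def _optimal_action(self, state):
        # no learned policy in this toy example, so fall back to random
        return self._env.action_space.sample()

    def _perceive(self, state, action, state_, reward, done):
        pass  # nothing to store here
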
73 changes: 73 additions & 0 deletions dqn.py
@@ -0,0 +1,73 @@
import time
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import agent
import enviroment
import history
import layers


class DQN(agent.AgentBase):
    # Modeling
    ENV_ID = 'CartPole-v0'
    STATE_SPACE = 4
    HIDDEN_NEURONS = 20
    ACTION_SPACE = 2
    # Training
    LR = 1e-3  # learning rate

    def __init__(self):
        super().__init__()
        # sub modules
        self._env = enviroment.make(self.ENV_ID)
        self._replay = history.ReplayBuffer()
        # build network
        self._net = self._build_network()

    def _build_network(self):
        # neural network
        with tf.name_scope('input'):
            state = tf.placeholder(dtype=tf.float32, shape=[None, self.STATE_SPACE])
        with tf.name_scope('hidden'):
            y1 = layers.fc(state, n_neurons=self.HIDDEN_NEURONS, activation=tf.nn.tanh)
        with tf.name_scope('output'):
            q_predict = layers.fc(y1, n_neurons=self.ACTION_SPACE)
        # process q value: mask out the Q-value of the action actually taken
        with tf.name_scope('q_value'):
            action_mask = tf.placeholder(tf.float32, [None, self.ACTION_SPACE])
            q_current = tf.reduce_sum(tf.multiply(q_predict, action_mask), axis=1)
        # loss
        with tf.name_scope('loss'):
            q_target = tf.placeholder(tf.float32, [None])
            loss = tf.reduce_mean(tf.squared_difference(q_current, q_target))
            tf.summary.scalar('loss', loss)
        # NOTE: no optimizer / train op is created yet in this commit
        return {'state': state,
                'action_mask': action_mask,
                'q_current': q_current,
                'q_target': q_target}

    def train(self, episodes=500, max_step=200):
        # alternate training with a periodic rendered evaluation
        for episode in tqdm(range(episodes)):
            if episode % 50 == 0:
                total_reward = self._test_impl(max_step, delay=0.2, gui=True)
                tqdm.write('current reward: {total_reward}'.format(total_reward=total_reward))
            else:
                self._train_impl(max_step)

    def test(self, episodes=1, max_step=200, delay=0.2, gui=True):
        for episode in range(episodes):
            total_reward = self._test_impl(max_step, delay, gui)
            print('current reward: {total_reward}'.format(total_reward=total_reward))

    def _random_action(self, state):
        pass  # placeholder, not implemented yet

    def _optimal_action(self, state):
        pass  # placeholder, not implemented yet

    def _perceive(self, state, action, state_, reward, done):
        pass  # placeholder, not implemented yet
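
Note: the dict returned by _build_network only exposes the placeholders and the masked Q-value; no session or optimizer exists in this commit. A hedged sketch of how a batch update could feed these tensors, assuming a hypothetical tf.Session `sess` and a `train_op` built later from the loss (e.g. tf.train.AdamOptimizer(DQN.LR).minimize(loss)):

import numpy as np


def train_step(sess, net, train_op, states, actions, targets, action_space=2):
    # one-hot mask that selects the Q-value of the action actually taken
    mask = np.eye(action_space)[np.asarray(actions)]
    feed = {net['state']: states,
            net['action_mask']: mask,
            net['q_target']: targets}
    _, q_taken = sess.run([train_op, net['q_current']], feed_dict=feed)
    return q_taken
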
5 changes: 5 additions & 0 deletions enviroment.py
@@ -0,0 +1,5 @@
import gym


def make(name):
    return gym.make(name)
20 changes: 20 additions & 0 deletions history.py
@@ -0,0 +1,20 @@
import random


class ReplayBuffer:
    def __init__(self, buffer_size=512):
        self.buffer_size = buffer_size
        self.buffer = list()

    def insert(self, *args):
        # drop the oldest transition once the buffer is full
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
        self.buffer.append(args)

    def get_batch(self, batch_size=64):
        # returns the batch transposed: one tuple per transition field
        batch = random.sample(self.buffer, batch_size)
        return zip(*batch)

    def reset(self):
        self.buffer.clear()
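
Note: a hypothetical usage sketch (not part of this commit). insert() stores one transition per call and get_batch() returns the sampled batch transposed into per-field tuples; random.sample raises ValueError if the buffer holds fewer than batch_size transitions, so the caller must fill it first.

import history

replay = history.ReplayBuffer(buffer_size=512)
for _ in range(128):
    replay.insert([0.0, 0.0, 0.0, 0.0], 0, [0.1, 0.0, 0.0, 0.0], 1.0, False)

states, actions, states_, rewards, dones = replay.get_batch(batch_size=64)
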
22 changes: 22 additions & 0 deletions layers.py
@@ -0,0 +1,22 @@
import tensorflow as tf


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)


def fc(x, n_neurons, activation=None):
    # the input width must be static to build the weight variable;
    # tf.shape(x) is a runtime tensor, so read the static shape instead
    n_inputs = int(x.get_shape()[1])
    W = weight_variable(shape=[n_inputs, n_neurons])
    b = bias_variable(shape=[n_neurons])
    if activation is None:
        y = tf.matmul(x, W) + b
    else:
        y = activation(tf.matmul(x, W) + b)
    return y
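
Note: a hypothetical usage sketch (not part of this commit) mirroring the two-layer head built in dqn.py; fc reads the static input width, so the placeholder's second dimension must be known.

import tensorflow as tf
import layers

state = tf.placeholder(tf.float32, shape=[None, 4])
hidden = layers.fc(state, n_neurons=20, activation=tf.nn.tanh)  # tanh hidden layer
q_values = layers.fc(hidden, n_neurons=2)                       # linear output layer
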
5 changes: 5 additions & 0 deletions main.py
@@ -0,0 +1,5 @@
import dqn


if __name__ == '__main__':
    agent = dqn.DQN()
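
Note: main.py currently only constructs the agent. A hypothetical sketch of driving the existing train()/test() entry points (learning itself is still stubbed out in this commit):

import dqn

if __name__ == '__main__':
    agent = dqn.DQN()
    agent.train(episodes=500, max_step=200)
    agent.test(episodes=1, max_step=200, delay=0.2, gui=True)
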
