Fixed based on comments
ASzot committed Oct 19, 2018
1 parent 93d7d46 commit 8705bd0
Showing 8 changed files with 300 additions and 213 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -39,7 +39,7 @@ listed above are important.
Train the actor critic model using the following. Training this is a necessary
baseline, and it is needed in order to train the environment model.
```
python a2c.py
python main.py a2c
```

Train the environment model using the following. Remember the a2c model must be
@@ -51,9 +51,10 @@ python env_model.py
Train the imagination augmented agent using the following. Remember the
environment model must already be trained.
```
python i2a.py
python main.py i2a
```


Evaluate any agent (in the terminal). Change the model checkpoint to whichever
actor you would like to evaluate (either from i2a or a2c).
```
@@ -65,3 +66,12 @@ Notebook.

To see the imagined states from the environment model visually, run the
`eval_env_model.ipynb` Jupyter Notebook.


The hyperparameters of this model all work well for MiniPacman. The
hyperparameters for the actor critic training are at the top of `a2c.py`, those
for the imagination part at the top of `i2a.py`, those for the environment model
at the top of `env_model.py`, and those for the overall training at the top of
`main.py`. `N_ENV` is the number of environments which are concurrently being
simulated and trained on; you can change this number based on the speed of your
system. `NUM_ROLLOUTS` is an interesting hyperparameter to play with, as it
corresponds to the number of imagined states into the future.
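
For example, one minimal way to scale the environment count to your machine (an
illustrative sketch only; the constant is spelled `N_ENVS` at the top of
`env_model.py`, and the exact name and default in `main.py` may differ):
```
import multiprocessing

# Illustrative: one environment per CPU core, capped at the default of 16.
N_ENVS = min(16, multiprocessing.cpu_count())
```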
171 changes: 14 additions & 157 deletions a2c.py
@@ -1,21 +1,14 @@
# Inspired from OpenAI Baselines. This uses the same design of having an easily
# substitutable generic policy that can be trained. This allows to easily
# substitute in the I2A policy as opposed to the basic CNN one.
import os
import numpy as np
import tensorflow as tf
from common.minipacman import MiniPacman
from common.multiprocessing_env import SubprocVecEnv
from tqdm import tqdm

def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma*r*(1.-done) # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]
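
# Illustrative example (not part of the original file): with gamma=0.99,
# rewards [1, 1, 1] and dones [0, 0, 1] the episode terminates at the last
# step, so nothing is bootstrapped past it:
#   discount_with_dones([1, 1, 1], [0, 0, 1], 0.99) -> [2.9701, 1.99, 1.0]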

# TUNABLE HYPERPARAMETERS FOR A2C TRAINING
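# VF_COEFF weights the value-function loss and ENTROPY_COEFF the entropy bonus
# in the combined loss below; MAX_GRAD_NORM is the global gradient-clipping
# threshold; LR, ALPHA and EPSILON are the RMSProp learning rate, decay and
# epsilon.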
VF_COEFF=0.5
ENTROPY_COEFF=0.01
MAX_GRAD_NORM=0.5
LR=7e-4
EPSILON=1e-5
ALPHA=0.99

def cat_entropy(logits):
    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
@@ -90,9 +83,7 @@ def get_inputs(self):

# generic graph for a2c.
class ActorCritic(object):
    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef, vf_coef, max_grad_norm, lr, alpha, epsilon, should_summary):

    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, should_summary):
        self.sess = sess

        nact = ac_space.n
@@ -114,17 +105,17 @@ def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
        # Value function loss
        self.vf_loss = tf.reduce_mean(tf.square(tf.squeeze(self.train_model.vf) - self.rewards) / 2.0)
        self.entropy = tf.reduce_mean(cat_entropy(self.train_model.pi))
        self.loss = self.pg_loss - (self.entropy * ent_coef) + (self.vf_loss * vf_coef)
        self.loss = self.pg_loss - (self.entropy * ENTROPY_COEFF) + (self.vf_loss * VF_COEFF)

        with tf.variable_scope('model'):
            params = tf.trainable_variables()

        grads = tf.gradients(self.loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        if MAX_GRAD_NORM is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
        grads = list(zip(grads, params))

        trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon)
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=ALPHA, epsilon=EPSILON)
        self.opt = trainer.apply_gradients(grads)

        # Tensorboard
@@ -200,142 +191,8 @@ def load(self, full_path):

def get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space,
        policy, should_summary=True):
    # TUNABLE HYPERPARAMETERS FOR A2C TRAINING
    vf_coef=0.5
    ent_coef=0.01
    max_grad_norm=0.5
    lr=7e-4
    epsilon=1e-5
    alpha=0.99
    actor_critic = ActorCritic(sess, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef, vf_coef, max_grad_norm, lr, alpha, epsilon,
            should_summary)

    return actor_critic


def train(policy, save_name, load_count = 0, summarize=True, load_path=None, log_path = './logs'):
    nenvs = 16
    nsteps=5
    total_timesteps=int(1e6)
    gamma=0.99
    log_interval=100
    save_interval = 1e5
    save_path = 'weights'

    def make_env():
        def _thunk():
            env = MiniPacman('regular', 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(nenvs)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space,
                ac_space, policy, summarize)
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (nenvs*nsteps, nw, nh, nc)

        dones = [False for _ in range(nenvs)]
        nbatch = nenvs * nsteps

        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        for update in tqdm(range(load_count + 1, total_timesteps + 1)):
            # mb stands for mini batch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
            for n in range(nsteps):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            #batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]
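            # mb_dones holds nsteps+1 flags per environment: mb_masks are the done
            # flags from *before* each step, while the shifted mb_dones are the
            # flags from *after* each step and drive the bootstrapping below.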

            last_values = actor_critic.critique(obs).tolist()

            #discount/bootstrap off value fn
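            # If the last collected step is not terminal, append the critic's value
            # estimate for the final observation so the returns are bootstrapped
            # past the rollout, then drop that extra entry; otherwise just discount
            # the collected rewards.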
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards+[value], d+[0], gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update,
                        summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % log_interval == 0 or update == 1:
                print('%i): %.4f, %.4f, %.4f' % (update, policy_loss, value_loss, policy_entropy))
                print(final_rewards.mean())

            if update % save_interval == 0:
                print('Saving model')
                actor_critic.save(save_path, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(save_path, save_name + '_done.ckpt')


if __name__ == '__main__':
    os.environ["CUDA_VISIBLE_DEVICES"]="0"

    load_count = 0
    load_path = 'weights/a2c_%i.ckpt' % load_count
    load_path = None

    train(CnnPolicy, 'a2c', load_count=load_count, load_path=load_path,
            log_path='./a2c_logs')
    actor_critic = ActorCritic(sess, policy, ob_space, ac_space, nenvs, nsteps, should_summary)

    return actor_critic

Empty file added common/__init__.py
Empty file.
42 changes: 25 additions & 17 deletions env_model.py
@@ -12,6 +12,20 @@

from common.pacman_util import num_pixels, mode_rewards, pix_to_target, rewards_to_target

# How many iterations we are training the environment model for.
NUM_UPDATES = 5000

LOG_INTERVAL = 100

N_ENVS = 16
N_STEPS = 5

# This can be anything from "regular" "avoid" "hunt" "ambush" "rush" each
# resulting in a different reward function giving the agent different behavior.
REWARD_MODE = 'regular'

# Replace this with the location of your own weights.
A2C_WEIGHTS = 'weights/a2c_200000.ckpt'
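# Note: this checkpoint is produced by the a2c training (a2c.py saves
# checkpoints like 'weights/a2c_<update>.ckpt'), so train the a2c model first
# and point this constant at one of its saved checkpoints.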

def pool_inject(X, batch_size, depth, width, height):
    m = tf.layers.max_pooling2d(X, pool_size=(width, height), strides=(width, height))
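    # The global max-pool above reduces each feature map to a single value per
    # channel. The rest of this function is collapsed in this view; in a
    # pool-and-inject layer it would tile that summary back to the frame size and
    # concatenate it with X, but see the full file for the actual implementation.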
@@ -116,7 +130,7 @@ def create_env_model(obs_shape, num_actions, num_pixels, num_rewards,

def make_env():
    def _thunk():
        env = MiniPacman('regular', 1000)
        env = MiniPacman(REWARD_MODE, 1000)
        return env

    return _thunk
@@ -151,30 +165,24 @@ def __init__(self, imag_state, imag_reward, input_states, input_actions,


if __name__ == '__main__':
    nenvs = 16
    nsteps = 5

    envs = [make_env() for i in range(nenvs)]
    envs = [make_env() for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    ac_space = envs.action_space
    num_actions = envs.action_space.n

    os.environ["CUDA_VISIBLE_DEVICES"]="1"
    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space, CnnPolicy, should_summary=False)
        actor_critic.load('weights/a2c_200000.ckpt')
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space, CnnPolicy, should_summary=False)
        actor_critic.load(A2C_WEIGHTS)

        with tf.variable_scope('env_model'):
            env_model = create_env_model(ob_space, num_actions, num_pixels, len(mode_rewards['regular']))
            env_model = create_env_model(ob_space, num_actions, num_pixels,
                    len(mode_rewards[REWARD_MODE]))

        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())

        num_updates = 5000
        log_interval = 100

        losses = []
        all_rewards = []

@@ -187,12 +195,12 @@ def __init__(self, imag_state, imag_reward, input_states, input_actions,

        writer = tf.summary.FileWriter('./env_logs', graph=sess.graph)

        for frame_idx, states, actions, rewards, next_states, dones in tqdm(play_games(actor_critic, envs, num_updates), total=num_updates):
        for frame_idx, states, actions, rewards, next_states, dones in tqdm(play_games(actor_critic, envs, NUM_UPDATES), total=NUM_UPDATES):
            target_state = pix_to_target(next_states)
            target_reward = rewards_to_target('regular', rewards)
            target_reward = rewards_to_target(REWARD_MODE, rewards)

            onehot_actions = np.zeros((nenvs, num_actions, width, height))
            onehot_actions[range(nenvs), actions] = 1
            onehot_actions = np.zeros((N_ENVS, num_actions, width, height))
            onehot_actions[range(N_ENVS), actions] = 1
            # Change so actions are the 'depth of the image' as tf expects
            onehot_actions = onehot_actions.transpose(0, 2, 3, 1)
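            # Each environment's chosen action index is set to 1 across the whole
            # width x height plane, then the action dimension is moved last so it
            # plays the role of the image channels (NHWC ordering) that the
            # TensorFlow layers expect.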

@@ -210,7 +218,7 @@ def __init__(self, imag_state, imag_reward, input_states, input_actions,
                env_model.target_rewards: target_reward
            })

            if frame_idx % log_interval == 0:
            if frame_idx % LOG_INTERVAL == 0:
                print('%i) %.5f, %.5f, %.5f' % (frame_idx, l, reward_loss, image_loss))
                writer.add_summary(summary, frame_idx)
