diff --git a/README.md b/README.md
index c574cf8..ff698a8 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ listed above are important.
 Train the actor critic model using the following. Training this is a necessary
 baseline and needs to be used to train the environment model.
 ```
-python a2c.py
+python main.py a2c
 ```
 
 Train the environment model using the following. Remember the a2c model must be
@@ -51,9 +51,10 @@ python env_model.py
 Train the imagination augmented agent using the following. Remember the
 environment model must already be trained.
 ```
-python i2a.py
+python main.py i2a
 ```
 
+
 Evaluate any agent (in the terminal). Change the model checkpoint to whichever
 actor you would like to evaluate (either from i2a or a2c).
 ```
@@ -65,3 +66,13 @@ Notebook.
 
 To see the imagined states from the environment model visually run the
 `eval_env_model.ipynb` Jupyter Notebook.
+
+
+The default hyperparameters all work well for minipacman. The hyperparameters
+for actor critic training are at the top of `a2c.py`, those for the imagination
+augmented agent at the top of `i2a.py`, those for the environment model at the
+top of `env_model.py`, and those for the overall training loop at the top of
+`main.py`. `N_ENVS` is the number of environments that are simulated and
+trained on concurrently; you can change this number based on the speed of your
+system. `NUM_ROLLOUTS` is an interesting hyperparameter to play with, as it
+corresponds to the number of imagined future states.
diff --git a/a2c.py b/a2c.py
index 6e2b7bc..72180e6 100644
--- a/a2c.py
+++ b/a2c.py
@@ -1,21 +1,14 @@
-# Inspired from OpenAI Baselines. This uses the same design of having an easily
-# substitutable generic policy that can be trained. This allows to easily
-# substitute in the I2A policy as opposed to the basic CNN one.
 import os
 
 import numpy as np
 import tensorflow as tf
-from common.minipacman import MiniPacman
-from common.multiprocessing_env import SubprocVecEnv
-from tqdm import tqdm
-
-def discount_with_dones(rewards, dones, gamma):
-    discounted = []
-    r = 0
-    for reward, done in zip(rewards[::-1], dones[::-1]):
-        r = reward + gamma*r*(1.-done) # fixed off by one bug
-        discounted.append(r)
-    return discounted[::-1]
+# TUNABLE HYPERPARAMETERS FOR A2C TRAINING
+VF_COEFF=0.5
+ENTROPY_COEFF=0.01
+MAX_GRAD_NORM=0.5
+LR=7e-4
+EPSILON=1e-5
+ALPHA=0.99
 
 def cat_entropy(logits):
     a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
@@ -90,9 +83,7 @@ def get_inputs(self):
 
 # generic graph for a2c.
 class ActorCritic(object):
-    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
-            ent_coef, vf_coef, max_grad_norm, lr, alpha, epsilon, should_summary):
-
+    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, should_summary):
         self.sess = sess
         nact = ac_space.n
 
@@ -114,17 +105,17 @@ def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
         # Value function loss
         self.vf_loss = tf.reduce_mean(tf.square(tf.squeeze(self.train_model.vf) - self.rewards) / 2.0)
         self.entropy = tf.reduce_mean(cat_entropy(self.train_model.pi))
-        self.loss = self.pg_loss - (self.entropy * ent_coef) + (self.vf_loss * vf_coef)
+        self.loss = self.pg_loss - (self.entropy * ENTROPY_COEFF) + (self.vf_loss * VF_COEFF)
 
         with tf.variable_scope('model'):
             params = tf.trainable_variables()
         grads = tf.gradients(self.loss, params)
-        if max_grad_norm is not None:
-            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
+        if MAX_GRAD_NORM is not None:
+            grads, grad_norm = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
         grads = list(zip(grads, params))
 
-        trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon)
+        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=ALPHA, epsilon=EPSILON)
         self.opt = trainer.apply_gradients(grads)
 
         # Tensorboard
@@ -200,142 +191,8 @@ def load(self, full_path):
 
 def get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space, policy,
         should_summary=True):
-    # TUNABLE HYPERPARAMETERS FOR A2C TRAINING
-    vf_coef=0.5
-    ent_coef=0.01
-    max_grad_norm=0.5
-    lr=7e-4
-    epsilon=1e-5
-    alpha=0.99
-    actor_critic = ActorCritic(sess, policy, ob_space, ac_space, nenvs, nsteps,
-            ent_coef, vf_coef, max_grad_norm, lr, alpha, epsilon,
-            should_summary)
-
-    return actor_critic
-
-
-def train(policy, save_name, load_count = 0, summarize=True, load_path=None, log_path = './logs'):
-    nenvs = 16
-    nsteps=5
-    total_timesteps=int(1e6)
-    gamma=0.99
-    log_interval=100
-    save_interval = 1e5
-    save_path = 'weights'
-
-    def make_env():
-        def _thunk():
-            env = MiniPacman('regular', 1000)
-            return env
-
-        return _thunk
-
-    envs = [make_env() for i in range(nenvs)]
-    envs = SubprocVecEnv(envs)
-
-    ob_space = envs.observation_space.shape
-    nw, nh, nc = ob_space
-    ac_space = envs.action_space
-
-    obs = envs.reset()
-
-    with tf.Session() as sess:
-        actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space,
-                ac_space, policy, summarize)
-        if load_path is not None:
-            actor_critic.load(load_path)
-            print('Loaded a2c')
-
-        summary_op = tf.summary.merge_all()
-        writer = tf.summary.FileWriter(log_path, graph=sess.graph)
-
-        sess.run(tf.global_variables_initializer())
-
-        batch_ob_shape = (nenvs*nsteps, nw, nh, nc)
-
-        dones = [False for _ in range(nenvs)]
-        nbatch = nenvs * nsteps
-
-        episode_rewards = np.zeros((nenvs, ))
-        final_rewards = np.zeros((nenvs, ))
-
-        for update in tqdm(range(load_count + 1, total_timesteps + 1)):
-            # mb stands for mini batch
-            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
-            for n in range(nsteps):
-                actions, values, _ = actor_critic.act(obs)
-                mb_obs.append(np.copy(obs))
-                mb_actions.append(actions)
-                mb_values.append(values)
-                mb_dones.append(dones)
-
-                obs, rewards, dones, _ = envs.step(actions)
-
-                episode_rewards += rewards
-                masks = 1 - np.array(dones)
-                final_rewards *= masks
-                final_rewards += (1 - masks) * episode_rewards
-                episode_rewards *= masks
-
-                mb_rewards.append(rewards)
-
-            mb_dones.append(dones)
-
-            #batch of steps to batch of rollouts
-            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1,
0).reshape(batch_ob_shape) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - - last_values = actor_critic.critique(obs).tolist() - - #discount/bootstrap off value fn - for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): - rewards = rewards.tolist() - d = d.tolist() - if d[-1] == 0: - rewards = discount_with_dones(rewards+[value], d+[0], gamma)[:-1] - else: - rewards = discount_with_dones(rewards, d, gamma) - mb_rewards[n] = rewards - - mb_rewards = mb_rewards.flatten() - mb_actions = mb_actions.flatten() - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - - if summarize: - loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs, - mb_rewards, mb_masks, mb_actions, mb_values, update, - summary_op) - writer.add_summary(summary, update) - else: - loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs, - mb_rewards, mb_masks, mb_actions, mb_values, update) - - if update % log_interval == 0 or update == 1: - print('%i): %.4f, %.4f, %.4f' % (update, policy_loss, value_loss, policy_entropy)) - print(final_rewards.mean()) - - if update % save_interval == 0: - print('Saving model') - actor_critic.save(save_path, save_name + '_' + str(update) + '.ckpt') - - actor_critic.save(save_path, save_name + '_done.ckpt') - - -if __name__ == '__main__': - os.environ["CUDA_VISIBLE_DEVICES"]="0" - - load_count = 0 - load_path = 'weights/a2c_%i.ckpt' % load_count - load_path = None - - train(CnnPolicy, 'a2c', load_count=load_count, load_path=load_path, - log_path='./a2c_logs') + actor_critic = ActorCritic(sess, policy, ob_space, ac_space, nenvs, nsteps, should_summary) + return actor_critic diff --git a/common/__init__.py b/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/env_model.py b/env_model.py index cf96e94..d0dc5a6 100644 --- a/env_model.py +++ b/env_model.py @@ -12,6 +12,20 @@ from common.pacman_util import num_pixels, mode_rewards, pix_to_target, rewards_to_target +# How many iterations we are training the environment model for. +NUM_UPDATES = 5000 + +LOG_INTERVAL = 100 + +N_ENVS = 16 +N_STEPS = 5 + +# This can be anything from "regular" "avoid" "hunt" "ambush" "rush" each +# resulting in a different reward function giving the agent different behavior. +REWARD_MODE = 'regular' + +# Replace this with the location of your own weights. 
+A2C_WEIGHTS = 'weights/a2c_200000.ckpt' def pool_inject(X, batch_size, depth, width, height): m = tf.layers.max_pooling2d(X, pool_size=(width, height), strides=(width, height)) @@ -116,7 +130,7 @@ def create_env_model(obs_shape, num_actions, num_pixels, num_rewards, def make_env(): def _thunk(): - env = MiniPacman('regular', 1000) + env = MiniPacman(REWARD_MODE, 1000) return env return _thunk @@ -151,30 +165,24 @@ def __init__(self, imag_state, imag_reward, input_states, input_actions, if __name__ == '__main__': - nenvs = 16 - nsteps = 5 - - envs = [make_env() for i in range(nenvs)] + envs = [make_env() for i in range(N_ENVS)] envs = SubprocVecEnv(envs) ob_space = envs.observation_space.shape ac_space = envs.action_space num_actions = envs.action_space.n - os.environ["CUDA_VISIBLE_DEVICES"]="1" with tf.Session() as sess: - actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space, CnnPolicy, should_summary=False) - actor_critic.load('weights/a2c_200000.ckpt') + actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space, CnnPolicy, should_summary=False) + actor_critic.load(A2C_WEIGHTS) with tf.variable_scope('env_model'): - env_model = create_env_model(ob_space, num_actions, num_pixels, len(mode_rewards['regular'])) + env_model = create_env_model(ob_space, num_actions, num_pixels, + len(mode_rewards[REWARD_MODE])) summary_op = tf.summary.merge_all() sess.run(tf.global_variables_initializer()) - num_updates = 5000 - log_interval = 100 - losses = [] all_rewards = [] @@ -187,12 +195,12 @@ def __init__(self, imag_state, imag_reward, input_states, input_actions, writer = tf.summary.FileWriter('./env_logs', graph=sess.graph) - for frame_idx, states, actions, rewards, next_states, dones in tqdm(play_games(actor_critic, envs, num_updates), total=num_updates): + for frame_idx, states, actions, rewards, next_states, dones in tqdm(play_games(actor_critic, envs, NUM_UPDATES), total=NUM_UPDATES): target_state = pix_to_target(next_states) - target_reward = rewards_to_target('regular', rewards) + target_reward = rewards_to_target(REWARD_MODE, rewards) - onehot_actions = np.zeros((nenvs, num_actions, width, height)) - onehot_actions[range(nenvs), actions] = 1 + onehot_actions = np.zeros((N_ENVS, num_actions, width, height)) + onehot_actions[range(N_ENVS), actions] = 1 # Change so actions are the 'depth of the image' as tf expects onehot_actions = onehot_actions.transpose(0, 2, 3, 1) @@ -210,7 +218,7 @@ def __init__(self, imag_state, imag_reward, input_states, input_actions, env_model.target_rewards: target_reward }) - if frame_idx % log_interval == 0: + if frame_idx % LOG_INTERVAL == 0: print('%i) %.5f, %.5f, %.5f' % (frame_idx, l, reward_loss, image_loss)) writer.add_summary(summary, frame_idx) diff --git a/eval_actor_vis.ipynb b/eval_actor_vis.ipynb index fe4e0aa..97c0e00 100644 --- a/eval_actor_vis.ipynb +++ b/eval_actor_vis.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize the A2C Agent\n", + "This notebook is for visualizing the A2C agent playing the pacman game and making sure that model is working. This is not for visualizing the imagination augmented agent. \n", + "\n", + "First start off by importing the necessary modules." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -21,6 +31,13 @@ "%autoreload 2" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next create our environment. 
We want don't want to have multiprocessing (hence `nenvs=1`) " + ] + }, { "cell_type": "code", "execution_count": 2, @@ -49,6 +66,13 @@ "states = env.reset()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Helper function to display an image to the Jupyter Notebook so we can see the game being played in our browser" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -69,6 +93,13 @@ " time.sleep(0.1)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load in the saved weights and see the game being played! Replace the weights I saved with whatever you want to use." + ] + }, { "cell_type": "code", "execution_count": 4, @@ -115,21 +146,12 @@ "\n", " print('total reward', total_reward)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [default]", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -143,7 +165,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.5" + "version": "3.6.0" } }, "nbformat": 4, diff --git a/eval_env_model.ipynb b/eval_env_model.ipynb index f7cbd43..9aff6c5 100644 --- a/eval_env_model.ipynb +++ b/eval_env_model.ipynb @@ -1,5 +1,16 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize the Environment Model\n", + "\n", + "Evaluate and visualize the performance of the environment model by seeing it visualize future states while a A2C agent plays the game.\n", + "\n", + "First start off with some imports." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -34,6 +45,13 @@ "%autoreload 2" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next create the environments we will use." + ] + }, { "cell_type": "code", "execution_count": 3, @@ -52,6 +70,13 @@ "num_actions = envs.action_space.n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, go ahead and test the environment model in minipacman. This will use the A2C agent to play the game and the environment model to predict future states and rewards. Note that you should replace the locations of my weights with the locations of your own saved weights. This will visualize the imagined and real rewards and game states from the environment model. " + ] + }, { "cell_type": "code", "execution_count": 4, @@ -133,9 +158,9 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [conda env:tf]", + "display_name": "Python 3", "language": "python", - "name": "conda-env-tf-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -147,7 +172,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.5" + "version": "3.6.0" } }, "nbformat": 4, diff --git a/i2a.py b/i2a.py index 2b4cca4..45c0964 100644 --- a/i2a.py +++ b/i2a.py @@ -12,6 +12,24 @@ from a2c import get_actor_critic, train, CnnPolicy from common.pacman_util import num_pixels, mode_rewards, pix_to_target, rewards_to_target, mode_rewards, target_to_pix + +# Hyperparameter of how far ahead in the future the agent "imagines" +# Currently this is specifying one frame in the future. +NUM_ROLLOUTS = 1 + +# Hidden size in RNN imagination encoder. 
+HIDDEN_SIZE = 256 + +N_STEPS = 5 + +# This can be anything from "regular" "avoid" "hunt" "ambush" "rush" each +# resulting in a different reward function giving the agent different behavior. +REWARD_MODE = 'regular' + +# Replace this with the name of the weights you want to load to train I2A +A2C_MODEL_PATH = 'weights/a2c_200000.ckpt' +ENV_MODEL_PATH = 'weights/env_model.ckpt' + # Softmax function for numpy taken from # https://nolanbconaway.github.io/blog/2017/softmax-numpy def softmax(X, theta = 1.0, axis = None): @@ -71,6 +89,9 @@ def convert_target_to_real(batch_size, nw, nh, nc, imagined_state, imagined_rewa return imagined_state, imagined_reward +""" +Used to generate rollouts of imagined states. +""" class ImaginationCore(object): def __init__(self, num_rollouts, num_actions, num_rewards, ob_space, actor_critic, env_model): @@ -113,7 +134,6 @@ def imagine(self, state, sess): self.env_model.input_actions: onehot_action, }) - # TODO: Add some code here to visualize the imaginations? imagined_state, imagined_reward = convert_target_to_real(rollout_batch_size, nw, nh, nc, imagined_state, imagined_reward) onehot_reward = np.zeros((rollout_batch_size, self.num_rewards)) @@ -135,7 +155,8 @@ def get_cache_loaded_a2c(sess, nenvs, nsteps, ob_space, ac_space): with tf.variable_scope('actor'): g_actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space, CnnPolicy, should_summary=False) - g_actor_critic.load('weights/a2c_200000.ckpt') + g_actor_critic.load(A2C_MODEL_PATH) + print('Actor restored!') return g_actor_critic @@ -147,11 +168,11 @@ def get_cache_loaded_env_model(sess, nenvs, ob_space, num_actions): if g_env_model is None: with tf.variable_scope('env_model'): g_env_model = create_env_model(ob_space, num_actions, num_pixels, - len(mode_rewards['regular']), should_summary=False) + len(mode_rewards[REWARD_MODE]), should_summary=False) save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='env_model') loader = tf.train.Saver(var_list=save_vars) - loader.restore(sess, 'weights/env_model.ckpt') + loader.restore(sess, ENV_MODEL_PATH) print('Env model restored!') @@ -160,20 +181,14 @@ def get_cache_loaded_env_model(sess, nenvs, ob_space, num_actions): class I2aPolicy(object): def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): - # Hyperparameter of how far ahead in the future the agent "imagines" - # Currently this is specifying one frame in the future. - num_rollouts = 1 - hidden_size = 256 - nsteps = 5 - - num_rewards = len(mode_rewards['regular']) + num_rewards = len(mode_rewards[REWARD_MODE]) num_actions = ac_space.n width, height, depth = ob_space - actor_critic = get_cache_loaded_a2c(sess, nbatch, nsteps, ob_space, ac_space) + actor_critic = get_cache_loaded_a2c(sess, nbatch, N_STEPS, ob_space, ac_space) env_model = get_cache_loaded_env_model(sess, nbatch, ob_space, num_actions) - self.imagination = ImaginationCore(num_rollouts, num_actions, num_rewards, + self.imagination = ImaginationCore(NUM_ROLLOUTS, num_actions, num_rewards, ob_space, actor_critic, env_model) with tf.variable_scope('model', reuse=reuse): @@ -185,7 +200,7 @@ def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): batch_size = tf.shape(self.imagined_state)[1] hidden_state = self.get_encoder(self.imagined_state, self.imagined_reward, - num_steps, batch_size, width, height, depth, hidden_size) + num_steps, batch_size, width, height, depth, HIDDEN_SIZE) # Model free path. 
self.state = tf.placeholder(tf.float32, [None, width, height,
@@ -276,12 +291,3 @@ def transform_input(self, X, sess):
         imagined_state, imagined_reward = self.imagination.imagine(X, sess)
 
         return [imagined_state, imagined_reward, X]
-
-
-if __name__ == '__main__':
-    os.environ["CUDA_VISIBLE_DEVICES"]="0"
-    # Train using typical a2c algorithm.
-    train(I2aPolicy, 'i2a', summarize=True, log_path='./i2a_logs')
-
-
-
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..17c8c26
--- /dev/null
+++ b/main.py
@@ -0,0 +1,163 @@
+# Inspired by OpenAI Baselines. This uses the same design of having an easily
+# substitutable generic policy that can be trained. This makes it easy to
+# substitute in the I2A policy as opposed to the basic CNN one.
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+from common.minipacman import MiniPacman
+from common.multiprocessing_env import SubprocVecEnv
+from tqdm import tqdm
+
+from a2c import get_actor_critic, CnnPolicy
+from i2a import I2aPolicy
+
+
+N_ENVS = 16
+N_STEPS=5
+
+# Total number of training updates you wish to run. Each update covers
+# N_ENVS * N_STEPS environment steps.
+TOTAL_TIMESTEPS=int(1e6)
+
+GAMMA=0.99
+
+LOG_INTERVAL=100
+SAVE_INTERVAL = 1e5
+
+# Where you want to save the weights
+SAVE_PATH = 'weights'
+
+# This can be any of "regular", "avoid", "hunt", "ambush", or "rush", each
+# resulting in a different reward function, giving the agent different behavior.
+REWARD_MODE = 'regular'
+
+def discount_with_dones(rewards, dones, gamma):
+    discounted = []
+    r = 0
+    for reward, done in zip(rewards[::-1], dones[::-1]):
+        r = reward + gamma*r*(1.-done)
+        discounted.append(r)
+    return discounted[::-1]
+
+
+def train(policy, save_name, load_count = 0, summarize=True, load_path=None, log_path = './logs'):
+    def make_env():
+        def _thunk():
+            env = MiniPacman(REWARD_MODE, 1000)
+            return env
+
+        return _thunk
+
+    envs = [make_env() for i in range(N_ENVS)]
+    envs = SubprocVecEnv(envs)
+
+    ob_space = envs.observation_space.shape
+    nw, nh, nc = ob_space
+    ac_space = envs.action_space
+
+    obs = envs.reset()
+
+    with tf.Session() as sess:
+        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
+                ac_space, policy, summarize)
+        if load_path is not None:
+            actor_critic.load(load_path)
+            print('Loaded a2c')
+
+        summary_op = tf.summary.merge_all()
+        writer = tf.summary.FileWriter(log_path, graph=sess.graph)
+
+        sess.run(tf.global_variables_initializer())
+
+        batch_ob_shape = (N_ENVS*N_STEPS, nw, nh, nc)
+
+        dones = [False for _ in range(N_ENVS)]
+        nbatch = N_ENVS * N_STEPS
+
+        episode_rewards = np.zeros((N_ENVS, ))
+        final_rewards = np.zeros((N_ENVS, ))
+
+        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
+            # mb stands for mini batch
+            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
+            for n in range(N_STEPS):
+                actions, values, _ = actor_critic.act(obs)
+
+                mb_obs.append(np.copy(obs))
+                mb_actions.append(actions)
+                mb_values.append(values)
+                mb_dones.append(dones)
+
+                obs, rewards, dones, _ = envs.step(actions)
+
+                episode_rewards += rewards
+                masks = 1 - np.array(dones)
+                final_rewards *= masks
+                final_rewards += (1 - masks) * episode_rewards
+                episode_rewards *= masks
+
+                mb_rewards.append(rewards)
+
+            mb_dones.append(dones)
+
+            #batch of steps to batch of rollouts
+            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
+            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
+            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
+            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
+            mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
+            mb_masks = mb_dones[:, :-1]
+            mb_dones = mb_dones[:, 1:]
+
+            last_values = actor_critic.critique(obs).tolist()
+
+            #discount/bootstrap off value fn
+            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
+                rewards = rewards.tolist()
+                d = d.tolist()
+                if d[-1] == 0:
+                    rewards = discount_with_dones(rewards+[value], d+[0], GAMMA)[:-1]
+                else:
+                    rewards = discount_with_dones(rewards, d, GAMMA)
+                mb_rewards[n] = rewards
+
+            mb_rewards = mb_rewards.flatten()
+            mb_actions = mb_actions.flatten()
+            mb_values = mb_values.flatten()
+            mb_masks = mb_masks.flatten()
+
+            if summarize:
+                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
+                        mb_rewards, mb_masks, mb_actions, mb_values, update,
+                        summary_op)
+                writer.add_summary(summary, update)
+            else:
+                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
+                        mb_rewards, mb_masks, mb_actions, mb_values, update)
+
+            if update % LOG_INTERVAL == 0 or update == 1:
+                print('%i): %.4f, %.4f, %.4f' % (update, policy_loss, value_loss, policy_entropy))
+                print(final_rewards.mean())
+
+            if update % SAVE_INTERVAL == 0:
+                print('Saving model')
+                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')
+
+        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('algo', help='Algorithm to train, either i2a or a2c')
+    args = parser.parse_args()
+
+    if args.algo == 'a2c':
+        policy = CnnPolicy
+    elif args.algo == 'i2a':
+        policy = I2aPolicy
+    else:
+        raise ValueError('Must specify the algo name as either a2c or i2a')
+
+    train(policy, args.algo, summarize=True, log_path=args.algo + '_logs')
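
The discount/bootstrap step in `train` is the only non-trivial calculation in the new `main.py`: `discount_with_dones` computes n-step discounted returns, zeroing the running return at episode boundaries, and the caller appends the critic's value estimate when the rollout did not end in a terminal state so the return is bootstrapped. Below is a minimal standalone sketch of that computation (not part of the diff); the reward and done values are illustrative only.

```python
# Minimal sketch of the n-step discounted-return helper used in main.py.
# Rewards are traversed in reverse; a done flag of 1 zeroes the running
# return so value does not leak across episode boundaries.
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]

if __name__ == '__main__':
    # Illustrative values only: a 3-step rollout with no episode end.
    print(discount_with_dones([0.0, 0.0, 1.0], [0, 0, 0], 0.99))
    # -> [0.9801, 0.99, 1.0] (up to float rounding)

    # With a done flag at step 2, the later reward is not propagated back.
    print(discount_with_dones([0.0, 1.0, 5.0], [0, 1, 0], 0.99))
    # -> [0.99, 1.0, 5.0]
```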