diff --git a/.gitignore b/.gitignore index 72364f99f..7ff358b24 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,15 @@ ENV/ # Rope project settings .ropeproject + +# HW1 files +hw1/archive/ +hw1/results/ +.vscode/settings.json +hw1/images/*.xcf + +# HW2 files +hw2/data/ + +# HW3 files +hw3/results/ diff --git a/hw1/bc.py b/hw1/bc.py new file mode 100644 index 000000000..09dd1646c --- /dev/null +++ b/hw1/bc.py @@ -0,0 +1,246 @@ +from __future__ import print_function +import os +import sys +import logging +import argparse +from tqdm import tqdm +import tensorflow as tf +import numpy as np +import gym +from gym import wrappers +import load_policy +import pickle +from sklearn.model_selection import train_test_split + +from data.bc_data import Data +from models.bc_model import Model + +def config_logging(log_file): + if os.path.exists(log_file): + os.remove(log_file) + + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(asctime)s - %(message)s') + + fh = logging.FileHandler(log_file) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + return logger + +def create_model(session, obs_samples, num_observations, num_actions, logger, optimizer, learning_rate, restore, checkpoint_dir): + model = Model(obs_samples, num_observations, num_actions, checkpoint_dir, logger, optimizer, learning_rate) + + if restore: + model.load(session) + else: + logger.info("Created model with fresh parameters") + session.run(tf.global_variables_initializer()) + + return model + +def gather_expert_experience(num_rollouts, env, policy_fn, max_steps): + with tf.Session(): + returns = [] + observations = [] + actions = [] + for _ in tqdm(range(num_rollouts)): + obs = env.reset() + done = False + totalr = 0. + steps = 0 + while not done: + action = policy_fn(obs[None,:]) + observations.append(obs) + actions.append(action) + obs, r, done, _ = env.step(action) + totalr += r + steps += 1 + if steps >= max_steps: + break + returns.append(totalr) + + expert_data = {'observations': np.stack(observations, axis=0), + 'actions': np.squeeze(np.stack(actions, axis=0)), + 'returns':np.array(returns)} + return expert_data + + +def bc(expert_data_file, expert_policy_file, env_name, restore, results_dir, + num_rollouts, max_timesteps=None, optimizer='adam', num_epochs=100, learning_rate=.001, batch_size=32, keep_prob=1): + tf.reset_default_graph() + + env = gym.make(env_name) + max_steps = max_timesteps or env.spec.timestep_limit + + # data = Data(expert_data_file, train_ratio=0.9, val_ratio=0.05) + + with open(expert_data_file, 'rb') as f: + data = pickle.loads(f.read()) + + obs = np.stack(data['observations'], axis=0) + actions = np.squeeze(np.stack(data['actions'], axis=0)) + + x_train, x_test, y_train, y_test = train_test_split(obs, actions, test_size=0.2) + + num_samples = len(x_train) + + min_val_loss = sys.maxsize + + with tf.Session() as session: + model = create_model(session, x_train, x_train.shape[1], y_train.shape[1], logger, optimizer, learning_rate, restore, results_dir) + + file_writer = tf.summary.FileWriter(results_dir, session.graph) + + for epoch in tqdm(range(num_epochs)): + perm = np.random.permutation(x_train.shape[0]) + + obs_samples = x_train[perm] + action_samples = y_train[perm] + + loss = 0. 
+ for k in range(0,obs_samples.shape[0], batch_size): + batch_loss, training_scalar = model.update(session, obs_samples[k:k+batch_size], + action_samples[k:k+batch_size], + keep_prob) + loss += batch_loss + + file_writer.add_summary(training_scalar, epoch) + + min_val_loss, validation_scalar = validate(model, logger, session, x_test, y_test, epoch, batch_size, min_val_loss, results_dir) + file_writer.add_summary(validation_scalar, epoch) + + new_exp = model.test_run(session, env, max_steps ) + tqdm.write("Epoch %3d Loss %f Reward %f" %(epoch, loss/num_samples, new_exp['reward'])) + + env = wrappers.Monitor(env, results_dir, force=True) + + results = [] + for _ in tqdm(range(10)): + results.append(model.test_run(session, env, max_steps )['reward']) + logger.info("Reward mean and std dev with behavior cloning: %f(%f)"%(np.mean(results), np.std(results))) + return data['mean_return'], data['std_return'], np.mean(results), np.std(results) + +def validate(model, logger, session, x_test, y_test, num_epoch, batch_size, min_loss, checkpoint_dir): + avg_loss = [] + + # for k in range(0, x_test.shape[0], batch_size): + loss, validation_scalar = model.validate(session, x_test, y_test) + avg_loss.append(loss) + + new_loss = sum(avg_loss) / len(avg_loss) + logger.info("Finished epoch %d, average validation loss = %f" % (num_epoch, new_loss)) + + if new_loss < min_loss: # Only save model if val loss dropped + model.save(session) + min_loss = new_loss + return min_loss, validation_scalar + +def dagger(expert_data_file, expert_policy_file, env_name, restore, results_dir, + num_rollouts, max_timesteps=None, optimizer='adam', num_epochs=40, learning_rate=.001, batch_size=32, keep_prob=1): + tf.reset_default_graph() + + env = gym.make(env_name) + max_steps = max_timesteps or env.spec.timestep_limit + + expert_policy_fn = load_policy.load_policy(expert_policy_file) + + # data = Data(expert_data_file, train_ratio=0.9, val_ratio=0.05) + + with open(expert_data_file, 'rb') as f: + data = pickle.loads(f.read()) + + obs = np.stack(data['observations'], axis=0) + actions = np.squeeze(np.stack(data['actions'], axis=0)) + + x_train, x_test, y_train, y_test = train_test_split(obs, actions, test_size=0.2) + + min_val_loss = sys.maxsize + + with tf.Session() as session: + model = create_model(session, x_train, x_train.shape[1], y_train.shape[1], logger, optimizer, learning_rate, restore, results_dir) + + file_writer = tf.summary.FileWriter(results_dir, session.graph) + + for epoch in tqdm(range(num_epochs)): + num_samples = x_train.shape[0] + perm = np.random.permutation(num_samples) + + obsv_samples = x_train[perm] + action_samples = y_train[perm] + + obsv_samples = np.stack(obsv_samples, axis=0) + action_samples = np.squeeze(np.stack(action_samples, axis=0)) + + + loss = 0. + for k in range(0,obsv_samples.shape[0], batch_size): + batch_loss, training_scalar = model.update(session, obsv_samples[k:k+batch_size], + action_samples[k:k+batch_size], + keep_prob) + loss += batch_loss + + file_writer.add_summary(training_scalar, epoch) + + min_val_loss, validation_scalar = validate(model, logger, session, x_test, y_test, epoch, batch_size, min_val_loss, results_dir) + file_writer.add_summary(validation_scalar, epoch) + + new_exp = model.test_run(session, env, max_steps) + + #Data Aggregation Steps. Supervision signal comes from expert policy. 
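As an aside on the aggregation step above: the idea is to roll out the current cloned policy, have the expert relabel the visited observations, and grow the supervised dataset with those pairs. A minimal NumPy sketch of that step, assuming hypothetical `run_policy` and `expert_policy_fn` callables (stand-ins for `model.test_run` and the loaded expert policy), not part of the patch:

```
import numpy as np

def aggregate_dagger_data(x_train, y_train, run_policy, expert_policy_fn, batch_size=32):
    """One DAgger iteration: roll out the learner, relabel with the expert, append."""
    new_obs = run_policy()  # (N, obs_dim) observations visited by the learner
    relabeled = [expert_policy_fn(new_obs[k:k + batch_size])  # expert actions, queried in batches
                 for k in range(0, new_obs.shape[0], batch_size)]
    x_train = np.concatenate([x_train, new_obs], axis=0)
    y_train = np.concatenate([y_train] + relabeled, axis=0)
    return x_train, y_train
```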
+ new_exp_len = new_exp['observations'].shape[0] + expert_expected_actions = [] + for k in range(0, new_exp_len, batch_size) : + expert_expected_actions.append(expert_policy_fn(new_exp['observations'][k:k+batch_size])) + + # add new experience into original one. (No eviction) + x_train = np.concatenate((x_train, new_exp['observations']), + axis=0) + y_train = np.concatenate([y_train] + expert_expected_actions, + axis=0) + tqdm.write("Epoch %3d Loss %f Reward %f" %(epoch, loss/num_samples, new_exp['reward'])) + + env = wrappers.Monitor(env, results_dir, force=True) + + results = [] + for _ in tqdm(range(10)): + results.append(model.test_run(session, env, max_steps )['reward']) + logger.info("Reward mean and std dev with DAgger: %f(%f)"%(np.mean(results), np.std(results))) + return data['mean_return'], data['std_return'], np.mean(results), np.std(results) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--restore", type=bool, default=False) + args = parser.parse_args() + + log_file = os.path.join(os.getcwd(), 'results', 'train_out.log') + logger = config_logging(log_file) + + env_models = [('Ant-v1', 'data/Ant-v1_data_250_rollouts.pkl', 'experts/Ant-v1.pkl', 250), + ('HalfCheetah-v1', 'data/HalfCheetah-v1_data_10_rollouts.pkl', 'experts/HalfCheetah-v1.pkl', 10), + ('Hopper-v1', 'data/Hopper-v1_data_10_rollouts.pkl', 'experts/Hopper-v1.pkl', 10), + ('Humanoid-v1', 'data/Humanoid-v1_data_250_rollouts.pkl', 'experts/Humanoid-v1.pkl', 250), + ('Reacher-v1', 'data/Reacher-v1_data_250_rollouts.pkl', 'experts/Reacher-v1.pkl', 250), + ('Walker2d-v1', 'data/Walker2d-v1_data_10_rollouts.pkl','experts/Walker2d-v1.pkl', 10) + ] + + results = [] + for env_name, rollout_data, expert_policy_file, num_rollouts in env_models : + bc_results_dir = os.path.join(os.getcwd(), 'results', env_name, 'bc') + if not os.path.exists(bc_results_dir): + os.makedirs(bc_results_dir) + ex_mean, ex_std, bc_mean,bc_std = bc(rollout_data, expert_policy_file, env_name, args.restore, bc_results_dir, num_rollouts) + + da_results_dir = os.path.join(os.getcwd(), 'results', env_name, 'da') + if not os.path.exists(da_results_dir): + os.makedirs(da_results_dir) + _,_, da_mean,da_std = dagger(rollout_data, expert_policy_file, env_name, args.restore, da_results_dir, num_rollouts) + results.append((env_name, ex_mean, ex_std, bc_mean, bc_std, da_mean, da_std)) + + for env_name, ex_mean, ex_std, bc_mean, bc_std, da_mean, da_std in results : + logger.info('Env: %s, Expert: %f(%f), Behavior Cloning: %f(%f), Dagger: %f(%f)'% + (env_name, ex_mean, ex_std, bc_mean, bc_std, da_mean, da_std)) + \ No newline at end of file diff --git a/hw1/data/.gitignore b/hw1/data/.gitignore new file mode 100644 index 000000000..f2a723b6f --- /dev/null +++ b/hw1/data/.gitignore @@ -0,0 +1 @@ +*.pkl \ No newline at end of file diff --git a/hw1/data/bc_data.py b/hw1/data/bc_data.py new file mode 100644 index 000000000..58767dcca --- /dev/null +++ b/hw1/data/bc_data.py @@ -0,0 +1,87 @@ +import pickle +import numpy as np +from sklearn.utils import shuffle + +# TODO: pass in logger + + +class Data(object): + def __init__(self, data_file, train_ratio, val_ratio): + data = pickle.load(open(data_file, "rb")) + + self.expert_mean_return=data['mean_return'] + self.expert_std_return=data['std_return'] + + self.train_ratio = train_ratio + self.val_ratio = val_ratio + + obs = np.stack(data['observations'], axis=0) + actions = np.squeeze(np.stack(data['actions'], axis=0)) + assert len(obs) == len(actions), "obs and action mismatch!" 
+ + obs, actions = shuffle(obs, actions, random_state=0) + + self.num_observations = obs.shape[1] + self.num_actions = actions.shape[1] + + print("Splitting dataset...") + self.train, self.val, self.test = self.split(obs, actions) + + self.print_stat(self.train, "Training") + self.print_stat(self.val, "Validation") + self.print_stat(self.test, "Test") + + obs_mean = np.mean(self.train["observations"], axis=0) + obs_std = np.std(self.train["observations"], axis=0) + + print("Normalizing observations...") + self.pre_proc(self.train, obs_mean, obs_std) + self.pre_proc(self.val, obs_mean, obs_std) + self.pre_proc(self.test, obs_mean, obs_std) + + def split(self, obs, actions): + """Split the dataset into train, val, and test""" + n_total = len(obs) + n_train, n_val = int(n_total * self.train_ratio), int(n_total * self.val_ratio) + + train_data = {"observations": obs[:n_train], "actions": actions[:n_train]} + val_data = {"observations": obs[n_train:n_train + n_val], "actions": actions[n_train:n_train + n_val]} + test_data = {"observations": obs[n_train + n_val:], "actions": actions[n_train + n_val:]} + + return train_data, val_data, test_data + + def get_small_dataset(self, num_data=100): + """Return a subset of the training data""" + obs, actions = self.train["observations"], self.train["actions"] + idx = np.random.choice(np.arange(len(obs)), size=num_data, replace=False) + small_data = {"observations": obs[idx], "actions": actions[idx]} + return small_data + + @staticmethod + def batch_iter(data, batch_size, num_epochs, shuffle=True): + """Batch generator for a dataset""" + num_data = len(data["observations"]) + num_batch_per_epoch = int((num_data-1) / batch_size) + 1 + + for epoch in range(num_epochs): + obs, actions = data["observations"], data["actions"] + if shuffle: + idx = np.random.permutation(np.arange(num_data)) + obs = obs[idx] + actions = actions[idx] + for i in range(num_batch_per_epoch): + start_idx = i * batch_size + end_idx = min((i + 1) * batch_size, num_data) + yield obs[start_idx:end_idx], actions[start_idx:end_idx] + + @staticmethod + def print_stat(data, title): + obs, actions = data["observations"], data["actions"] + print("%s Observations %s, mean: %s" % (title, str(obs.shape), str(np.mean(obs, axis=0)))) + print("%s Actions %s, mean: %s" % (title, str(actions.shape), str(np.mean(actions, axis=0)))) + + @staticmethod + def pre_proc(data, mean, std): + """Normalize observations""" + obs = data["observations"] + data["observations"] = (obs - mean) / (std + 1e-6) # See load_policy.py diff --git a/hw1/images/ant_expert_250.mp4 b/hw1/images/ant_expert_250.mp4 new file mode 100644 index 000000000..b557efd73 Binary files /dev/null and b/hw1/images/ant_expert_250.mp4 differ diff --git a/hw1/images/graph.png b/hw1/images/graph.png new file mode 100644 index 000000000..744e722b2 Binary files /dev/null and b/hw1/images/graph.png differ diff --git a/hw1/images/halfcheetah_expert_250.mp4 b/hw1/images/halfcheetah_expert_250.mp4 new file mode 100644 index 000000000..03f0c1244 Binary files /dev/null and b/hw1/images/halfcheetah_expert_250.mp4 differ diff --git a/hw1/images/halfcheetah_val_loss.png b/hw1/images/halfcheetah_val_loss.png new file mode 100644 index 000000000..bca95c414 Binary files /dev/null and b/hw1/images/halfcheetah_val_loss.png differ diff --git a/hw1/images/hopper_expert_250.mp4 b/hw1/images/hopper_expert_250.mp4 new file mode 100644 index 000000000..8cee77622 Binary files /dev/null and b/hw1/images/hopper_expert_250.mp4 differ diff --git 
a/hw1/images/humanoid_expert_250.mp4 b/hw1/images/humanoid_expert_250.mp4 new file mode 100644 index 000000000..fe1c35069 Binary files /dev/null and b/hw1/images/humanoid_expert_250.mp4 differ diff --git a/hw1/images/reacher_expert_250.mp4 b/hw1/images/reacher_expert_250.mp4 new file mode 100644 index 000000000..3a515bba1 Binary files /dev/null and b/hw1/images/reacher_expert_250.mp4 differ diff --git a/hw1/images/walk2d_frame.png b/hw1/images/walk2d_frame.png new file mode 100644 index 000000000..f3f4f6c77 Binary files /dev/null and b/hw1/images/walk2d_frame.png differ diff --git a/hw1/images/walker_bc.mp4 b/hw1/images/walker_bc.mp4 new file mode 100644 index 000000000..b9556a75c Binary files /dev/null and b/hw1/images/walker_bc.mp4 differ diff --git a/hw1/images/walker_da.mp4 b/hw1/images/walker_da.mp4 new file mode 100644 index 000000000..75138df05 Binary files /dev/null and b/hw1/images/walker_da.mp4 differ diff --git a/hw1/images/walker_expert_250.mp4 b/hw1/images/walker_expert_250.mp4 new file mode 100644 index 000000000..e4bd11662 Binary files /dev/null and b/hw1/images/walker_expert_250.mp4 differ diff --git a/hw1/images/walker_val_loss.png b/hw1/images/walker_val_loss.png new file mode 100644 index 000000000..010ab92ef Binary files /dev/null and b/hw1/images/walker_val_loss.png differ diff --git a/hw1/models/bc_model.py b/hw1/models/bc_model.py new file mode 100644 index 000000000..536a72e32 --- /dev/null +++ b/hw1/models/bc_model.py @@ -0,0 +1,156 @@ +import os +import numpy as np +import tensorflow as tf +import tensorflow.contrib.slim as slim +import itertools + +class Model: + def __init__(self, obs_samples, num_observations, num_actions, checkpoint_dir, logger, optimizer, learning_rate): + self.logger = logger + + self.obs_mean = obs_samples.mean(axis=0) + self.obs_std = obs_samples.std(axis=0) + + self.num_observations = num_observations + self.num_actions = num_actions + + self.obs = tf.placeholder(tf.float32, [None, num_observations]) + self.actions = tf.placeholder(tf.float32, [None, num_actions]) + self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') + + self.init_global_step() + self.pred, self.parameters = self.build_model() + + self.loss = self.get_loss() + self.training_scalar = tf.summary.scalar("training_loss", self.loss) + self.validation_scalar = tf.summary.scalar("validation_loss", self.loss) + self.optimizer = self.get_optimizer(optimizer, learning_rate) + + self.checkpoint_dir = checkpoint_dir + self.saver = tf.train.Saver(var_list=tf.global_variables()) + + def save(self, sess): + if not os.path.exists(self.checkpoint_dir): + os.makedirs(self.checkpoint_dir) + self.saver.save(sess, self.checkpoint_dir + 'model', global_step=self.global_step_tensor) + self.logger.info("Model saved") + + def load(self, session): + latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir) + if latest_checkpoint: + self.logger.info("Loading model checkpoint {} ...\n".format(latest_checkpoint)) + self.saver.restore(session, latest_checkpoint) + self.logger.info("Model loaded") + + def init_global_step(self): + with tf.variable_scope('global_step'): + self.global_step_tensor = tf.Variable(0, trainable=False, name='global_step') + + def variable_summaries(self, var): + with tf.name_scope('summaries'): + mean = tf.reduce_mean(var) + tf.summary.scalar('mean', mean) + with tf.name_scope('stddev'): + stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) + tf.summary.scalar('stddev', stddev) + tf.summary.scalar('max', tf.reduce_max(var)) + tf.summary.scalar('min', 
tf.reduce_min(var)) + tf.summary.histogram('histogram', var) + + def fc_layer(self, input_tensor, input_dim, output_dim, layer_name, parameters, activation='relu'): + with tf.name_scope(layer_name): + weights = tf.get_variable('weights_' + layer_name, [input_dim, output_dim], + initializer=tf.contrib.layers.xavier_initializer(), dtype=tf.float32) + self.variable_summaries(weights) + + biases = tf.get_variable('biases_' + layer_name, [output_dim], + initializer=tf.constant_initializer(0), dtype=tf.float32) + self.variable_summaries(biases) + + z_fc1 = tf.add(tf.matmul(input_tensor, weights), biases) + tf.summary.histogram('z_' + layer_name, z_fc1) + + if activation == 'relu': + a_fc1 = tf.nn.relu(z_fc1) + tf.summary.histogram('a_' + layer_name, a_fc1) + else: + a_fc1 = z_fc1 + + parameters += [weights, biases] + return a_fc1 + + def build_model(self): + parameters = [] + + normalized = (self.obs - self.obs_mean) / self.obs_std + + net = slim.fully_connected(normalized, 50, scope='fc1', activation_fn=tf.nn.relu) + net = slim.dropout(net, self.keep_prob) + net = slim.fully_connected(net, 50, scope='fc2', activation_fn=tf.nn.relu) + net = slim.dropout(net, self.keep_prob) + policy = slim.fully_connected(net, self.num_actions, activation_fn=None, scope='policy') + + + + # a_fc1 = self.fc_layer(normalized, self.num_observations, 128, 'fc1', parameters) + # a_fc2 = self.fc_layer(a_fc1, 128, 128, 'fc2', parameters) + # a_fc3 = self.fc_layer(a_fc2, 128, 128, 'fc3', parameters) + # z_fc4 = self.fc_layer(a_fc3, 128, self.num_actions, 'fc4', parameters, activation = None) + return policy, parameters + + def get_optimizer(self, optimizer, learning_rate): + self.logger.info("Using %s optimizer" % optimizer) + if optimizer == "adam": + return tf.train.AdamOptimizer(learning_rate).minimize(self.loss, + global_step=self.global_step_tensor) + elif optimizer == "adagrad": + return tf.train.AdagradOptimizer(learning_rate).minimize(self.loss, + global_step=self.global_step_tensor) + elif optimizer == "rmsprop": + return tf.train.RMSPropOptimizer(learning_rate).minimize(self.loss, + global_step=self.global_step_tensor) + else: + return tf.train.GradientDescentOptimizer(learning_rate).minimize(self.loss, + global_step=self.global_step_tensor) + + def get_loss(self): + loss = tf.reduce_mean(tf.reduce_sum((self.pred - self.actions)**2, axis=1)) + # loss = tf.reduce_mean(tf.pow(self.pred - self.actions, 2)) / 2 + return loss + + def validate(self, sess, batch_x, batch_y): + return sess.run([self.loss, self.validation_scalar], + feed_dict={self.obs:batch_x, + self.actions: batch_y, + self.keep_prob: 1}) + + def predict(self, sess, batch_x): + return sess.run(self.pred, + feed_dict={self.obs:batch_x, + self.keep_prob: 1}) + + def update(self, sess, batch_x, batch_y, keep_prob): + loss, training_scalar, _ = sess.run([self.loss, self.training_scalar, self.optimizer], + feed_dict={self.obs: batch_x, + self.actions: batch_y, + self.keep_prob: keep_prob}) + return loss, training_scalar + + def test_run(self, sess, env, max_steps): + obvs = [] + actions = [] + reward = 0. 
+ + obv = env.reset() + for steps in itertools.count() : + obvs.append(obv) + actions.append(self.predict(sess, np.expand_dims(obv,axis=0))[0]) + obv, r, done, _ = env.step(actions[-1]) + reward += r + if steps >= max_steps or done: + break + + experience = {'observations': np.stack(obvs,axis=0), + 'actions': np.squeeze(np.stack(actions,axis=0)), + 'reward':reward} + return experience \ No newline at end of file diff --git a/hw1/run_expert.py b/hw1/run_expert.py index bdf4988df..e2a9b778d 100755 --- a/hw1/run_expert.py +++ b/hw1/run_expert.py @@ -8,40 +8,43 @@ Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) """ - +import os import pickle import tensorflow as tf import numpy as np import tf_util import gym +from gym import wrappers import load_policy -def main(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('expert_policy_file', type=str) - parser.add_argument('envname', type=str) - parser.add_argument('--render', action='store_true') - parser.add_argument("--max_timesteps", type=int) - parser.add_argument('--num_rollouts', type=int, default=20, - help='Number of expert roll outs') - args = parser.parse_args() +def generate_all_rollout_data(): + generate_rollout_data('experts/Ant-v1.pkl', 'Ant-v1', 250, False, 'data') + generate_rollout_data('experts/HalfCheetah-v1.pkl', 'HalfCheetah-v1', 10, False, 'data') + generate_rollout_data('experts/Hopper-v1.pkl', 'Hopper-v1', 10, False, 'data') + generate_rollout_data('experts/Humanoid-v1.pkl', 'Humanoid-v1', 250, False, 'data') + generate_rollout_data('experts/Reacher-v1.pkl', 'Reacher-v1', 250, False, 'data') + generate_rollout_data('experts/Walker2d-v1.pkl', 'Walker2d-v1', 10, False, 'data') + +def generate_rollout_data(expert_policy_file, env_name, num_rollouts, render, output_dir=None, save=False, max_timesteps=None): print('loading and building expert policy') - policy_fn = load_policy.load_policy(args.expert_policy_file) + policy_fn = load_policy.load_policy(expert_policy_file) print('loaded and built') with tf.Session(): tf_util.initialize() - import gym - env = gym.make(args.envname) - max_steps = args.max_timesteps or env.spec.timestep_limit + env = gym.make(env_name) + max_steps = max_timesteps or env.spec.timestep_limit + if save: + expert_results_dir = os.path.join(os.getcwd(), 'results', env_name, 'expert') + env = wrappers.Monitor(env, expert_results_dir, force=True) + returns = [] observations = [] actions = [] - for i in range(args.num_rollouts): + for i in range(num_rollouts): print('iter', i) obs = env.reset() done = False @@ -54,7 +57,7 @@ def main(): obs, r, done, _ = env.step(action) totalr += r steps += 1 - if args.render: + if render: env.render() if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) if steps >= max_steps: @@ -66,7 +69,26 @@ def main(): print('std of return', np.std(returns)) expert_data = {'observations': np.array(observations), - 'actions': np.array(actions)} + 'actions': np.array(actions), + 'mean_return': np.mean(returns), + 'std_return': np.std(returns)} + + if output_dir is not 'None': + output_dir = os.path.join(os.getcwd(), output_dir) + filename = '{}_data_{}_rollouts.pkl'.format(env_name, num_rollouts) + with open(output_dir + '/' + filename,'wb') as f: + pickle.dump(expert_data, f) if __name__ == '__main__': - main() + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('expert_policy_file', type=str) + parser.add_argument('envname', type=str) + parser.add_argument("--max_timesteps", type=int) + 
parser.add_argument('--num_rollouts', type=int, default=20, + help='Number of expert roll outs') + parser.add_argument('--render', action='store_true') + parser.add_argument("--output_dir", type=str, default='data') + args = parser.parse_args() + + generate_rollout_data(args.expert_policy_file, args.envname, args.num_rollouts, args.render, args.max_timesteps) diff --git a/hw2/README.md b/hw2/README.md new file mode 100644 index 000000000..b7d01a2a0 --- /dev/null +++ b/hw2/README.md @@ -0,0 +1,81 @@ +## Testing +``` +python train_pg_test.py +``` + +# Discrete Policy Gradient Training on CartPole-v0 + +## train_pg.py flags +``` +-n : Number of iterations. + +-b : Batch size (number of state-action pairs sampled while acting according to the current policy at each iteration). + +-e : Number of experiments to run with the same configuration. Each experiment will start with a different randomly initialized policy, and have a different stream of random numbers. + +-dna : Flag: if present, sets normalize_advantages to False. Otherwise, by default, normalize_advantages=True. + +-rtg : Flag: if present, sets reward_to_go=True. Otherwise, by default, reward_to_go=False. + +--exp_name : Name for experiment, which goes into the name for the data directory. +``` + +## Sample Runs For Discrete CartPole-v0 +``` +python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -dna --exp_name sb_no_rtg_dna + +python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg -dna --exp_name sb_rtg_dna + +python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg --exp_name sb_rtg_na + +python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -dna --exp_name lb_no_rtg_dna + +python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg -dna --exp_name lb_rtg_dna + +python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg --exp_name lb_rtg_na +``` + +## Plot Small Batch (1000) Average Returns +``` +python plot.py data/sb_no_rtg_dna_CartPole-v0_26-06-2018_06-25-21 data/sb_rtg_dna_CartPole-v0_26-06-2018_06-40-11 data/sb_rtg_na_CartPole-v0_26-06-2018_07-39-12 +``` +![Small Batch Returns](charts/avg_return_cartpole_smbatch_1000.png) + +``` +sb: small batch +lg: large batch +rtg: reward to go +no_rtg: no "reward to go" +na: normalize advantages +dna: don't "normalize advantages" +``` + +## Plot Large Batch (5000) Average Returns +``` +python plot.py data/lb_no_rtg_dna_CartPole-v0_26-06-2018_07-51-18 data/lb_rtg_dna_CartPole-v0_26-06-2018_08-44-23 data/lb_rtg_na_CartPole-v0_26-06-2018_10-06-53 +``` +![Large Batch Returns](charts/avg_return_cartpole_lgbatch_5000.png) + +## Video of CartPole after 100 iterations +[MP4](videos/video_cartpole.mp4) + + +# InvertedPendulum-v0 + +## Sample Runs For InvertedPendulum-v0 +``` +python train_pg.py InvertedPendulum-v1 -n 100 -b 1000 -e 5 -rtg -lr 0.02 --exp_name sb_rtg_na_0.02 + +python train_pg.py InvertedPendulum-v1 -n 100 -b 1000 -e 5 -rtg -dna -lr 0.02 --exp_name sb_rtg_dna_0.02 + +python train_pg.py InvertedPendulum-v1 -n 100 -b 1000 -e 5 -rtg -lr 0.02 -bl --exp_name sb_rtg_na_0.02_bl + +``` +## Plot InvertedPendulum Sample Runs +``` +python plot.py data/sb_rtg_na_0.02_InvertedPendulum-v1_27-06-2018_10-45-20 data/sb_rtg_dna_0.02_InvertedPendulum-v1_29-06-2018_09-34-48 data/sb_rtg_na_0.02_bl_InvertedPendulum-v1_29-06-2018_09-47-27 +``` +![InvertedPendulum Returns](charts/avg_return_invertedpendulum.png) + +## Video of InvertedPendulum after 100 iterations +[MP4](videos/video_invertedpendulum.mp4) \ No newline at end of file diff --git a/hw2/charts/avg_return_cartpole_lgbatch_5000.png 
b/hw2/charts/avg_return_cartpole_lgbatch_5000.png new file mode 100644 index 000000000..2d7be2241 Binary files /dev/null and b/hw2/charts/avg_return_cartpole_lgbatch_5000.png differ diff --git a/hw2/charts/avg_return_cartpole_smbatch_1000.png b/hw2/charts/avg_return_cartpole_smbatch_1000.png new file mode 100644 index 000000000..827c86362 Binary files /dev/null and b/hw2/charts/avg_return_cartpole_smbatch_1000.png differ diff --git a/hw2/charts/avg_return_invertedpendulum.png b/hw2/charts/avg_return_invertedpendulum.png new file mode 100644 index 000000000..3cce661fc Binary files /dev/null and b/hw2/charts/avg_return_invertedpendulum.png differ diff --git a/hw2/train_pg.py b/hw2/train_pg.py index d5af4368e..6fb40f113 100644 --- a/hw2/train_pg.py +++ b/hw2/train_pg.py @@ -7,6 +7,7 @@ import time import inspect from multiprocessing import Process +from gym import wrappers #============================================================================================# # Utilities @@ -34,12 +35,31 @@ def build_mlp( #========================================================================================# with tf.variable_scope(scope): - # YOUR_CODE_HERE - pass + dense = input_placeholder + for _ in range(n_layers): + dense = tf.layers.dense(inputs=dense, units=size, activation=activation) + + return tf.layers.dense(inputs=dense, units=output_size, activation=output_activation) + def pathlength(path): return len(path["reward"]) +def discounted_rewards_to_go(rewards, gamma): + """ state/action-centric policy gradients; reward-to-go=True. + """ + rtgs = [] + future_reward = 0 + # start at time step t and use future_reward to calculate current reward + for r in reversed(rewards): + future_reward = future_reward * gamma + r + rtgs.append(future_reward) + return rtgs[::-1] + +def sum_discounted_rewards(rewards, gamma): + """ trajectory-centric policy gradients; reward-to-go=False + """ + return sum((gamma**i) * rewards[i] for i in range(len(rewards))) #============================================================================================# @@ -81,6 +101,8 @@ def train_PG(exp_name='', # Make the gym environment env = gym.make(env_name) + + env = wrappers.Monitor(env, logdir, force=True) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) @@ -123,7 +145,7 @@ def train_PG(exp_name='', sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages - sy_adv_n = TODO + sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# @@ -166,26 +188,30 @@ def train_PG(exp_name='', #========================================================================================# if discrete: - # YOUR_CODE_HERE - sy_logits_na = TODO - sy_sampled_ac = TODO # Hint: Use the tf.multinomial op - sy_logprob_n = TODO - + sy_logits_na = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim, + scope="discrete_policy_network", n_layers=n_layers, size=size, + activation=tf.nn.relu) + sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=[1]) + sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) else: - # YOUR_CODE_HERE - sy_mean = TODO - sy_logstd = TODO # logstd should just be a trainable variable, not a network output. - sy_sampled_ac = TODO - sy_logprob_n = TODO # Hint: Use the log probability under a multivariate gaussian. 
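On the multivariate-Gaussian hint just above: for a diagonal Gaussian policy the log-probability has a closed form, which is what `MultivariateNormalDiag.log_prob` computes in the implementation that follows. A small NumPy sketch of that formula, for illustration only:

```
import numpy as np

def diag_gaussian_log_prob(actions, mean, log_std):
    """log N(actions | mean, diag(exp(log_std)**2)), computed per row."""
    std = np.exp(log_std)
    d = np.shape(actions)[-1]
    return (-0.5 * np.sum(((actions - mean) / std) ** 2, axis=-1)
            - np.sum(log_std)
            - 0.5 * d * np.log(2.0 * np.pi))
```

Plugging in the continuous-policy test values from `train_pg_test.py` (mean `[1., -1]`, std `[1, 2.]`, action `[-1., 0]`) gives about -4.656, consistent with the 4.6560245 asserted there for the negative log-probability.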
- - + sy_mean = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim, + scope="continuous_policy_network", n_layers=n_layers, size=size, + activation=tf.nn.relu) + # logstd should just be a trainable variable, not a network output. + sy_logstd = tf.get_variable("logstd", shape=[ac_dim], dtype=tf.float32) + sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean), mean=sy_mean, stddev=tf.exp(sy_logstd)) + dist = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, + scale_diag=tf.exp(sy_logstd)) + sy_logprob_n = -dist.log_prob(sy_ac_na) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# - loss = TODO # Loss function that we'll differentiate to get the policy gradient. + # multiply the log probability by the advantages + # Loss function that we'll differentiate to get the policy gradient. + loss = tf.reduce_mean(tf.multiply(sy_logprob_n, sy_adv_n)) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) @@ -203,8 +229,9 @@ def train_PG(exp_name='', size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. - # YOUR_CODE_HERE - baseline_update_op = TODO + baseline_targets = tf.placeholder(shape=[None], name="baseline_targets", dtype=tf.float32) + baseline_loss = tf.nn.l2_loss(baseline_prediction - baseline_targets) + baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss) #========================================================================================# @@ -316,8 +343,14 @@ def train_PG(exp_name='', # #====================================================================================# - # YOUR_CODE_HERE - q_n = TODO + q_n = [] + if reward_to_go: + q_n = np.concatenate([discounted_rewards_to_go(path["reward"], gamma) for path in paths]) + else: + q_n = np.concatenate([ + [sum_discounted_rewards(path["reward"], gamma)] * pathlength(path) + for path in paths]) + #====================================================================================# # ----------SECTION 5---------- @@ -332,8 +365,8 @@ def train_PG(exp_name='', # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) - - b_n = TODO + b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) + b_n = b_n * np.std(q_n, axis=0) + np.mean(q_n, axis=0) adv_n = q_n - b_n else: adv_n = q_n.copy() @@ -344,11 +377,7 @@ def train_PG(exp_name='', #====================================================================================# if normalize_advantages: - # On the next line, implement a trick which is known empirically to reduce variance - # in policy gradient methods: normalize adv_n to have mean zero and std=1. - # YOUR_CODE_HERE - pass - + adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) #====================================================================================# # ----------SECTION 5---------- @@ -364,9 +393,10 @@ def train_PG(exp_name='', # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) 
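To make Hints #bl1/#bl2 concrete: the baseline network is fit to standardized Q-targets (mean zero, std one), so at prediction time its raw output is in standardized units and has to be rescaled with the statistics of the current batch of Q-values before it can be subtracted as a baseline. A small NumPy sketch of that round trip, with the baseline network's raw output passed in as an array (illustrative, not part of the patch):

```
import numpy as np

def advantages_with_baseline(q_n, raw_baseline, eps=1e-8):
    """raw_baseline: baseline-net output trained on standardized targets (Hint #bl2).
    Rescale it to Q-value units (Hint #bl1), subtract, then normalize the advantages."""
    b_n = raw_baseline * np.std(q_n) + np.mean(q_n)  # back to Q-value scale
    adv_n = q_n - b_n
    return (adv_n - np.mean(adv_n)) / (np.std(adv_n) + eps)

def standardized_targets(q_n, eps=1e-8):
    """Targets used to refit the baseline network on this batch."""
    return (q_n - np.mean(q_n)) / (np.std(q_n) + eps)
```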
- - # YOUR_CODE_HERE - pass + q_n_mean = np.mean(q_n, axis=0) + q_n_std = np.std(q_n, axis=0) + q_n = (q_n - q_n_mean) / (q_n_std + 1e-7) + sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, baseline_targets: q_n}) #====================================================================================# # ----------SECTION 4---------- @@ -379,8 +409,8 @@ def train_PG(exp_name='', # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. - # YOUR_CODE_HERE - + _ = sess.run([update_op], feed_dict={sy_ob_no: ob_no, + sy_ac_na: ac_na,sy_adv_n: adv_n}) # Log diagnostics returns = [path["reward"].sum() for path in paths] diff --git a/hw2/train_pg_test.py b/hw2/train_pg_test.py new file mode 100644 index 000000000..bb130b07b --- /dev/null +++ b/hw2/train_pg_test.py @@ -0,0 +1,90 @@ +import tensorflow as tf +import train_pg +from tensorflow.python.platform import gfile + +class TrainPGTest(tf.test.TestCase): + def test_build_mlp(self): + with self.test_session() as session: + + input_features = 64 + input_placeholder = tf.placeholder(shape=[64, 128], name="ob", dtype=tf.float32) + output_size = 10 + mlp_output_layer = train_pg.build_mlp(input_placeholder, + output_size=output_size, + scope='MLP', + n_layers=3, + size=256, + activation=tf.tanh, + output_activation=None) + + # tf.train.write_graph(session.graph_def, ".", "train_pg_test_graph.pb", False) + + self.assertAllEqual(mlp_output_layer.get_shape().as_list(), [input_features, output_size]) + + with tf.Session() as session2: + with gfile.FastGFile("train_pg_test_graph.pb",'rb') as f: + expected_graph_def = tf.GraphDef() + expected_graph_def.ParseFromString(f.read()) + session2.graph.as_default() + tf.import_graph_def(expected_graph_def, name='') + + print(tf.get_default_graph().as_graph_def()) + tf.test.assert_equal_graph_def(tf.get_default_graph().as_graph_def(), session2.graph.as_graph_def()) + + def test_sum_discounted_rewards(self): + rewards = [1, 10, 100] + gamma = .9 + sum_discount_rewards = train_pg.sum_discounted_rewards(rewards, gamma) + self.assertEqual(sum_discount_rewards, 1 + 9 + 81) + + + def test_discounted_rewards_to_go(self): + rewards = [1, 10, 100] + gamma = .9 + rtgs = train_pg.discounted_rewards_to_go(rewards, gamma) + self.assertEqual(rtgs, [91, 100, 100]) + + def test_discrete_policy_network(self): + with self.test_session() as session: + sy_logits_na = tf.log([[2., 1.]]) + sy_logits_na_val = session.run(sy_logits_na) + self.assertArrayNear(sy_logits_na_val[0], [0.693147, 0.0], err=1e-4) + + # draw one sample from a multinomial distribution + logits_multinomial = tf.multinomial(sy_logits_na, 1, seed=1234) + logits_multinomial_val = session.run(logits_multinomial) + self.assertEqual(logits_multinomial_val, [[1]]) + + sampled_ac = tf.squeeze(logits_multinomial_val, axis=[1]) + sampled_ac_val = session.run(sampled_ac) + self.assertEqual(sampled_ac_val, [1]) + + # Compute the log probability of a set of actions that were actually taken, according to the policy. 
+ sy_ac_na = [0] + sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) + sy_logprob_n_val = session.run(sy_logprob_n) + self.assertArrayNear(sy_logprob_n_val, [0.4054651], err=1e-4) + + sy_ac_na_1 = [1] + sy_logprob_n_1 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na_1, logits=sy_logits_na) + sy_logprob_n_val_1 = session.run(sy_logprob_n_1) + self.assertArrayNear(sy_logprob_n_val_1, [0.4054651], err=1e-4) + + def test_continuous_policy_network(self): + with self.test_session() as session: + sy_mean = [1., -1] + sy_logstd = tf.log([1, 2.]) + sy_ac_na = [-1., 0] + + sampled_ac = tf.random_normal(shape=tf.shape(sy_mean), mean=sy_mean, stddev=tf.exp(sy_logstd), seed=1234) + sampled_ac_val = session.run(sampled_ac) + self.assertArrayNear(sampled_ac_val, [1.5134048, -1.5116279], err=1e-4) + + dist = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, + scale_diag=tf.exp(sy_logstd)) + + sy_logprob_n = -dist.log_prob(sy_ac_na).eval() + self.assertAlmostEqual(sy_logprob_n, 4.6560245) + +if __name__ == '__main__': + tf.test.main() diff --git a/hw2/train_pg_test_graph.pb b/hw2/train_pg_test_graph.pb new file mode 100644 index 000000000..69cff3fe7 Binary files /dev/null and b/hw2/train_pg_test_graph.pb differ diff --git a/hw2/videos/video_cartpole.mp4 b/hw2/videos/video_cartpole.mp4 new file mode 100644 index 000000000..3d7b736ee Binary files /dev/null and b/hw2/videos/video_cartpole.mp4 differ diff --git a/hw2/videos/video_invertedpendulum.mp4 b/hw2/videos/video_invertedpendulum.mp4 new file mode 100644 index 000000000..b2a26f3b2 Binary files /dev/null and b/hw2/videos/video_invertedpendulum.mp4 differ diff --git a/hw3/README b/hw3/README deleted file mode 100644 index 7d764b91e..000000000 --- a/hw3/README +++ /dev/null @@ -1,5 +0,0 @@ -See http://rll.berkeley.edu/deeprlcourse/f17docs/hw3.pdf for instructions - -The starter code was based on an implementation of Q-learning for Atari -generously provided by Szymon Sidor from OpenAI - diff --git a/hw3/README.md b/hw3/README.md new file mode 100644 index 000000000..12c92c0a9 --- /dev/null +++ b/hw3/README.md @@ -0,0 +1,20 @@ +## Run Training +``` +python run_dqn_atari.py +``` + +## Pong Rewards After 5 million steps + +![Atari Pong Returns](charts/pong_5M_rewards20.png) + + +## Video of Pong +[MP4](videos/openaigym.video.0.7441.video002000.mp4) + +## Homework Instructions + +See http://rll.berkeley.edu/deeprlcourse/f17docs/hw3.pdf for instructions + +The starter code was based on an implementation of Q-learning for Atari +generously provided by Szymon Sidor from OpenAI + diff --git a/hw3/charts/pong_5M_rewards20.png b/hw3/charts/pong_5M_rewards20.png new file mode 100644 index 000000000..1768619c9 Binary files /dev/null and b/hw3/charts/pong_5M_rewards20.png differ diff --git a/hw3/dqn.py b/hw3/dqn.py index 29f51474e..b85032858 100644 --- a/hw3/dqn.py +++ b/hw3/dqn.py @@ -11,6 +11,7 @@ OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) def learn(env, + results_dir, q_func, optimizer_spec, session, @@ -77,6 +78,9 @@ def learn(env, assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete + summary_writer_meanreward = tf.summary.FileWriter(results_dir + '/mean_reward') + summary_writer_bestmeanreward = tf.summary.FileWriter(results_dir + '/best_mean_reward') + ############### # BUILD MODEL # ############### @@ -127,7 +131,25 @@ def learn(env, # Older versions of TensorFlow may require using 
"VARIABLES" instead of "GLOBAL_VARIABLES" ###### - # YOUR CODE HERE + # Create the Q-function network + q_t = q_func(obs_t_float, num_actions, scope="online_q_func", reuse=False) + q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='online_q_func') + + # Create the target network + q_tp1 = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False) + target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') + + # q scores for actions which we know were selected in the given state. + q_t_selected = tf.reduce_sum(tf.multiply(q_t, tf.one_hot(act_t_ph, num_actions)), axis = 1) + + # compute best possible value with done mask, gamma, reward + q_t_selected_target = tf.reduce_max(q_tp1, axis=1) * (1.0 - done_mask_ph) * gamma + rew_t_ph + + # MSE + # total_error = tf.reduce_mean(tf.square(tf.subtract(q_t_selected_target, q_t_selected)) + + # Huber Loss + total_error = tf.reduce_mean(huber_loss(tf.subtract(q_t_selected_target, q_t_selected))) ###### @@ -194,7 +216,25 @@ def learn(env, ##### - # YOUR CODE HERE + buffer_idx = replay_buffer.store_frame(last_obs) + observation = replay_buffer.encode_recent_observation() + + observation = np.expand_dims(observation, axis=0) + + # epsilon greedy + if not model_initialized or np.random.rand(1) < exploration.value(t): + # randomly choose action + action = np.random.choice(num_actions) + else: + # greedily choose action from Q-function network + q_val = session.run(q_t, feed_dict={obs_t_ph: observation}) + action = np.argmax(np.squeeze(q_val)) + + last_obs, reward, done, _ = env.step(action) + replay_buffer.store_effect(buffer_idx, action, reward, done) + + if done: + last_obs = env.reset() ##### @@ -244,8 +284,24 @@ def learn(env, # variable num_param_updates useful for this (it was initialized to 0) ##### - # YOUR CODE HERE + obs_t_batch, act_t_batch, rew_t_batch, obs_tp1_batch, done_mask_batch = replay_buffer.sample(batch_size) + + if not model_initialized: + initialize_interdependent_variables(session, tf.global_variables(), { + obs_t_ph: obs_t_batch, + obs_tp1_ph: obs_tp1_batch, + }) + model_initialized = True + + _ = session.run(train_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, + rew_t_ph: rew_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, + learning_rate: optimizer_spec.lr_schedule.value(t) }) + num_param_updates += 1 + + if (num_param_updates % target_update_freq == 0): + session.run(update_target_fn) + print("target updated at round: {}".format(num_param_updates)) ##### ### 4. 
Log progress @@ -255,6 +311,14 @@ def learn(env, if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and model_initialized: + summary_meanreward = tf.Summary() + summary_meanreward.value.add(tag='global/mean_reward', simple_value=mean_episode_reward) + summary_writer_meanreward.add_summary(summary_meanreward, global_step=t) + + summary_bestmeanreward = tf.Summary() + summary_bestmeanreward.value.add(tag='global/mean_reward', simple_value=best_mean_episode_reward) + summary_writer_bestmeanreward.add_summary(summary_bestmeanreward, global_step=t) + print("Timestep %d" % (t,)) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) diff --git a/hw3/dqn_utils.py b/hw3/dqn_utils.py index 69cb9493b..88fe781b5 100644 --- a/hw3/dqn_utils.py +++ b/hw3/dqn_utils.py @@ -7,7 +7,8 @@ def huber_loss(x, delta=1.0): # https://en.wikipedia.org/wiki/Huber_loss - return tf.select( + # tf.select changed to tf.where + return tf.where( tf.abs(x) < delta, tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta) diff --git a/hw3/run_dqn_atari.py b/hw3/run_dqn_atari.py index 59815e246..926f03e92 100644 --- a/hw3/run_dqn_atari.py +++ b/hw3/run_dqn_atari.py @@ -1,3 +1,5 @@ +import os +import time import argparse import gym from gym import wrappers @@ -30,6 +32,7 @@ def atari_model(img_in, num_actions, scope, reuse=False): def atari_learn(env, session, + results_dir, num_timesteps): # This is just a rough estimate num_iterations = float(num_timesteps) / 4.0 @@ -62,6 +65,7 @@ def stopping_criterion(env, t): dqn.learn( env, + results_dir, q_func=atari_model, optimizer_spec=optimizer, session=session, @@ -110,11 +114,12 @@ def get_env(task, seed): set_global_seeds(seed) env.seed(seed) - expt_dir = '/tmp/hw3_vid_dir2/' - env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) + results_dir = os.path.join(os.getcwd(), 'results', time.strftime("%d-%m-%Y_%H-%M-%S")) + + env = wrappers.Monitor(env, results_dir, force=True) env = wrap_deepmind(env) - return env + return env, results_dir def main(): # Get Atari games. @@ -125,9 +130,9 @@ def main(): # Run training seed = 0 # Use a seed of zero (you may want to randomize the seed!) 
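Relating back to the Q-learning graph built in `dqn.py` above: the regression target for each transition is r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed on terminal transitions via the done mask, and the Huber loss is applied to the difference. A minimal NumPy sketch of that target and of the epsilon-greedy action choice (illustrative, not part of the patch):

```
import numpy as np

def td_targets(rewards, q_tp1, done_mask, gamma=0.99):
    """rewards: (B,); q_tp1: (B, num_actions) target-network values; done_mask: (B,), 1.0 if terminal."""
    return rewards + gamma * (1.0 - done_mask) * np.max(q_tp1, axis=1)

def epsilon_greedy(q_values, epsilon, rng=np.random):
    """Pick a random action with probability epsilon, otherwise the greedy one."""
    if rng.rand() < epsilon:
        return rng.randint(q_values.shape[-1])
    return int(np.argmax(q_values))
```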
- env = get_env(task, seed) + env, results_dir = get_env(task, seed) session = get_session() - atari_learn(env, session, num_timesteps=task.max_timesteps) + atari_learn(env, session, results_dir, num_timesteps=task.max_timesteps) if __name__ == "__main__": main() diff --git a/hw3/videos/openaigym.video.0.7441.video002000.mp4 b/hw3/videos/openaigym.video.0.7441.video002000.mp4 new file mode 100644 index 000000000..cf0df6cb8 Binary files /dev/null and b/hw3/videos/openaigym.video.0.7441.video002000.mp4 differ diff --git a/hw4/assets/halfcheetah_13iterations.mp4 b/hw4/assets/halfcheetah_13iterations.mp4 new file mode 100644 index 000000000..1ff55f271 Binary files /dev/null and b/hw4/assets/halfcheetah_13iterations.mp4 differ diff --git a/hw4/assets/halfcheetah_averagereturn_n15.png b/hw4/assets/halfcheetah_averagereturn_n15.png new file mode 100644 index 000000000..87b3836ce Binary files /dev/null and b/hw4/assets/halfcheetah_averagereturn_n15.png differ diff --git a/hw4/assets/halfcheetah_random.mp4 b/hw4/assets/halfcheetah_random.mp4 new file mode 100644 index 000000000..97888c404 Binary files /dev/null and b/hw4/assets/halfcheetah_random.mp4 differ diff --git a/hw4/cheetah_env2.py b/hw4/cheetah_env2.py new file mode 100644 index 000000000..da2507d94 --- /dev/null +++ b/hw4/cheetah_env2.py @@ -0,0 +1,22 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env +from gym.envs.mujoco.half_cheetah import HalfCheetahEnv + +class HalfCheetahTorsoEnv(HalfCheetahEnv, utils.EzPickle): + """ + Adds .get_body_com("torso").flat to observations and sets frame skip to 1 + """ + + def __init__(self, **kwargs): + # frame skip to 1 + mujoco_env.MujocoEnv.__init__(self, kwargs["model_path"], 1) + utils.EzPickle.__init__(self) + + + def _get_obs(self): + obs = np.concatenate([ + HalfCheetahEnv._get_obs(self), + self.get_body_com("torso").flat, + ]) + return obs \ No newline at end of file diff --git a/hw4/controllers.py b/hw4/controllers.py index 34b20e577..d597f8af0 100644 --- a/hw4/controllers.py +++ b/hw4/controllers.py @@ -13,13 +13,10 @@ def get_action(self, state): class RandomController(Controller): def __init__(self, env): - """ YOUR CODE HERE """ - pass + self.env = env def get_action(self, state): - """ YOUR CODE HERE """ - """ Your code should randomly sample an action uniformly from the action space """ - pass + return self.env.action_space.sample() class MPCcontroller(Controller): @@ -38,6 +35,24 @@ def __init__(self, self.num_simulated_paths = num_simulated_paths def get_action(self, state): - """ YOUR CODE HERE """ - """ Note: be careful to batch your simulations through the model for speed """ + obs, obs_list, obs_next_list, act_list = [], [], [], [] + + [obs.append(state) for _ in range(self.num_simulated_paths)] + + for _ in range(self.horizon): + obs_list.append(obs) + + # get random actions + actions = [] + [actions.append(self.env.action_space.sample()) for _ in range(self.num_simulated_paths)] + + act_list.append(actions) + obs = self.dyn_model.predict(np.array(obs), np.array(actions)) + obs_next_list.append(obs) + + trajectory_cost_list = trajectory_cost_fn(self.cost_fn, np.array(obs_list), np.array(act_list), np.array(obs_next_list)) + + j = np.argmin(trajectory_cost_list) + + return act_list[0][j] diff --git a/hw4/dynamics.py b/hw4/dynamics.py index 31adfa699..6da19b9b6 100644 --- a/hw4/dynamics.py +++ b/hw4/dynamics.py @@ -30,16 +30,72 @@ def __init__(self, learning_rate, sess ): - """ YOUR CODE HERE """ + self.mean_obs, self.std_obs, self.mean_deltas, 
self.std_deltas, self.mean_actions, self.std_actions = normalization + self.batch_size = batch_size + self.iterations = iterations + + self.sess = sess + # input placeholder, state/action pairs + self.state_act_placeholder = tf.placeholder(shape = [None, env.observation_space.shape[0] + env.action_space.shape[0]], + name = 'input_state_act', dtype = tf.float32) + # labels + self.deltas_placeholder = tf.placeholder(shape = [None, env.observation_space.shape[0]], name = 'deltas', dtype = tf.float32) + + # build MLP + self.deltas_predict = build_mlp(self.state_act_placeholder, env.observation_space.shape[0], + scope = 'model', n_layers = n_layers, size = size, + activation = activation, output_activation = output_activation) + + # MSE between deltas predicted and actual + self.loss = tf.reduce_mean(tf.square(self.deltas_predict - self.deltas_placeholder)) + self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) """ Note: Be careful about normalization """ def fit(self, data): """ Write a function to take in a dataset of (unnormalized)states, (unnormalized)actions, (unnormalized)next_states and fit the dynamics model going from normalized states, normalized actions to normalized state differences (s_t+1 - s_t) """ + # flatten + observations = np.concatenate([item['observations'] for item in data]) + actions = np.concatenate([item['actions'] for item in data]) + next_observations = np.concatenate([item['next_observations'] for item in data]) + + # normalize states and actions + obs_norm = (observations - self.mean_obs) / (self.std_obs + 1e-7) + acts_norm = (actions - self.mean_actions) / (self.std_actions + 1e-7) + + # normalize the state differences + deltas_obs_norm = ((next_observations - observations) - self.mean_deltas) / (self.std_deltas + 1e-7) + + obs_act_norm = np.concatenate((obs_norm, acts_norm), axis = 1) + + train_indices = np.arange(observations.shape[0]) + + for i in range(self.iterations): + np.random.shuffle(train_indices) + + for j in range((observations.shape[0] // self.batch_size) + 1): + start_index = j * self.batch_size + indices_shuffled = train_indices[start_index:start_index + self.batch_size] + + input_batch = obs_act_norm[indices_shuffled, :] + label_batch = deltas_obs_norm[indices_shuffled, :] + + self.sess.run([self.train_op], feed_dict = {self.state_act_placeholder: input_batch, self.deltas_placeholder: label_batch}) + - """YOUR CODE HERE """ def predict(self, states, actions): """ Write a function to take in a batch of (unnormalized) states and (unnormalized) actions and return the (unnormalized) next states as predicted by using the model """ - """ YOUR CODE HERE """ + # normalize the states and actions + obs_norm = (states - self.mean_obs) / (self.std_obs + 1e-7) + act_norm = (actions - self.mean_actions) / (self.std_actions + 1e-7) + + # concatenate normalized states and actions + obs_act_norm = np.concatenate((obs_norm, act_norm), axis=1 ) + + # predict the deltas between states and next states + deltas = self.sess.run(self.deltas_predict, feed_dict = {self.state_act_placeholder: obs_act_norm}) + + # calculate the next states using the predicted delta values and denormalize + return deltas * self.std_deltas + self.mean_deltas + states diff --git a/hw4/main.py b/hw4/main.py index 8f13a723f..233d37fcb 100644 --- a/hw4/main.py +++ b/hw4/main.py @@ -1,15 +1,18 @@ import numpy as np import tensorflow as tf import gym +from gym import wrappers +import gym.envs.mujoco from dynamics import NNDynamicsModel from controllers import MPCcontroller, 
RandomController from cost_functions import cheetah_cost_fn, trajectory_cost_fn import time import logz +import tqdm import os import copy import matplotlib.pyplot as plt -from cheetah_env import HalfCheetahEnvNew +from gym.envs.registration import registry, register, make, spec def sample(env, controller, @@ -23,8 +26,31 @@ def sample(env, Each path can have elements for observations, next_observations, rewards, returns, actions, etc. """ paths = [] - """ YOUR CODE HERE """ - + for _ in tqdm.tqdm(range(num_paths)): + observations, actions, next_observations, rewards = [], [], [], [] + observation = env.reset() + steps = 0 + while True: + if render: + env.render() + observations.append(observation) + action = controller.get_action(observation) + actions.append(action) + observation, reward, done, _ = env.step(action) + next_observations.append(observation) + rewards.append(reward) + steps += 1 + if done: + break + if steps >= horizon: + print('steps exceeeded horizon') + break + + path = {'observations': np.array(observations), + 'actions': np.array(actions), + 'next_observations': np.array(next_observations), + 'rewards': np.array(rewards)} + paths.append(path) return paths # Utility to compute cost a path for a given cost function @@ -37,8 +63,21 @@ def compute_normalization(data): Return 6 elements: mean of s_t, std of s_t, mean of (s_t+1 - s_t), std of (s_t+1 - s_t), mean of actions, std of actions """ - """ YOUR CODE HERE """ - return mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action + # flatten dataset across all paths + observations = np.concatenate([item['observations'] for item in data]) + next_observations = np.concatenate([item['next_observations'] for item in data]) + actions = np.concatenate([item['actions'] for item in data]) + + mean_obs = np.mean(observations, axis=0) + std_obs = np.std(observations, axis=0) + + mean_deltas = np.mean(next_observations - observations, axis=0) + std_deltas = np.std(next_observations - observations, axis=0) + + mean_actions = np.mean(actions, axis=0) + std_actions = np.std(actions, axis=0) + + return mean_obs, std_obs, mean_deltas, std_deltas, mean_actions, std_actions def plot_comparison(env, dyn_model): @@ -111,8 +150,7 @@ def train(env, random_controller = RandomController(env) - """ YOUR CODE HERE """ - + data = sample(env, random_controller, num_paths_random, env_horizon, render=render) #======================================================== # @@ -122,8 +160,7 @@ def train(env, # for normalizing inputs and denormalizing outputs # from the dynamics network. 
# - normalization = """ YOUR CODE HERE """ - + normalization = compute_normalization(data) #======================================================== # @@ -162,9 +199,17 @@ def train(env, # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): - """ YOUR CODE HERE """ - - + # refit the dynamics model to current dataset + dyn_model.fit(data) + + # take on-policy samples + paths = sample(env, mpc_controller, num_paths_onpol, env_horizon, render=render) + + # aggregate samples to the dataset + data = np.concatenate((data, paths)) + + returns = [np.sum(path['rewards']) for path in paths] + costs = [path_cost(cost_fn, path) for path in paths] # LOGGING # Statistics for performance of MPC policy using @@ -222,8 +267,18 @@ def main(): os.makedirs(logdir) # Make env + + register( + id='HalfCheetahTorso-v1', + entry_point='cheetah_env2:HalfCheetahTorsoEnv', + reward_threshold=4800.0, + max_episode_steps=args.ep_len, + kwargs= dict(model_path=os.path.dirname(gym.envs.mujoco.__file__) + "/assets/half_cheetah.xml") + ) + if args.env_name is "HalfCheetah-v1": - env = HalfCheetahEnvNew() + env = gym.make('HalfCheetahTorso-v1') + env = wrappers.Monitor(env, logdir, force=True) cost_fn = cheetah_cost_fn train(env=env, cost_fn=cost_fn,
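For reference, the MPC controller filled in above (`controllers.py`) is plain random shooting: sample many random action sequences, roll each through the learned dynamics model, score them with the trajectory cost, and execute only the first action of the cheapest sequence. A compact sketch under those assumptions, taking `dyn_model.predict` and `trajectory_cost_fn` to have the batched signatures used in the homework code:

```
import numpy as np

def random_shooting_action(state, env, dyn_model, cost_fn, trajectory_cost_fn,
                           horizon=15, num_paths=1000):
    """Return the first action of the lowest-cost randomly sampled action sequence."""
    obs = np.repeat(state[None, :], num_paths, axis=0)  # (K, obs_dim), one copy per candidate
    obs_list, act_list, next_obs_list = [], [], []
    for _ in range(horizon):
        acts = np.stack([env.action_space.sample() for _ in range(num_paths)])  # (K, act_dim)
        obs_list.append(obs)
        act_list.append(acts)
        obs = dyn_model.predict(obs, acts)  # batched one-step prediction
        next_obs_list.append(obs)
    costs = trajectory_cost_fn(cost_fn, np.array(obs_list), np.array(act_list), np.array(next_obs_list))
    return act_list[0][np.argmin(costs)]  # first action of the best sequence
```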