thanks #43

Open · wants to merge 41 commits into base: master
41 commits
ab5f2ee
create function to generate roll-out data files to data folder
hollygrimm Jun 19, 2018
ad25786
add ant, humanoid, reacher, walker data
hollygrimm Jun 19, 2018
f413b09
initial split and shuffle dataset code
hollygrimm Jun 20, 2018
04c7e4e
initial BC model code
hollygrimm Jun 20, 2018
02e64dc
initial main bc code
hollygrimm Jun 20, 2018
52b426c
revert, use load_policy for expert data
hollygrimm Jun 21, 2018
e0e02b5
add DAgger; gather expert experience when training
hollygrimm Jun 21, 2018
c1e994c
add logging; modify loss calc; use tf.contrib slim for fc layers
hollygrimm Jun 21, 2018
4e1a463
pass in data instead of data_file
hollygrimm Jun 21, 2018
b2e1b08
create function to generate rollout data
hollygrimm Jun 22, 2018
eb6ccac
refactor arguments, add num_rollouts var
hollygrimm Jun 22, 2018
3e1da24
stack and squeeze data after loading
hollygrimm Jun 22, 2018
9724bf4
load pre-saved rollout file; use test_train_split
hollygrimm Jun 22, 2018
ca3019c
add graphs
hollygrimm Jun 22, 2018
e598618
add tensorboard scalars; keep prob; validation loss
hollygrimm Jun 22, 2018
c826d06
add keep_prob; tensorboard scalars
hollygrimm Jun 22, 2018
6b42f7e
add Walker2d loss chart
hollygrimm Jun 22, 2018
35dacdf
add expert videos
hollygrimm Jun 22, 2018
2fa2666
walker2d videos
hollygrimm Jun 22, 2018
c4c9841
ignore intermediate files
hollygrimm Jun 25, 2018
729be06
code and test build_mlp
hollygrimm Jun 25, 2018
e33cf38
Added PG for discrete and continuous networks; Advantage Normalization
hollygrimm Jun 26, 2018
995826d
add charts
hollygrimm Jun 26, 2018
6701f68
add tests for disc rewards, discrete and continuous policy networks
hollygrimm Jun 26, 2018
2b5e471
add sparse softmax cross entropy computation
hollygrimm Jun 29, 2018
1fa10bf
add baseline code
hollygrimm Jun 29, 2018
5f123bc
invertedpendulum final chart
hollygrimm Jun 29, 2018
546b048
add videos
hollygrimm Jun 29, 2018
a2994e5
fix y axis title
hollygrimm Jun 30, 2018
2a99b56
add image of walker2d
hollygrimm Jul 2, 2018
37c7a46
use tf.where instead of tf.select
hollygrimm Jul 3, 2018
d520eaa
add all initial DQN code
hollygrimm Jul 3, 2018
8a52a4b
add two tf filewriters for tensorboard charts
hollygrimm Jul 4, 2018
f9a9b3d
create results_dir for gym videos and tensorboard logs
hollygrimm Jul 4, 2018
fd5b17e
add video
hollygrimm Jul 5, 2018
9e4aa86
add chart
hollygrimm Jul 5, 2018
7dc2e28
rename and add content
hollygrimm Jul 5, 2018
763bdc2
extend HalfCheetah, add observations, change frame skip
hollygrimm Jul 12, 2018
8c43a15
initial commit
hollygrimm Jul 12, 2018
54c9cee
add chart
hollygrimm Jul 13, 2018
209ac77
add videos
hollygrimm Jul 13, 2018
12 changes: 12 additions & 0 deletions .gitignore
@@ -87,3 +87,15 @@ ENV/

# Rope project settings
.ropeproject

# HW1 files
hw1/archive/
hw1/results/
.vscode/settings.json
hw1/images/*.xcf

# HW2 files
hw2/data/

# HW3 files
hw3/results/
246 changes: 246 additions & 0 deletions hw1/bc.py
@@ -0,0 +1,246 @@
from __future__ import print_function
import os
import sys
import logging
import argparse
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import gym
from gym import wrappers
import load_policy
import pickle
from sklearn.model_selection import train_test_split

from data.bc_data import Data
from models.bc_model import Model

def config_logging(log_file):
if os.path.exists(log_file):
os.remove(log_file)

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(message)s')

fh = logging.FileHandler(log_file)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

return logger

def create_model(session, obs_samples, num_observations, num_actions, logger, optimizer, learning_rate, restore, checkpoint_dir):
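"""Build the behavior-cloning Model, restoring weights from checkpoint_dir when
restore is True, otherwise initializing fresh variables."""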
model = Model(obs_samples, num_observations, num_actions, checkpoint_dir, logger, optimizer, learning_rate)

if restore:
model.load(session)
else:
logger.info("Created model with fresh parameters")
session.run(tf.global_variables_initializer())

return model

def gather_expert_experience(num_rollouts, env, policy_fn, max_steps):
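"""Roll out the expert policy for num_rollouts episodes and return the collected
observations, actions, and per-episode returns."""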
with tf.Session():  # policy_fn from load_policy runs TF ops, so it needs an active default session
returns = []
observations = []
actions = []
for _ in tqdm(range(num_rollouts)):
obs = env.reset()
done = False
totalr = 0.
steps = 0
while not done:
action = policy_fn(obs[None,:])
observations.append(obs)
actions.append(action)
obs, r, done, _ = env.step(action)
totalr += r
steps += 1
if steps >= max_steps:
break
returns.append(totalr)

expert_data = {'observations': np.stack(observations, axis=0),
'actions': np.squeeze(np.stack(actions, axis=0)),
'returns': np.array(returns)}
return expert_data
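
# Note: gather_expert_experience is not called from bc() or dagger() below; the
# pre-saved rollout .pkl files under data/ appear to be generated separately.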


def bc(expert_data_file, expert_policy_file, env_name, restore, results_dir,
num_rollouts, max_timesteps=None, optimizer='adam', num_epochs=100, learning_rate=.001, batch_size=32, keep_prob=1):
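"""Train a behavior-cloning policy on pre-saved expert rollouts, then evaluate it
over 10 episodes. Uses the module-level `logger` configured under __main__."""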
tf.reset_default_graph()

env = gym.make(env_name)
max_steps = max_timesteps or env.spec.timestep_limit

# data = Data(expert_data_file, train_ratio=0.9, val_ratio=0.05)

with open(expert_data_file, 'rb') as f:
data = pickle.loads(f.read())

obs = np.stack(data['observations'], axis=0)
actions = np.squeeze(np.stack(data['actions'], axis=0))

x_train, x_test, y_train, y_test = train_test_split(obs, actions, test_size=0.2)

num_samples = len(x_train)

min_val_loss = sys.maxsize

with tf.Session() as session:
model = create_model(session, x_train, x_train.shape[1], y_train.shape[1], logger, optimizer, learning_rate, restore, results_dir)

file_writer = tf.summary.FileWriter(results_dir, session.graph)

for epoch in tqdm(range(num_epochs)):
perm = np.random.permutation(x_train.shape[0])

obs_samples = x_train[perm]
action_samples = y_train[perm]

loss = 0.
for k in range(0, obs_samples.shape[0], batch_size):
batch_loss, training_scalar = model.update(session, obs_samples[k:k+batch_size],
action_samples[k:k+batch_size],
keep_prob)
loss += batch_loss

file_writer.add_summary(training_scalar, epoch)

min_val_loss, validation_scalar = validate(model, logger, session, x_test, y_test, epoch, batch_size, min_val_loss, results_dir)
file_writer.add_summary(validation_scalar, epoch)

new_exp = model.test_run(session, env, max_steps)
tqdm.write("Epoch %3d Loss %f Reward %f" % (epoch, loss/num_samples, new_exp['reward']))

env = wrappers.Monitor(env, results_dir, force=True)

results = []
for _ in tqdm(range(10)):
results.append(model.test_run(session, env, max_steps)['reward'])
logger.info("Reward mean and std dev with behavior cloning: %f(%f)"%(np.mean(results), np.std(results)))
return data['mean_return'], data['std_return'], np.mean(results), np.std(results)

def validate(model, logger, session, x_test, y_test, num_epoch, batch_size, min_loss, checkpoint_dir):
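"""Compute the validation loss for this epoch and checkpoint the model whenever
the loss improves on min_loss."""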
avg_loss = []

# Validation currently runs on the full test set in a single call; the batched
# loop is left commented out.
# for k in range(0, x_test.shape[0], batch_size):
loss, validation_scalar = model.validate(session, x_test, y_test)
avg_loss.append(loss)

new_loss = sum(avg_loss) / len(avg_loss)
logger.info("Finished epoch %d, average validation loss = %f" % (num_epoch, new_loss))

if new_loss < min_loss: # Only save model if val loss dropped
model.save(session)
min_loss = new_loss
return min_loss, validation_scalar

def dagger(expert_data_file, expert_policy_file, env_name, restore, results_dir,
num_rollouts, max_timesteps=None, optimizer='adam', num_epochs=40, learning_rate=.001, batch_size=32, keep_prob=1):
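"""Train with DAgger: each epoch, roll out the current policy, label the visited
states with the expert policy, and aggregate them into the training set.
Uses the module-level `logger` configured under __main__."""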
tf.reset_default_graph()

env = gym.make(env_name)
max_steps = max_timesteps or env.spec.timestep_limit

expert_policy_fn = load_policy.load_policy(expert_policy_file)

# data = Data(expert_data_file, train_ratio=0.9, val_ratio=0.05)

with open(expert_data_file, 'rb') as f:
data = pickle.loads(f.read())

obs = np.stack(data['observations'], axis=0)
actions = np.squeeze(np.stack(data['actions'], axis=0))

x_train, x_test, y_train, y_test = train_test_split(obs, actions, test_size=0.2)

min_val_loss = sys.maxsize

with tf.Session() as session:
model = create_model(session, x_train, x_train.shape[1], y_train.shape[1], logger, optimizer, learning_rate, restore, results_dir)

file_writer = tf.summary.FileWriter(results_dir, session.graph)

for epoch in tqdm(range(num_epochs)):
num_samples = x_train.shape[0]
perm = np.random.permutation(num_samples)

obsv_samples = x_train[perm]
action_samples = y_train[perm]

obsv_samples = np.stack(obsv_samples, axis=0)
action_samples = np.squeeze(np.stack(action_samples, axis=0))

loss = 0.
for k in range(0, obsv_samples.shape[0], batch_size):
batch_loss, training_scalar = model.update(session, obsv_samples[k:k+batch_size],
action_samples[k:k+batch_size],
keep_prob)
loss += batch_loss

file_writer.add_summary(training_scalar, epoch)

min_val_loss, validation_scalar = validate(model, logger, session, x_test, y_test, epoch, batch_size, min_val_loss, results_dir)
file_writer.add_summary(validation_scalar, epoch)

new_exp = model.test_run(session, env, max_steps)

# DAgger aggregation step: label the states visited by the current policy
# with the expert policy's actions, so the supervision signal comes from the expert.
new_exp_len = new_exp['observations'].shape[0]
expert_expected_actions = []
for k in range(0, new_exp_len, batch_size):
expert_expected_actions.append(expert_policy_fn(new_exp['observations'][k:k+batch_size]))

# Append the new experience to the original training set (no eviction).
x_train = np.concatenate((x_train, new_exp['observations']),
axis=0)
y_train = np.concatenate([y_train] + expert_expected_actions,
axis=0)
tqdm.write("Epoch %3d Loss %f Reward %f" % (epoch, loss/num_samples, new_exp['reward']))

env = wrappers.Monitor(env, results_dir, force=True)

results = []
for _ in tqdm(range(10)):
results.append(model.test_run(session, env, max_steps)['reward'])
logger.info("Reward mean and std dev with DAgger: %f(%f)"%(np.mean(results), np.std(results)))
return data['mean_return'], data['std_return'], np.mean(results), np.std(results)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
# argparse's type=bool treats any non-empty string as True, so use a store_true flag instead
parser.add_argument("--restore", action="store_true")
args = parser.parse_args()

log_file = os.path.join(os.getcwd(), 'results', 'train_out.log')
logger = config_logging(log_file)

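# Each tuple: (env name, pre-saved rollout data file, expert policy file, number of expert rollouts in that file)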
env_models = [('Ant-v1', 'data/Ant-v1_data_250_rollouts.pkl', 'experts/Ant-v1.pkl', 250),
('HalfCheetah-v1', 'data/HalfCheetah-v1_data_10_rollouts.pkl', 'experts/HalfCheetah-v1.pkl', 10),
('Hopper-v1', 'data/Hopper-v1_data_10_rollouts.pkl', 'experts/Hopper-v1.pkl', 10),
('Humanoid-v1', 'data/Humanoid-v1_data_250_rollouts.pkl', 'experts/Humanoid-v1.pkl', 250),
('Reacher-v1', 'data/Reacher-v1_data_250_rollouts.pkl', 'experts/Reacher-v1.pkl', 250),
('Walker2d-v1', 'data/Walker2d-v1_data_10_rollouts.pkl','experts/Walker2d-v1.pkl', 10)
]

results = []
for env_name, rollout_data, expert_policy_file, num_rollouts in env_models:
bc_results_dir = os.path.join(os.getcwd(), 'results', env_name, 'bc')
if not os.path.exists(bc_results_dir):
os.makedirs(bc_results_dir)
ex_mean, ex_std, bc_mean, bc_std = bc(rollout_data, expert_policy_file, env_name, args.restore, bc_results_dir, num_rollouts)

da_results_dir = os.path.join(os.getcwd(), 'results', env_name, 'da')
if not os.path.exists(da_results_dir):
os.makedirs(da_results_dir)
_, _, da_mean, da_std = dagger(rollout_data, expert_policy_file, env_name, args.restore, da_results_dir, num_rollouts)
results.append((env_name, ex_mean, ex_std, bc_mean, bc_std, da_mean, da_std))

for env_name, ex_mean, ex_std, bc_mean, bc_std, da_mean, da_std in results:
logger.info('Env: %s, Expert: %f(%f), Behavior Cloning: %f(%f), DAgger: %f(%f)' %
(env_name, ex_mean, ex_std, bc_mean, bc_std, da_mean, da_std))

1 change: 1 addition & 0 deletions hw1/data/.gitignore
@@ -0,0 +1 @@
*.pkl
87 changes: 87 additions & 0 deletions hw1/data/bc_data.py
@@ -0,0 +1,87 @@
import pickle
import numpy as np
from sklearn.utils import shuffle

# TODO: pass in logger


class Data(object):
def __init__(self, data_file, train_ratio, val_ratio):
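"""Load expert rollouts from data_file, shuffle them, split into train/val/test,
and normalize observations with training-set statistics."""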
data = pickle.load(open(data_file, "rb"))

self.expert_mean_return = data['mean_return']
self.expert_std_return = data['std_return']

self.train_ratio = train_ratio
self.val_ratio = val_ratio

obs = np.stack(data['observations'], axis=0)
actions = np.squeeze(np.stack(data['actions'], axis=0))
assert len(obs) == len(actions), "obs and action mismatch!"

obs, actions = shuffle(obs, actions, random_state=0)

self.num_observations = obs.shape[1]
self.num_actions = actions.shape[1]

print("Splitting dataset...")
self.train, self.val, self.test = self.split(obs, actions)

self.print_stat(self.train, "Training")
self.print_stat(self.val, "Validation")
self.print_stat(self.test, "Test")

obs_mean = np.mean(self.train["observations"], axis=0)
obs_std = np.std(self.train["observations"], axis=0)

print("Normalizing observations...")
self.pre_proc(self.train, obs_mean, obs_std)
self.pre_proc(self.val, obs_mean, obs_std)
self.pre_proc(self.test, obs_mean, obs_std)

def split(self, obs, actions):
"""Split the dataset into train, val, and test"""
n_total = len(obs)
n_train, n_val = int(n_total * self.train_ratio), int(n_total * self.val_ratio)

train_data = {"observations": obs[:n_train], "actions": actions[:n_train]}
val_data = {"observations": obs[n_train:n_train + n_val], "actions": actions[n_train:n_train + n_val]}
test_data = {"observations": obs[n_train + n_val:], "actions": actions[n_train + n_val:]}

return train_data, val_data, test_data

def get_small_dataset(self, num_data=100):
"""Return a subset of the training data"""
obs, actions = self.train["observations"], self.train["actions"]
idx = np.random.choice(np.arange(len(obs)), size=num_data, replace=False)
small_data = {"observations": obs[idx], "actions": actions[idx]}
return small_data

@staticmethod
def batch_iter(data, batch_size, num_epochs, shuffle=True):
"""Batch generator for a dataset"""
num_data = len(data["observations"])
num_batch_per_epoch = int((num_data-1) / batch_size) + 1

for epoch in range(num_epochs):
obs, actions = data["observations"], data["actions"]
if shuffle:
idx = np.random.permutation(np.arange(num_data))
obs = obs[idx]
actions = actions[idx]
for i in range(num_batch_per_epoch):
start_idx = i * batch_size
end_idx = min((i + 1) * batch_size, num_data)
yield obs[start_idx:end_idx], actions[start_idx:end_idx]

@staticmethod
def print_stat(data, title):
obs, actions = data["observations"], data["actions"]
print("%s Observations %s, mean: %s" % (title, str(obs.shape), str(np.mean(obs, axis=0))))
print("%s Actions %s, mean: %s" % (title, str(actions.shape), str(np.mean(actions, axis=0))))

@staticmethod
def pre_proc(data, mean, std):
"""Normalize observations"""
obs = data["observations"]
data["observations"] = (obs - mean) / (std + 1e-6) # See load_policy.py
Binary file added hw1/images/ant_expert_250.mp4
Binary file not shown.
Binary file added hw1/images/graph.png
Binary file added hw1/images/halfcheetah_expert_250.mp4
Binary file not shown.
Binary file added hw1/images/halfcheetah_val_loss.png
Binary file added hw1/images/hopper_expert_250.mp4
Binary file not shown.
Binary file added hw1/images/humanoid_expert_250.mp4
Binary file not shown.
Binary file added hw1/images/reacher_expert_250.mp4
Binary file not shown.
Binary file added hw1/images/walk2d_frame.png
Binary file added hw1/images/walker_bc.mp4
Binary file not shown.
Binary file added hw1/images/walker_da.mp4
Binary file not shown.
Binary file added hw1/images/walker_expert_250.mp4
Binary file not shown.
Binary file added hw1/images/walker_val_loss.png