From 8fc26ab4d9d228ee45a9fec9caf42c6600056144 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Wed, 22 Nov 2017 17:47:53 -0800
Subject: [PATCH 1/4] Use TensorFlow's sparse tensors for VI

---
 deep_maxent_irl.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index e20acb4..d0fc3f0 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -23,11 +23,14 @@ def __init__(self, n_input, n_actions, lr, n_h1=400, n_h2=300, l2=10, name='deep
     self.n_h2 = n_h2
     self.name = name
 
-    self.sess = tf.Session()
+    config = tf.ConfigProto()
+    config.gpu_options.allow_growth = True
+    self.sess = tf.Session(config=config)
     self.input_s, self.reward, self.theta = self._build_network(self.name)
 
     # value iteration
-    self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
+    self.P_a = tf.sparse_placeholder(tf.float32, shape=(n_input, n_actions, n_input))
+
     self.gamma = tf.placeholder(tf.float32)
     self.epsilon = tf.placeholder(tf.float32)
     self.values, self.policy = self._vi(self.reward)
@@ -65,7 +68,7 @@ def _vi(self, rewards):
 
     def body(i, c, t):
       old_values = t.read(i)
-      new_values = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
+      new_values = tf.sparse_reduce_max(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
       c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
       c.set_shape(())
       t = t.write(i + 1, new_values)
@@ -80,7 +83,7 @@ def condition(i, c, t):
 
     i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop')
     values = values.read(i)
-    policy = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2), axis=1)
+    policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1)
 
     return values, policy
 
@@ -199,19 +202,27 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
 
+  P_a_t = P_a.transpose(0, 2, 1)
+  mask = P_a_t > 0
+  indices = np.argwhere(mask)
+  P_a_sparse = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+  del P_a_t
+
   # training
   for iteration in range(n_iters):
     if iteration % (n_iters/10) == 0:
       print 'iteration: {}'.format(iteration)
 
     # compute the reward matrix
-    # rewards = nn_r.get_rewards(feat_map)
+    #rewards = nn_r.get_rewards(feat_map)
 
     # compute policy
-    # _, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
+    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
 
     # compute rewards and policy at the same time
-    rewards, _, policy = nn_r.get_policy(feat_map, P_a.transpose(0, 2, 1), gamma, 0.01)
+    t = time.time()
+    rewards, _, policy = nn_r.get_policy(feat_map, P_a_sparse, gamma, 0.01)
+    print('tensorflow vi', time.time() - t)
 
     # compute expected svf
     mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

From 9e88153d8b4a94410c1b85a2d1f19c028ae9231f Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Wed, 22 Nov 2017 19:59:18 -0800
Subject: [PATCH 2/4] Add program argument for sparse tensors

---
 deep_maxent_irl.py           | 4 ++--
 deep_maxent_irl_gridworld.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 4400024..9d99092 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -186,7 +186,7 @@ def demo_svf(trajs, n_states):
   p = p/len(trajs)
   return p
 
-def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
+def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   """
   Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)
 
@@ -209,7 +209,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   N_STATES, _, N_ACTIONS = np.shape(P_a)
 
   # init nn model
-  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3)
+  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3, sparse)
 
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index 720b379..845dd2e 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -27,6 +27,7 @@ PARSER.set_defaults(rand_start=True)
 PARSER.add_argument('-lr', '--learning_rate', default=0.02, type=float, help='learning rate')
 PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations')
+PARSER.add_argument('-s', '--sparse', default=False, action='store_true', help='flag to use sparse tensors in tf')
 ARGS = PARSER.parse_args()
 print ARGS
 
 
@@ -100,7 +101,7 @@ def main():
 
   print 'Deep Max Ent IRL training ..'
   t = time.time()
-  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
+  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.sparse)
   print('time for dirl', time.time() - t)
 
   values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

From 6e63f9f8a5e4e927fad582769c7cfed378327655 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Thu, 23 Nov 2017 20:23:54 -0800
Subject: [PATCH 3/4] Use sparse tensors only when the sparse flag is set

---
 deep_maxent_irl.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 9d99092..ef587ac 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -214,6 +214,12 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
 
+  P_a_t = P_a.transpose(0, 2, 1)
+  if sparse:
+    mask = P_a_t > 0
+    indices = np.argwhere(mask)
+    P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+
   # training
   for iteration in range(n_iters):
     if iteration % (n_iters/10) == 0:
@@ -227,7 +233,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
 
     # compute rewards and policy at the same time
     t = time.time()
-    rewards, _, policy = nn_r.get_policy(feat_map, P_a.transpose(0, 2, 1), gamma, 0.01)
+    rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01)
     print('tensorflow VI', time.time() - t)
 
     # compute expected svf

From 62949c5c513c5843cf8c6f0abe633cf13db862b4 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 24 Nov 2017 13:57:18 -0800
Subject: [PATCH 4/4] Pass sparse to DeepIRLFC as a keyword argument

---
 deep_maxent_irl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index ef587ac..b641b1a 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -209,7 +209,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   N_STATES, _, N_ACTIONS = np.shape(P_a)
 
   # init nn model
-  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3, sparse)
+  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3, sparse=sparse)
 
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
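
A note on the technique used across these patches: deep_maxent_irl keeps the transition model as a dense numpy array P_a[s, s', a], transposes it to [s, a, s'], and feeds only its non-zero entries to the graph as a tf.SparseTensorValue bound to a tf.sparse_placeholder, so the Bellman backup inside _vi runs on sparse ops. The snippet below is a minimal, self-contained sketch of that idea, not code from the repository: it assumes TensorFlow 1.x, and the toy deterministic MDP, its sizes, and all variable names are illustrative. It builds the feed value the same way the patches do and checks one sparse backup against the dense equivalent.

import numpy as np
import tensorflow as tf

n_states, n_actions = 5, 3

# Toy deterministic MDP, indexed P_a[s, s', a] like the repository's transition model:
# action a in state s moves to state (s + a) % n_states with probability 1.
P_a = np.zeros((n_states, n_states, n_actions), dtype=np.float32)
for s in range(n_states):
  for a in range(n_actions):
    P_a[s, (s + a) % n_states, a] = 1.0

rewards_np = np.random.rand(n_states).astype(np.float32)
values_np = np.zeros(n_states, dtype=np.float32)  # current value estimates
gamma_np = 0.9

# Same conversion the patches perform: transpose to [s, a, s'] and keep the non-zeros.
P_a_t = P_a.transpose(0, 2, 1)
mask = P_a_t > 0
indices = np.argwhere(mask)
P_a_sparse = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)

# Graph: one Bellman backup through the sparse placeholder, mirroring the body of _vi.
P_a_sp = tf.sparse_placeholder(tf.float32, shape=(n_states, n_actions, n_states))
rewards = tf.placeholder(tf.float32, shape=(n_states,))
gamma = tf.placeholder(tf.float32)
values = tf.placeholder(tf.float32, shape=(n_states,))

# sparse * dense keeps the sparsity pattern of P_a_sp; the dense vector broadcasts over s'.
q_sp = P_a_sp * (rewards + gamma * values)
new_values_sp = tf.sparse_reduce_max(tf.sparse_reduce_sum_sparse(q_sp, axis=2), axis=1)

# Dense reference for the same backup.
P_a_dense = tf.placeholder(tf.float32, shape=(n_states, n_actions, n_states))
new_values_dense = tf.reduce_max(tf.reduce_sum(P_a_dense * (rewards + gamma * values), axis=2), axis=1)

with tf.Session() as sess:
  sp_out, dense_out = sess.run(
      [new_values_sp, new_values_dense],
      feed_dict={P_a_sp: P_a_sparse, P_a_dense: P_a_t,
                 rewards: rewards_np, gamma: gamma_np, values: values_np})
  print(np.allclose(sp_out, dense_out))  # expect True: both backups agree

Because every (s, a) pair in this toy model has at least one non-zero transition, the sparse and dense reductions see the same entries; a transition model with completely empty (s, a) rows would need extra care with the sparse reductions.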