From 8fc26ab4d9d228ee45a9fec9caf42c6600056144 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Wed, 22 Nov 2017 17:47:53 -0800
Subject: [PATCH 1/4] Use TensorFlow's sparse tensors for VI

---
 deep_maxent_irl.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index e20acb4..d0fc3f0 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -23,11 +23,14 @@ def __init__(self, n_input, n_actions, lr, n_h1=400, n_h2=300, l2=10, name='deep
     self.n_h2 = n_h2
     self.name = name
 
-    self.sess = tf.Session()
+    config = tf.ConfigProto()
+    config.gpu_options.allow_growth = True
+    self.sess = tf.Session(config=config)
     self.input_s, self.reward, self.theta = self._build_network(self.name)
 
     # value iteration
-    self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
+    self.P_a = tf.sparse_placeholder(tf.float32, shape=(n_input, n_actions, n_input))
+
     self.gamma = tf.placeholder(tf.float32)
     self.epsilon = tf.placeholder(tf.float32)
     self.values, self.policy = self._vi(self.reward)
@@ -65,7 +68,7 @@ def _vi(self, rewards):
 
     def body(i, c, t):
       old_values = t.read(i)
-      new_values = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
+      new_values = tf.sparse_reduce_max(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
       c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
       c.set_shape(())
       t = t.write(i + 1, new_values)
@@ -80,7 +83,7 @@ def condition(i, c, t):
 
     i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop')
     values = values.read(i)
-    policy = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2), axis=1)
+    policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1)
 
     return values, policy
 
@@ -199,19 +202,27 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
 
+  P_a_t = P_a.transpose(0, 2, 1)
+  mask = P_a_t > 0
+  indices = np.argwhere(mask)
+  P_a_sparse = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+  del P_a_t
+
   # training
   for iteration in range(n_iters):
     if iteration % (n_iters/10) == 0:
       print 'iteration: {}'.format(iteration)
 
     # compute the reward matrix
-    # rewards = nn_r.get_rewards(feat_map)
+    #rewards = nn_r.get_rewards(feat_map)
 
     # compute policy
-    # _, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
+    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
 
     # compute rewards and policy at the same time
-    rewards, _, policy = nn_r.get_policy(feat_map, P_a.transpose(0, 2, 1), gamma, 0.01)
+    t = time.time()
+    rewards, _, policy = nn_r.get_policy(feat_map, P_a_sparse, gamma, 0.01)
+    print('tensorflow vi', time.time() - t)
 
     # compute expected svf
     mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

From 9e88153d8b4a94410c1b85a2d1f19c028ae9231f Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Wed, 22 Nov 2017 19:59:18 -0800
Subject: [PATCH 2/4] Add program argument for sparse tensors

---
 deep_maxent_irl.py           | 4 ++--
 deep_maxent_irl_gridworld.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 4400024..9d99092 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -186,7 +186,7 @@ def demo_svf(trajs, n_states):
   p = p/len(trajs)
   return p
 
-def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
+def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   """
   Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)
 
@@ -209,7 +209,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   N_STATES, _, N_ACTIONS = np.shape(P_a)
 
   # init nn model
-  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3)
+  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3, sparse)
 
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index 720b379..845dd2e 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -27,6 +27,7 @@ PARSER.set_defaults(rand_start=True)
 PARSER.add_argument('-lr', '--learning_rate', default=0.02, type=float, help='learning rate')
 PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations')
+PARSER.add_argument('-s', '--sparse', default=False, action='store_true', help='flag to use sparse tensors in tf')
 ARGS = PARSER.parse_args()
 print ARGS
 
 
@@ -100,7 +101,7 @@ def main():
 
   print 'Deep Max Ent IRL training ..'
   t = time.time()
-  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
+  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.sparse)
   print('time for dirl', time.time() - t)
 
   values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

From 6e63f9f8a5e4e927fad582769c7cfed378327655 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Thu, 23 Nov 2017 20:23:54 -0800
Subject: [PATCH 3/4] Use sparse tensors only when the sparse flag is set

---
 deep_maxent_irl.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 9d99092..ef587ac 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -214,6 +214,12 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
 
+  P_a_t = P_a.transpose(0, 2, 1)
+  if sparse:
+    mask = P_a_t > 0
+    indices = np.argwhere(mask)
+    P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+
   # training
   for iteration in range(n_iters):
     if iteration % (n_iters/10) == 0:
@@ -227,7 +233,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
 
     # compute rewards and policy at the same time
     t = time.time()
-    rewards, _, policy = nn_r.get_policy(feat_map, P_a.transpose(0, 2, 1), gamma, 0.01)
+    rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01)
     print('tensorflow VI', time.time() - t)
 
     # compute expected svf

From 62949c5c513c5843cf8c6f0abe633cf13db862b4 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 24 Nov 2017 13:57:18 -0800
Subject: [PATCH 4/4] Pass sparse to DeepIRLFC as a keyword argument

---
 deep_maxent_irl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index ef587ac..b641b1a 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -209,7 +209,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   N_STATES, _, N_ACTIONS = np.shape(P_a)
 
   # init nn model
-  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3, sparse)
+  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, 3, 3, sparse=sparse)
 
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
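
A note on the technique used across these patches: deep_maxent_irl keeps the transition model as a dense numpy array P_a[s, s', a], transposes it to [s, a, s'], and feeds only its non-zero entries to the graph as a tf.SparseTensorValue bound to a tf.sparse_placeholder, so the Bellman backup inside _vi runs on sparse ops. The snippet below is a minimal, self-contained sketch of that idea, not code from the repository: it assumes TensorFlow 1.x, and the toy deterministic MDP, its sizes, and all variable names are illustrative. It builds the feed value the same way the patches do and checks one sparse backup against the dense equivalent.

import numpy as np
import tensorflow as tf

n_states, n_actions = 5, 3

# Toy deterministic MDP, indexed P_a[s, s', a] like the repository's transition model:
# action a in state s moves to state (s + a) % n_states with probability 1.
P_a = np.zeros((n_states, n_states, n_actions), dtype=np.float32)
for s in range(n_states):
  for a in range(n_actions):
    P_a[s, (s + a) % n_states, a] = 1.0

rewards_np = np.random.rand(n_states).astype(np.float32)
values_np = np.zeros(n_states, dtype=np.float32)  # current value estimates
gamma_np = 0.9

# Same conversion the patches perform: transpose to [s, a, s'] and keep the non-zeros.
P_a_t = P_a.transpose(0, 2, 1)
mask = P_a_t > 0
indices = np.argwhere(mask)
P_a_sparse = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)

# Graph: one Bellman backup through the sparse placeholder, mirroring the body of _vi.
P_a_sp = tf.sparse_placeholder(tf.float32, shape=(n_states, n_actions, n_states))
rewards = tf.placeholder(tf.float32, shape=(n_states,))
gamma = tf.placeholder(tf.float32)
values = tf.placeholder(tf.float32, shape=(n_states,))

# sparse * dense keeps the sparsity pattern of P_a_sp; the dense vector broadcasts over s'.
q_sp = P_a_sp * (rewards + gamma * values)
new_values_sp = tf.sparse_reduce_max(tf.sparse_reduce_sum_sparse(q_sp, axis=2), axis=1)

# Dense reference for the same backup.
P_a_dense = tf.placeholder(tf.float32, shape=(n_states, n_actions, n_states))
new_values_dense = tf.reduce_max(tf.reduce_sum(P_a_dense * (rewards + gamma * values), axis=2), axis=1)

with tf.Session() as sess:
  sp_out, dense_out = sess.run(
      [new_values_sp, new_values_dense],
      feed_dict={P_a_sp: P_a_sparse, P_a_dense: P_a_t,
                 rewards: rewards_np, gamma: gamma_np, values: values_np})
  print(np.allclose(sp_out, dense_out))  # expect True: both backups agree

Because every (s, a) pair in this toy model has at least one non-zero transition, the sparse and dense reductions see the same entries; a transition model with completely empty (s, a) rows would need extra care with the sparse reductions.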