From 861e83e0952efdc16a355a989915a08c8e00e182 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Tue, 28 Nov 2017 20:45:36 -0800 Subject: [PATCH 01/10] Debug to fix tensorflow VI --- deep_maxent_irl.py | 24 ++++++++++--- deep_maxent_irl_gridworld.py | 2 +- maxent_irl.py | 36 ------------------- mdp/value_iteration.py | 70 ++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 42 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 5f23563..95e126d 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -68,19 +68,31 @@ def _build_network(self, name): initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) reward = tf_utils.fc(fc2, 1, scope="reward") theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) - return input_s, reward, theta + return input_s, tf.squeeze(reward), theta def _vi(self, rewards): - rewards = tf.squeeze(rewards) - + rewards = tf.Print(rewards, [rewards], 'rewards: ', summarize=500) + P_a = tf.Print(self.P_a, [self.P_a], 'P_a: ', summarize=500) def body(i, c, t): old_values = t.read(i) if self.sparse: new_values = tf.sparse_reduce_max( tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) else: - new_values = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) + old_values = tf.Print(old_values, [old_values], 'old_values', summarize=500) + rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) + tmp = (rewards_expanded + self.gamma * old_values) + tmp = tf.Print(tmp, [tmp], 'tmp: ', summarize=500) + tmp = tf.tile(tf.expand_dims(tmp, 1), [1, tf.shape(P_a)[1], 1]) + mm = P_a * tmp + mm = tf.Print(mm, [mm], 'mm', summarize=500) + + ss = tf.reduce_sum(mm, axis=2) + ss = tf.Print(ss, [ss], 'ss', summarize=500) + new_values = tf.reduce_max(ss, axis=1) + + new_values = tf.Print(new_values, [new_values], 'new_values', summarize=500) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon c.set_shape(()) @@ -95,7 +107,7 @@ def condition(i, c, t): i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop') - values = values.read(i) + values = values.read(tf.Print(i, [i], 'i: ')) if self.deterministic: if self.sparse: @@ -346,7 +358,9 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=True) assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=True) + assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) + assert (np.abs(assert_values - assert_values2) < 0.0001).all() assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) < 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index e1c62e6..44108e8 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -112,7 +112,7 @@ def main(): values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True) - print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy)) + #print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy)) # plots plt.figure(figsize=(20,4)) diff --git 
a/maxent_irl.py b/maxent_irl.py index e528006..20559e5 100644 --- a/maxent_irl.py +++ b/maxent_irl.py @@ -104,39 +104,3 @@ def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters): rewards = np.dot(feat_map, theta) # return sigmoid(normalize(rewards)) return normalize(rewards) - -def value(policy, n_states, transition_probabilities, reward, discount, - threshold=1e-2): - """ - FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10 - - Find the value function associated with a policy. - - policy: List of action ints for each state. - n_states: Number of states. int. - transition_probabilities: Function taking (state, action, state) to - transition probabilities. - reward: Vector of rewards for each state. - discount: MDP discount factor. float. - threshold: Convergence threshold, default 1e-2. float. - -> Array of values for each state - """ - v = np.zeros(n_states) - - diff = float("inf") - while diff > threshold: - diff = 0 - for s in range(n_states): - vs = v[s] - a = policy[s] - v[s] = sum(transition_probabilities[s, a, k] * - (reward[k] + discount * v[k]) - for k in range(n_states)) - diff = max(diff, abs(vs - v[s])) - - return v - -def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True): - v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma) - - return optimal_value.dot(p_start) - v.dot(p_start) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 5a42296..c81e12b 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -50,6 +50,9 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): v_s = [] values[s] = max([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + print([P_a[s, s1, :] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) + print([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in + range(N_ACTIONS)]) if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: break @@ -347,8 +350,75 @@ def get_action(self, state): return actions[a_id] +def optimal_value(n_states, n_actions, transition_probabilities, reward, + discount, threshold=1e-2): + """ + FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py + Find the optimal value function. + n_states: Number of states. int. + n_actions: Number of actions. int. + transition_probabilities: Function taking (state, action, state) to + transition probabilities. + reward: Vector of rewards for each state. + discount: MDP discount factor. float. + threshold: Convergence threshold, default 1e-2. float. 
+ -> Array of values for each state + """ + + v = np.zeros(n_states) + new_v = np.zeros(n_states) + + diff = float("inf") + while diff > threshold: + diff = 0 + v = new_v.copy() + for s in range(n_states): + max_v = float("-inf") + for a in range(n_actions): + tp = transition_probabilities[s, a, :] + max_v = max(max_v, np.dot(tp, reward[s] + discount * v)) + max_v = max(max_v, np.sum(tp * (reward[s] + discount*v))) + new_diff = abs(v[s] - max_v) + if new_diff > diff: + diff = new_diff + new_v[s] = max_v + + return v + + +def value(policy, n_states, transition_probabilities, reward, discount, + threshold=1e-2): + """ + FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10 + + Find the value function associated with a policy. + + policy: List of action ints for each state. + n_states: Number of states. int. + transition_probabilities: Function taking (state, action, state) to + transition probabilities. + reward: Vector of rewards for each state. + discount: MDP discount factor. float. + threshold: Convergence threshold, default 1e-2. float. + -> Array of values for each state + """ + v = np.zeros(n_states) + diff = float("inf") + while diff > threshold: + diff = 0 + for s in range(n_states): + vs = v[s] + a = policy[s] + v[s] = sum(transition_probabilities[s, a, k] * + (reward[k] + discount * v[k]) + for k in range(n_states)) + diff = max(diff, abs(vs - v[s])) + return v +def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True): + v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma) + return optimal_value.dot(p_start) - v.dot(p_start) \ No newline at end of file From 2fa875eaeccbf827368309fde86449afa57d3f37 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Tue, 28 Nov 2017 21:43:04 -0800 Subject: [PATCH 02/10] Fix tf policy --- deep_maxent_irl.py | 34 ++++++++++++++++++++-------------- mdp/value_iteration.py | 28 ++++++++++++++++++---------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 95e126d..c26f789 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -73,6 +73,8 @@ def _build_network(self, name): def _vi(self, rewards): rewards = tf.Print(rewards, [rewards], 'rewards: ', summarize=500) + rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) + P_a = tf.Print(self.P_a, [self.P_a], 'P_a: ', summarize=500) def body(i, c, t): old_values = t.read(i) @@ -81,11 +83,10 @@ def body(i, c, t): tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) else: old_values = tf.Print(old_values, [old_values], 'old_values', summarize=500) - rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) - tmp = (rewards_expanded + self.gamma * old_values) - tmp = tf.Print(tmp, [tmp], 'tmp: ', summarize=500) - tmp = tf.tile(tf.expand_dims(tmp, 1), [1, tf.shape(P_a)[1], 1]) - mm = P_a * tmp + expected_value = rewards_expanded + self.gamma * old_values + expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) + mm = P_a * expected_value mm = tf.Print(mm, [mm], 'mm', summarize=500) ss = tf.reduce_sum(mm, axis=2) @@ -109,15 +110,20 @@ def condition(i, c, t): name='VI_loop') values = values.read(tf.Print(i, [i], 'i: ')) + expected_value = (rewards_expanded + self.gamma * values) + expected_value = 
tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) + if self.deterministic: if self.sparse: policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1) else: - policy = tf.argmax(tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2), axis=1) + + policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) else: if self.sparse: policy = tf.sparse_tensor_to_dense( - tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)) + tf.sparse_reduce_sum_sparse(self.P_a * expected_value, axis=2)) else: policy = tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2) @@ -321,7 +327,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) @@ -342,7 +348,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) # compute rewards and policy at the same time #t = time.time() @@ -350,14 +356,14 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) # compute gradients on rewards: grad_r = mu_D - mu_exp - assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=True) - assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=True) + assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=False) + assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=False) assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) assert (np.abs(assert_values - assert_values2) < 0.0001).all() @@ -369,8 +375,8 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert (np.abs(policy - assert_policy) < 0.001).all() assert (np.abs(policy - assert_policy_old) < 0.001).all() - assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all() - assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all() + assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() + assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() # apply 
gradients to the neural network grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index c81e12b..27382b6 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -17,8 +17,12 @@ def softmax(x): """Compute softmax values for each sets of scores in x.""" - e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis]) - return e_x / e_x.sum(axis=-1)[:, np.newaxis] + if len(x.shape) > 1: + e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis]) + return e_x / e_x.sum(axis=-1)[:, np.newaxis] + else: + e_x = np.exp(x - np.max(x)) + return e_x / e_x.sum() def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): """ @@ -109,6 +113,7 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): if chunk_size == 0: chunk_size = N_STATES + rewards_expanded = rewards[:, np.newaxis].repeat(N_STATES, axis=1) count = 0 # estimate values while True: @@ -116,10 +121,10 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): values_tmp = values.copy() def step(start, end): - tmp = rewards[start:end, np.newaxis].repeat(N_STATES, axis=1) + gamma * values_tmp - tmp = tmp[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - tmp = np.transpose(tmp, (0, 2, 1)) - values[start:end] = (P[start:end, :, :] * tmp).sum(axis=2).max(axis=1) + expected_value = rewards_expanded[start:end, :] + gamma * values_tmp + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + expected_value = np.transpose(expected_value, (0, 2, 1)) + values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1) with ThreadPoolExecutor(max_workers=num_cpus) as e: futures = list() @@ -137,17 +142,20 @@ def step(start, end): print('VI', count) break + expected_value = rewards_expanded + gamma * values_tmp + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + expected_value = np.transpose(expected_value, (0, 2, 1)) + + if deterministic: # generate deterministic policy - policy = np.argmax((P * (rewards + gamma * values_tmp)).sum(axis=2), axis=1) - + policy = np.argmax((P * expected_value).sum(axis=2), axis=1) print(time.time() - t) return values, policy else: - # generate stochastic policy - policy = (P * (rewards + gamma * values_tmp)).sum(axis=2) + policy = (P * (rewards + expected_value)).sum(axis=2) policy = softmax(policy) print(time.time() - t) From 202d39417d46f92d16f7d27d41987a9680557905 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Tue, 28 Nov 2017 21:50:05 -0800 Subject: [PATCH 03/10] Fix stochastic policy (tf and numpy) --- deep_maxent_irl.py | 2 +- mdp/value_iteration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index c26f789..8db5a8e 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -125,7 +125,7 @@ def condition(i, c, t): policy = tf.sparse_tensor_to_dense( tf.sparse_reduce_sum_sparse(self.P_a * expected_value, axis=2)) else: - policy = tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2) + policy = tf.reduce_sum(self.P_a * expected_value, axis=2) policy = tf.nn.softmax(policy) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 27382b6..a631bfc 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -155,7 +155,7 @@ def step(start, end): return values, policy else: # generate stochastic policy - policy = (P * (rewards + expected_value)).sum(axis=2) + policy = (P * expected_value).sum(axis=2) policy = softmax(policy) 
print(time.time() - t) From 76291c4e01657490c0268140a252c3ab642f3c3a Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 10:36:17 -0800 Subject: [PATCH 04/10] minor things --- deep_maxent_irl.py | 22 +++++++++++++++------- mdp/value_iteration.py | 3 --- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 8db5a8e..95f533e 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -110,7 +110,7 @@ def condition(i, c, t): name='VI_loop') values = values.read(tf.Print(i, [i], 'i: ')) - expected_value = (rewards_expanded + self.gamma * values) + expected_value = rewards_expanded + self.gamma * values expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) @@ -345,7 +345,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): print 'iteration: {}'.format(iteration) # compute the reward matrix - rewards = nn_r.get_rewards(feat_map) + # rewards = nn_r.get_rewards(feat_map) # compute policy #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) @@ -359,8 +359,6 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - # compute gradients on rewards: - grad_r = mu_D - mu_exp assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=False) assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=False) @@ -370,14 +368,24 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) < 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() - + print 'iteration: {}'.format(iteration) + + print(assert_policy) + print(assert_policy_old) + print(policy) + print(values) + print(assert_values) + print(rewards) assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all() - assert (np.abs(policy - assert_policy) < 0.001).all() - assert (np.abs(policy - assert_policy_old) < 0.001).all() + assert (np.abs(policy - assert_policy) < 0.0001).all() + assert (np.abs(policy - assert_policy_old) < 0.0001).all() assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() + # compute gradients on rewards: + grad_r = mu_D - mu_exp + # apply gradients to the neural network grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index a631bfc..d6c2298 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -54,9 +54,6 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): v_s = [] values[s] = max([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) - print([P_a[s, s1, :] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) - print([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in - 
range(N_ACTIONS)]) if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: break From 328f2d7b4bf277f187ce5f46ad9645625bacbf8b Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 13:56:18 -0800 Subject: [PATCH 05/10] Debug gradients --- deep_maxent_irl.py | 91 ++++++++++++++++++++++++++++++++---------- mdp/value_iteration.py | 4 +- 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 95f533e..4df3960 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -158,7 +158,20 @@ def _svf(self, policy): mu.append(cur_mu) mu = tf.stack(mu) - return tf.reduce_sum(mu, axis=0) + mu = tf.reduce_sum(mu, axis=0) + if self.deterministic: + # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly + # I noticed that if it is not scaled by T then the recovered reward and the resulting value function + # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a + # difference in the value of states (i.e. if only the last few digits after the comma differ). + # One intuition why scaling by T is useful is to stabilize the gradients and avoid that the gradients + # are getting too high + # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well + # As a side note: This is not mentioned somewhere in the pulications, but for me this countermeasure works + # pretty well (on the other hand using a deterministic policy is anyways never meantioned in one of the + # publications, they always describe optimizing with stochastic policies) + mu /= self.T + return mu def get_theta(self): @@ -248,6 +261,19 @@ def step(t, start, end): p = np.sum(mu, 1) + if deterministic: + # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly + # I noticed that if it is not scaled by T then the recovered reward and the resulting value function + # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a + # difference in the value of states (i.e. if only the last few digits after the comma differ). + # One intuition why scaling by T is useful is to stabilize the gradients and avoid that the gradients + # are getting too high + # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well + # As a side note: This is not mentioned somewhere in the pulications, but for me this countermeasure works + # pretty well (on the other hand using a deterministic policy is anyways never meantioned in one of the + # publications, they always describe optimizing with stochastic policies) + p /= T + print(time.time() - tt) return p @@ -301,6 +327,18 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru range(N_STATES)]) p = np.sum(mu, 1) + if deterministic: + # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly + # I noticed that if it is not scaled by T then the recovered reward and the resulting value function + # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a + # difference in the value of states (i.e. if only the last few digits after the comma differ). 
+ # One intuition why scaling by T is useful is to stabilize the gradients and avoid that the gradients + # are getting too high + # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well + # As a side note: This is not mentioned somewhere in the pulications, but for me this countermeasure works + # pretty well (on the other hand using a deterministic policy is anyways never meantioned in one of the + # publications, they always describe optimizing with stochastic policies) + p /= T return p @@ -327,7 +365,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) @@ -339,6 +377,8 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): indices = np.argwhere(mask) P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) + grads = list() + # training for iteration in range(n_iters): if iteration % (n_iters/10) == 0: @@ -348,7 +388,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) # compute rewards and policy at the same time #t = time.time() @@ -356,19 +396,40 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=False) - assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=False) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) + + + # compute gradients on rewards: + grad_r = mu_D - mu_exp + grads.append(grad_r) + + # apply gradients to the neural network + grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) + + + print(np.mean(grads, axis=0)) + print(np.std(grads, axis=0)) + + + rewards = nn_r.get_rewards(feat_map) + # return sigmoid(normalize(rewards)) + return normalize(rewards) + +def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic): + assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) + assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) assert (np.abs(assert_values - assert_values2) < 0.0001).all() assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) 
< 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() - print 'iteration: {}'.format(iteration) print(assert_policy) print(assert_policy_old) @@ -380,19 +441,9 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert (np.abs(policy - assert_policy) < 0.0001).all() assert (np.abs(policy - assert_policy_old) < 0.0001).all() - assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() - assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() - - # compute gradients on rewards: - grad_r = mu_D - mu_exp - - # apply gradients to the neural network - grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) - - - rewards = nn_r.get_rewards(feat_map) - # return sigmoid(normalize(rewards)) - return normalize(rewards) + assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() + assert ( + np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index d6c2298..e090b35 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -139,7 +139,9 @@ def step(start, end): print('VI', count) break - expected_value = rewards_expanded + gamma * values_tmp + expected_value = rewards_expanded + gamma * values + print(expected_value) + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) expected_value = np.transpose(expected_value, (0, 2, 1)) From 7f98f664424374cb86a7cdb1f3882fff4cccfae0 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 14:48:27 -0800 Subject: [PATCH 06/10] Debug gradients --- deep_maxent_irl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 4df3960..50c40e0 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -327,6 +327,7 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru range(N_STATES)]) p = np.sum(mu, 1) + print('SUM SVF', p.sum()) if deterministic: # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly # I noticed that if it is not scaled by T then the recovered reward and the resulting value function @@ -365,7 +366,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) @@ -388,7 +389,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) # compute rewards and policy at the same time #t = time.time() @@ -396,11 +397,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) + #mu_exp = 
compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) # compute gradients on rewards: @@ -411,9 +412,8 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) - print(np.mean(grads, axis=0)) - print(np.std(grads, axis=0)) - + print('grad mean', np.mean(grads, axis=0)) + print('grad std', np.std(grads, axis=0)) rewards = nn_r.get_rewards(feat_map) # return sigmoid(normalize(rewards)) From 206375192113a8f711fc7c72be13162b548007e8 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 14:57:16 -0800 Subject: [PATCH 07/10] Remove tensorflow prints --- deep_maxent_irl.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 50c40e0..1a500c8 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -72,28 +72,21 @@ def _build_network(self, name): def _vi(self, rewards): - rewards = tf.Print(rewards, [rewards], 'rewards: ', summarize=500) rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) - P_a = tf.Print(self.P_a, [self.P_a], 'P_a: ', summarize=500) def body(i, c, t): old_values = t.read(i) if self.sparse: new_values = tf.sparse_reduce_max( tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) else: - old_values = tf.Print(old_values, [old_values], 'old_values', summarize=500) expected_value = rewards_expanded + self.gamma * old_values - expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) - expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) - mm = P_a * expected_value - mm = tf.Print(mm, [mm], 'mm', summarize=500) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + mm = self.P_a * expected_value ss = tf.reduce_sum(mm, axis=2) - ss = tf.Print(ss, [ss], 'ss', summarize=500) new_values = tf.reduce_max(ss, axis=1) - new_values = tf.Print(new_values, [new_values], 'new_values', summarize=500) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon c.set_shape(()) @@ -108,11 +101,10 @@ def condition(i, c, t): i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop') - values = values.read(tf.Print(i, [i], 'i: ')) + values = values.read(i) expected_value = rewards_expanded + self.gamma * values - expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) - expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) if self.deterministic: if self.sparse: @@ -366,7 +358,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = 
demo_svf(trajs, N_STATES) @@ -389,7 +381,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) # compute rewards and policy at the same time #t = time.time() @@ -397,11 +389,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) # compute gradients on rewards: @@ -444,6 +436,8 @@ def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() assert ( np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() + + print('tf sum SVF', mu_exp.sum()) From 205cf45411166833008cda3510d91e5832a46c4c Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 14:58:58 -0800 Subject: [PATCH 08/10] Remove some python prints --- deep_maxent_irl.py | 3 +-- deep_maxent_irl_gridworld.py | 5 ----- mdp/value_iteration.py | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 1a500c8..e849b51 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -393,8 +393,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) - + # assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 44108e8..f867329 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -99,11 +99,6 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - mu = np.zeros([N_STATES]) - - for traj in trajs: - mu[traj[0].cur_state] += 1 - mu = mu / len(trajs) print 'Deep Max Ent IRL training ..' 
t = time.time() diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index e090b35..718957a 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -140,8 +140,6 @@ def step(start, end): break expected_value = rewards_expanded + gamma * values - print(expected_value) - expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) expected_value = np.transpose(expected_value, (0, 2, 1)) From 521d15476e82e301eedf655c92b02350c32509a0 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 15:16:24 -0800 Subject: [PATCH 09/10] Remove sparse tensor support --- deep_maxent_irl.py | 50 ++++++++++-------------------------- deep_maxent_irl_gridworld.py | 3 +-- 2 files changed, 15 insertions(+), 38 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index e849b51..8d369d4 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -16,23 +16,19 @@ class DeepIRLFC: - def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, name='deep_irl_fc'): + def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, name='deep_irl_fc'): self.n_input = n_input self.lr = lr self.n_h1 = n_h1 self.n_h2 = n_h2 self.name = name - self.sparse = sparse self.deterministic = deterministic self.sess = tf.Session() self.input_s, self.reward, self.theta = self._build_network(self.name) # value iteration - if sparse: - self.P_a = tf.sparse_placeholder(tf.float32, shape=(n_input, n_actions, n_input)) - else: - self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input)) + self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input)) self.gamma = tf.placeholder(tf.float32) self.epsilon = tf.placeholder(tf.float32) self.values, self.policy = self._vi(self.reward) @@ -76,21 +72,16 @@ def _vi(self, rewards): def body(i, c, t): old_values = t.read(i) - if self.sparse: - new_values = tf.sparse_reduce_max( - tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) - else: - expected_value = rewards_expanded + self.gamma * old_values - expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) - mm = self.P_a * expected_value - ss = tf.reduce_sum(mm, axis=2) - new_values = tf.reduce_max(ss, axis=1) + expected_value = rewards_expanded + self.gamma * old_values + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + new_values = tf.reduce_max(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) + t = t.write(i + 1, new_values) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon c.set_shape(()) - t = t.write(i + 1, new_values) + return i + 1, c, t def condition(i, c, t): @@ -107,18 +98,9 @@ def condition(i, c, t): expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) if self.deterministic: - if self.sparse: - policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1) - else: - - policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) + policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) else: - if self.sparse: - policy = tf.sparse_tensor_to_dense( - tf.sparse_reduce_sum_sparse(self.P_a * expected_value, axis=2)) - else: - policy = tf.reduce_sum(self.P_a * expected_value, axis=2) - + policy = tf.reduce_sum(self.P_a * expected_value, axis=2) policy = tf.nn.softmax(policy) return values, 
policy @@ -335,7 +317,7 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru return p -def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): +def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters): """ Maximum Entropy Inverse Reinforcement Learning (Maxent IRL) @@ -358,17 +340,13 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) p_start_state = start_state_probs(trajs, N_STATES) P_a_t = P_a.transpose(0, 2, 1) - if sparse: - mask = P_a_t > 0 - indices = np.argwhere(mask) - P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) grads = list() @@ -381,7 +359,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) # compute rewards and policy at the same time #t = time.time() @@ -389,11 +367,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - # assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index f867329..413ed3f 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -27,7 +27,6 @@ PARSER.set_defaults(rand_start=True) PARSER.add_argument('-lr', '--learning_rate', default=0.02, type=float, help='learning rate') PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations') -PARSER.add_argument('-s', '--sparse', default=False, action='store_true', help='flag to use sparse tensors in tf') ARGS = PARSER.parse_args() print ARGS @@ -102,7 +101,7 @@ def main(): print 'Deep Max Ent IRL training ..' 
t = time.time() - rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.sparse) + rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS) print('time for dirl', time.time() - t) values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True) From 3e77ea8ea4b0fd7a3fef4e066b2a968f864a9407 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 15:20:34 -0800 Subject: [PATCH 10/10] no assert --- deep_maxent_irl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 8d369d4..9f0946a 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -371,7 +371,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters): rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) + #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) # compute gradients on rewards: grad_r = mu_D - mu_exp
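---

Note (not part of the patches above): the series repeatedly cross-checks the TensorFlow value-iteration graph against NumPy implementations before removing the debug prints and asserts. The sketch below is a minimal, illustrative NumPy reference for that check, assuming the (state, action, next_state) transition layout used via P_a_t = P_a.transpose(0, 2, 1) in the patches; the function names and the einsum-based propagation are illustrative, not the repository's exact code.

import numpy as np

def softmax(x, axis=-1):
    # row-wise softmax over actions, numerically stabilized
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def value_iteration(P, rewards, gamma, eps=1e-6):
    """P: (S, A, S) transition probabilities, rewards: (S,) reward of the current state."""
    n_states, n_actions, _ = P.shape
    values = np.zeros(n_states)
    while True:
        # Q[s, a] = sum_{s'} P[s, a, s'] * (r(s) + gamma * V(s'))
        q = (P * (rewards[:, None, None] + gamma * values[None, None, :])).sum(axis=2)
        new_values = q.max(axis=1)
        converged = np.abs(new_values - values).max() < eps
        values = new_values
        if converged:
            break
    q = (P * (rewards[:, None, None] + gamma * values[None, None, :])).sum(axis=2)
    return values, softmax(q)  # stochastic policy pi(a|s)

def expected_svf(P, policy, p_start, T):
    """Propagate the start-state distribution under the stochastic policy for T steps."""
    mu = np.zeros((T, P.shape[0]))
    mu[0] = p_start
    for t in range(T - 1):
        # mu[t+1][s'] = sum_{s, a} mu[t][s] * pi(a|s) * P[s, a, s']
        mu[t + 1] = np.einsum('s,sa,sak->k', mu[t], policy, P)
    # (the patches additionally divide by T in the deterministic-policy case
    # as a gradient-stabilization heuristic; omitted here)
    return mu.sum(axis=0)

def expected_value_diff(p_start, optimal_values, policy_values):
    # EVD = V*(start) - V^pi(start), averaged over the start-state distribution
    return optimal_values.dot(p_start) - policy_values.dot(p_start)

In the series, such NumPy paths serve only as assertions that the TensorFlow graph's values, policy, and expected SVF (mu_exp) agree within a small tolerance (assert_all_the_stuff in patch 05); once the graph is trusted, the asserts are commented out in patch 10.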