diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 5f23563..9f0946a 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -16,23 +16,19 @@ class DeepIRLFC:
-  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, name='deep_irl_fc'):
+  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, name='deep_irl_fc'):
     self.n_input = n_input
     self.lr = lr
     self.n_h1 = n_h1
     self.n_h2 = n_h2
     self.name = name
-    self.sparse = sparse
     self.deterministic = deterministic
 
     self.sess = tf.Session()
     self.input_s, self.reward, self.theta = self._build_network(self.name)
 
     # value iteration
-    if sparse:
-      self.P_a = tf.sparse_placeholder(tf.float32, shape=(n_input, n_actions, n_input))
-    else:
-      self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
+    self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
     self.gamma = tf.placeholder(tf.float32)
     self.epsilon = tf.placeholder(tf.float32)
     self.values, self.policy = self._vi(self.reward)
@@ -68,23 +64,24 @@ def _build_network(self, name):
                            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
     reward = tf_utils.fc(fc2, 1, scope="reward")
     theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
-    return input_s, reward, theta
+    return input_s, tf.squeeze(reward), theta
 
   def _vi(self, rewards):
-    rewards = tf.squeeze(rewards)
+    rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input])
 
     def body(i, c, t):
       old_values = t.read(i)
-      if self.sparse:
-        new_values = tf.sparse_reduce_max(
-          tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
-      else:
-        new_values = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
+
+      expected_value = rewards_expanded + self.gamma * old_values
+      expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
+
+      new_values = tf.reduce_max(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
+
+      t = t.write(i + 1, new_values)
       c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
       c.set_shape(())
-      t = t.write(i + 1, new_values)
+
       return i + 1, c, t
 
     def condition(i, c, t):
@@ -97,18 +94,13 @@ def condition(i, c, t):
                                     name='VI_loop')
 
     values = values.read(i)
+    expected_value = rewards_expanded + self.gamma * values
+    expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
+
     if self.deterministic:
-      if self.sparse:
-        policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1)
-      else:
-        policy = tf.argmax(tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2), axis=1)
+      policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
     else:
-      if self.sparse:
-        policy = tf.sparse_tensor_to_dense(
-          tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2))
-      else:
-        policy = tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2)
-
+      policy = tf.reduce_sum(self.P_a * expected_value, axis=2)
       policy = tf.nn.softmax(policy)
 
     return values, policy
@@ -140,7 +132,20 @@ def _svf(self, policy):
       mu.append(cur_mu)
 
     mu = tf.stack(mu)
-    return tf.reduce_sum(mu, axis=0)
+    mu = tf.reduce_sum(mu, axis=0)
+    if self.deterministic:
+      # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly
+      # I noticed that if it is not scaled by T then the recovered reward and the resulting value function
+      # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a
+      # difference in the value of states (i.e. only the last few digits after the decimal point differ).
+      # One intuition for why scaling by T is useful is that it stabilizes the gradients and prevents them
+      # from getting too large.
+      # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well
+      # As a side note: this is not mentioned anywhere in the publications, but for me this countermeasure works
+      # pretty well (on the other hand, using a deterministic policy is never mentioned in any of the
+      # publications; they always describe optimizing with stochastic policies)
+      mu /= self.T
+    return mu
 
   def get_theta(self):
@@ -230,6 +235,19 @@ def step(t, start, end):
   p = np.sum(mu, 1)
+  if deterministic:
+    # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly
+    # I noticed that if it is not scaled by T then the recovered reward and the resulting value function
+    # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a
+    # difference in the value of states (i.e. only the last few digits after the decimal point differ).
+    # One intuition for why scaling by T is useful is that it stabilizes the gradients and prevents them
+    # from getting too large.
+    # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well
+    # As a side note: this is not mentioned anywhere in the publications, but for me this countermeasure works
+    # pretty well (on the other hand, using a deterministic policy is never mentioned in any of the
+    # publications; they always describe optimizing with stochastic policies)
+    p /= T
 
   print(time.time() - tt)
   return p
 
@@ -283,10 +301,23 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
                     range(N_STATES)])
   p = np.sum(mu, 1)
+  print('SUM SVF', p.sum())
+  if deterministic:
+    # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly
+    # I noticed that if it is not scaled by T then the recovered reward and the resulting value function
+    # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a
+    # difference in the value of states (i.e. only the last few digits after the decimal point differ).
+    # One intuition for why scaling by T is useful is that it stabilizes the gradients and prevents them
+    # from getting too large.
+    # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well
+    # As a side note: this is not mentioned anywhere in the publications, but for me this countermeasure works
+    # pretty well (on the other hand, using a deterministic policy is never mentioned in any of the
+    # publications; they always describe optimizing with stochastic policies)
+    p /= T
   return p
 
 
-def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
+def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   """
   Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)
 
@@ -309,17 +340,15 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   N_STATES, _, N_ACTIONS = np.shape(P_a)
 
   # init nn model
-  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse)
+  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False)
 
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
   p_start_state = start_state_probs(trajs, N_STATES)
 
   P_a_t = P_a.transpose(0, 2, 1)
-  if sparse:
-    mask = P_a_t > 0
-    indices = np.argwhere(mask)
-    P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+
+  grads = list()
 
   # training
   for iteration in range(n_iters):
@@ -327,10 +356,10 @@
     print 'iteration: {}'.format(iteration)
 
     # compute the reward matrix
-    rewards = nn_r.get_rewards(feat_map)
+    # rewards = nn_r.get_rewards(feat_map)
 
     # compute policy
-    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
+    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)
 
     # compute rewards and policy at the same time
     #t = time.time()
@@ -338,33 +367,54 @@
     #print('tensorflow VI', time.time() - t)
 
     # compute expected svf
-    #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)
+    #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)
 
     rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001)
+
+    #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False)
+
     # compute gradients on rewards:
     grad_r = mu_D - mu_exp
+    grads.append(grad_r)
 
-    assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=True)
-    assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=True)
+    # apply gradients to the neural network
+    grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)
+
+
+  print('grad mean', np.mean(grads, axis=0))
+  print('grad std', np.std(grads, axis=0))
+
+  rewards = nn_r.get_rewards(feat_map)
+  # return sigmoid(normalize(rewards))
+  return normalize(rewards)
+def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic):
+  assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
+                                                                 deterministic=deterministic)
+  assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
+                                                                             deterministic=deterministic)
+  assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001)
+
+  assert (np.abs(assert_values - assert_values2) < 0.0001).all()
   assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
   assert (np.abs(values - assert_values) < 0.0001).all()
   assert (np.abs(values - assert_values_old) < 0.0001).all()
+  print(assert_policy)
+  print(assert_policy_old)
+  print(policy)
+  print(values)
+  print(assert_values)
+  print(rewards)
   assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
-  assert (np.abs(policy - assert_policy) < 0.001).all()
-  assert (np.abs(policy - assert_policy_old) < 0.001).all()
-
-  assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all()
-  assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all()
+  assert (np.abs(policy - assert_policy) < 0.0001).all()
+  assert (np.abs(policy - assert_policy_old) < 0.0001).all()
 
-  # apply gradients to the neural network
-  grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)
+  assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
+  assert (
+    np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
 
-
-  rewards = nn_r.get_rewards(feat_map)
-  # return sigmoid(normalize(rewards))
-  return normalize(rewards)
+  print('tf sum SVF', mu_exp.sum())
diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index e1c62e6..413ed3f 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -27,7 +27,6 @@ PARSER.set_defaults(rand_start=True)
 PARSER.add_argument('-lr', '--learning_rate', default=0.02, type=float, help='learning rate')
 PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations')
-PARSER.add_argument('-s', '--sparse', default=False, action='store_true', help='flag to use sparse tensors in tf')
 ARGS = PARSER.parse_args()
 print ARGS
 
@@ -99,20 +98,15 @@ def main():
 
   trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
 
-  mu = np.zeros([N_STATES])
-
-  for traj in trajs:
-    mu[traj[0].cur_state] += 1
-  mu = mu / len(trajs)
 
   print 'Deep Max Ent IRL training ..'
   t = time.time()
-  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.sparse)
+  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
  print('time for dirl', time.time() - t)
 
   values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
-  print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy))
+  #print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy))
 
   # plots
   plt.figure(figsize=(20,4))
diff --git a/maxent_irl.py b/maxent_irl.py
index e528006..20559e5 100644
--- a/maxent_irl.py
+++ b/maxent_irl.py
@@ -104,39 +104,3 @@ def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   rewards = np.dot(feat_map, theta)
   # return sigmoid(normalize(rewards))
   return normalize(rewards)
-
-def value(policy, n_states, transition_probabilities, reward, discount,
-          threshold=1e-2):
-  """
-  FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10
-
-  Find the value function associated with a policy.
-
-  policy: List of action ints for each state.
-  n_states: Number of states. int.
-  transition_probabilities: Function taking (state, action, state) to
-    transition probabilities.
-  reward: Vector of rewards for each state.
-  discount: MDP discount factor. float.
-  threshold: Convergence threshold, default 1e-2. float.
-  -> Array of values for each state
-  """
-  v = np.zeros(n_states)
-
-  diff = float("inf")
-  while diff > threshold:
-    diff = 0
-    for s in range(n_states):
-      vs = v[s]
-      a = policy[s]
-      v[s] = sum(transition_probabilities[s, a, k] *
-                 (reward[k] + discount * v[k])
-                 for k in range(n_states))
-      diff = max(diff, abs(vs - v[s]))
-
-  return v
-
-def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True):
-  v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma)
-
-  return optimal_value.dot(p_start) - v.dot(p_start)
diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py
index 5a42296..718957a 100644
--- a/mdp/value_iteration.py
+++ b/mdp/value_iteration.py
@@ -17,8 +17,12 @@ def softmax(x):
   """Compute softmax values for each sets of scores in x."""
-  e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis])
-  return e_x / e_x.sum(axis=-1)[:, np.newaxis]
+  if len(x.shape) > 1:
+    e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis])
+    return e_x / e_x.sum(axis=-1)[:, np.newaxis]
+  else:
+    e_x = np.exp(x - np.max(x))
+    return e_x / e_x.sum()
 
 
 def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True):
   """
@@ -106,6 +110,7 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True):
   if chunk_size == 0:
     chunk_size = N_STATES
+  rewards_expanded = rewards[:, np.newaxis].repeat(N_STATES, axis=1)
 
   count = 0
   # estimate values
   while True:
@@ -113,10 +118,10 @@
     values_tmp = values.copy()
 
     def step(start, end):
-      tmp = rewards[start:end, np.newaxis].repeat(N_STATES, axis=1) + gamma * values_tmp
-      tmp = tmp[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
-      tmp = np.transpose(tmp, (0, 2, 1))
-      values[start:end] = (P[start:end, :, :] * tmp).sum(axis=2).max(axis=1)
+      expected_value = rewards_expanded[start:end, :] + gamma * values_tmp
+      expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+      expected_value = np.transpose(expected_value, (0, 2, 1))
+      values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1)
 
     with ThreadPoolExecutor(max_workers=num_cpus) as e:
       futures = list()
@@ -134,17 +139,20 @@ def step(start, end):
       print('VI', count)
       break
 
+  expected_value = rewards_expanded + gamma * values
+  expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+  expected_value = np.transpose(expected_value, (0, 2, 1))
+
+
   if deterministic:
     # generate deterministic policy
-    policy = np.argmax((P * (rewards + gamma * values_tmp)).sum(axis=2), axis=1)
-
+    policy = np.argmax((P * expected_value).sum(axis=2), axis=1)
     print(time.time() - t)
 
     return values, policy
   else:
-    # generate stochastic policy
-    policy = (P * (rewards + gamma * values_tmp)).sum(axis=2)
+    policy = (P * expected_value).sum(axis=2)
     policy = softmax(policy)
 
     print(time.time() - t)
@@ -347,8 +355,75 @@ def get_action(self, state):
     return actions[a_id]
 
+def optimal_value(n_states, n_actions, transition_probabilities, reward,
+                  discount, threshold=1e-2):
+  """
+  FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py
+  Find the optimal value function.
+  n_states: Number of states. int.
+  n_actions: Number of actions. int.
+  transition_probabilities: Function taking (state, action, state) to
+    transition probabilities.
+  reward: Vector of rewards for each state.
+  discount: MDP discount factor. float.
+  threshold: Convergence threshold, default 1e-2. float.
+  -> Array of values for each state
+  """
+
+  v = np.zeros(n_states)
+  new_v = np.zeros(n_states)
+
+  diff = float("inf")
+  while diff > threshold:
+    diff = 0
+    v = new_v.copy()
+    for s in range(n_states):
+      max_v = float("-inf")
+      for a in range(n_actions):
+        tp = transition_probabilities[s, a, :]
+        max_v = max(max_v, np.dot(tp, reward[s] + discount * v))
+        max_v = max(max_v, np.sum(tp * (reward[s] + discount*v)))
+
+      new_diff = abs(v[s] - max_v)
+      if new_diff > diff:
+        diff = new_diff
+      new_v[s] = max_v
+
+  return v
+def value(policy, n_states, transition_probabilities, reward, discount,
+          threshold=1e-2):
+  """
+  FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10
+
+  Find the value function associated with a policy.
+
+  policy: List of action ints for each state.
+  n_states: Number of states. int.
+  transition_probabilities: Function taking (state, action, state) to
+    transition probabilities.
+  reward: Vector of rewards for each state.
+  discount: MDP discount factor. float.
+  threshold: Convergence threshold, default 1e-2. float.
+  -> Array of values for each state
+  """
+  v = np.zeros(n_states)
+
+  diff = float("inf")
+  while diff > threshold:
+    diff = 0
+    for s in range(n_states):
+      vs = v[s]
+      a = policy[s]
+      v[s] = sum(transition_probabilities[s, a, k] *
+                 (reward[k] + discount * v[k])
+                 for k in range(n_states))
+      diff = max(diff, abs(vs - v[s]))
+  return v
+def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True):
+  v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma)
+  return optimal_value.dot(p_start) - v.dot(p_start)
\ No newline at end of file
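For reference when reviewing the value-iteration changes above: the tiled expected_value tensors in _vi and the rewards_expanded array in value_iteration both implement the backup V(s) = max_a sum_s' P[s, a, s'] * (R(s) + gamma * V(s')), with the reward indexed by the current state and the transition matrix fed in (state, action, next_state) order (P_a_t = P_a.transpose(0, 2, 1)). The plain-NumPy sketch below is illustrative only and not part of the patch (the name bellman_backup_sketch is invented here); it mirrors that backup plus the argmax/softmax policy extraction, which is essentially the cross-check assert_all_the_stuff performs against value_iteration.optimal_value.

import numpy as np


def bellman_backup_sketch(P, rewards, gamma, epsilon=1e-6, deterministic=False):
  """Illustrative sketch only. P: (S, A, S) with P[s, a, s_next]; rewards: (S,) indexed by the current state."""
  n_states = P.shape[0]
  values = np.zeros(n_states)
  while True:
    # expected_value[s, s_next] = R(s) + gamma * V(s_next) -- the quantity the patch builds
    # by tiling rewards_expanded and broadcasting the old values over next states.
    expected_value = rewards[:, None] + gamma * values[None, :]
    # Q(s, a) = sum_s' P[s, a, s'] * expected_value[s, s']
    q = np.einsum('sat,st->sa', P, expected_value)
    new_values = q.max(axis=1)
    converged = np.max(np.abs(new_values - values)) <= epsilon
    values = new_values
    if converged:
      break

  expected_value = rewards[:, None] + gamma * values[None, :]
  q = np.einsum('sat,st->sa', P, expected_value)
  if deterministic:
    policy = q.argmax(axis=1)                   # mirrors the tf.argmax branch
  else:
    e = np.exp(q - q.max(axis=1, keepdims=True))
    policy = e / e.sum(axis=1, keepdims=True)   # row-wise softmax, as in mdp/value_iteration.softmax
  return values, policy

The T-scaling discussed in the NOTE comments (mu /= self.T, p /= T) is applied afterwards, to the visitation frequencies propagated from the start-state distribution, not inside this backup.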