From b0f887bc7d50c33330209b851dc70927ba3840dd Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 16:48:48 -0800 Subject: [PATCH 01/14] Numpy VI deterministic model --- deep_maxent_irl.py | 5 +- deep_maxent_irl_gridworld.py | 10 +++- mdp/value_iteration.py | 112 ++++++++++++++++++++++++++--------- 3 files changed, 95 insertions(+), 32 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 6888538..243e2d7 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -324,7 +324,10 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru returns: p Nx1 vector - state visitation frequencies """ - N_STATES, _, N_ACTIONS = np.shape(P_a) + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) T = len(trajs[0]) # mu[s, t] is the prob of visiting state s at time t diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 0420b75..e35f257 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -93,8 +93,14 @@ def main(): rewards_gt = np.reshape(rmap_gt, H*W, order='F') P_a = gw.get_transition_mat() - values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) - + if ACT_RAND == 0: + print(P_a.transpose(0, 2, 1)) + P_an = np.argmax(P_a.transpose(0, 2, 1), axis=-1) + print(P_an) + + values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) + values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=False) + # use identity matrix as feature #feat_map = np.eye(N_STATES) feat_map = np.zeros(N_STATES).reshape((H, W)) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 30ac6e6..6e5fed2 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -42,7 +42,11 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): values Nx1 matrix - estimated values policy Nx1 (NxN_ACTIONS if non-det) matrix - policy """ - N_STATES, _, N_ACTIONS = np.shape(P_a) + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) + values = np.zeros([N_STATES]) @@ -51,9 +55,10 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): values_tmp = values.copy() for s in range(N_STATES): - v_s = [] - values[s] = max([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in - range(N_ACTIONS)]) + if len(P_a.shape) == 3: + values[s] = max([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + values[s] = max([rewards[P_a[s, a]] + gamma * values_tmp[P_a[s, a]] for a in range(N_ACTIONS)]) if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: break @@ -62,17 +67,25 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): # generate deterministic policy policy = np.zeros([N_STATES]) for s in range(N_STATES): - policy[s] = np.argmax([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) + if len(P_a.shape) == 3: + policy[s] = np.argmax([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + policy[s] = np.argmax([rewards[P_a[s, a]] + gamma * values[P_a[s, a]] for a in range(N_ACTIONS)]) + return values, policy else: # generate stochastic policy policy = np.zeros([N_STATES, N_ACTIONS]) 
for s in range(N_STATES): - v_s = np.array( - [sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + if len(P_a.shape) == 3: + v_s = np.array( + [sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + v_s = np.array([rewards[P_a[s, a]] + gamma * values[P_a[s, a]] for a in range(N_ACTIONS)]) + policy[s, :] = softmax(v_s).squeeze() @@ -97,13 +110,19 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): values Nx1 matrix - estimated values policy Nx1 (NxN_ACTIONS if non-det) matrix - policy """ - N_STATES, _, N_ACTIONS = np.shape(P_a) + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) values = np.zeros([N_STATES]) t = time.time() rewards = rewards.squeeze() - P = P_a.transpose(0, 2, 1) + if len(P_a.shape) == 3: + P = P_a.transpose(0, 2, 1) + else: + P = P_a num_cpus = multiprocessing.cpu_count() chunk_size = N_STATES // num_cpus @@ -117,12 +136,20 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): count += 1 values_tmp = values.copy() - def step(start, end): - expected_value = rewards + gamma * values_tmp - #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) - #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - #expected_value = np.transpose(expected_value, (0, 2, 1)) - values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1) + if len(P.shape) == 3: + def step(start, end): + expected_value = rewards + gamma * values_tmp + #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) + #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + #expected_value = np.transpose(expected_value, (0, 2, 1)) + values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1) + else: + def step(start, end): + expected_value = rewards[P[start:end, :]] + gamma * values_tmp[P[start:end, :]] + #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) + #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + #expected_value = np.transpose(expected_value, (0, 2, 1)) + values[start:end] = expected_value.max(axis=1) with ThreadPoolExecutor(max_workers=num_cpus) as e: futures = list() @@ -140,21 +167,32 @@ def step(start, end): print('VI', count) break - expected_value = rewards + gamma * values_tmp - #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) - #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - #expected_value = np.transpose(expected_value, (0, 2, 1)) + + if len(P.shape) == 3: + expected_value = rewards + gamma * values_tmp + #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) + #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + #expected_value = np.transpose(expected_value, (0, 2, 1)) + else: + expected_value = rewards[P] + gamma * values_tmp[P] if deterministic: # generate deterministic policy - policy = np.argmax((P * expected_value).sum(axis=2), axis=1) + if len(P.shape) == 3: + policy = np.argmax((P * expected_value).sum(axis=2), axis=1) + else: + policy = np.argmax(expected_value, axis=1) print(time.time() - t) return values, policy else: # generate stochastic policy - policy = (P * expected_value).sum(axis=2) + if len(P.shape) == 3: + policy = (P * expected_value).sum(axis=2) + else: + policy = expected_value 
+ policy = softmax(policy) print(time.time() - t) @@ -195,29 +233,45 @@ def softmax(x): while True: values_tmp = values.copy() - for s in range(N_STATES): - v_s = [] - q = [sum([P_a[s, s1, a]*(rewards[s] + gamma*values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)] - values[s] = softmax(q) + if len(P_a.shape) == 3: + for s in range(N_STATES): + q = [sum([P_a[s, s1, a]*(rewards[s] + gamma*values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)] + values[s] = softmax(q) - if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: - break + if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: + break + else: + for s in range(N_STATES): + q = [sum([(rewards[P_a[:, a]] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in + range(N_ACTIONS)] + values[s] = softmax(q) + if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: + break if deterministic: # generate deterministic policy policy = np.zeros([N_STATES]) for s in range(N_STATES): - policy[s] = np.argmax([sum([P_a[s, s1, a]*(rewards[s]+gamma*values[s1]) + if len(P_a.shape) == 3: + policy[s] = np.argmax([sum([P_a[s, s1, a]*(rewards[s]+gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + policy[s] = np.argmax([sum([(rewards[P_a[:, a]] + gamma * values[s1]) + for s1 in range(N_STATES)]) + for a in range(N_ACTIONS)]) return values, policy else: # generate stochastic policy policy = np.zeros([N_STATES, N_ACTIONS]) for s in range(N_STATES): - v_s = np.asarray([sum([P_a[s, s1, a]*(rewards[s] + gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + if len(P_a.shape) == 3: + v_s = np.asarray([sum([P_a[s, s1, a]*(rewards[s] + gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + v_s = np.asarray([sum([(rewards[P_a[:, a]] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in + range(N_ACTIONS)]) policy[s, :] = np.exp(v_s.squeeze() - values[s]) return values, policy From bb12fe1e5135eec3fb78abff9c799c38f7200d7a Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 17:17:55 -0800 Subject: [PATCH 02/14] Numpy SVF deterministic model --- deep_maxent_irl.py | 17 +++++++++++++---- deep_maxent_irl_gridworld.py | 10 ++++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 243e2d7..6c45a8d 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -339,12 +339,21 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru for t in range(T - 1): for s in range(N_STATES): if deterministic: - mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)]) + if len(P_a.shape) == 3: + mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)]) + else: + mu[P_a[s, int(policy[s])], t + 1] += mu[s, t] else: - mu[s, t + 1] = sum( - [sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in - range(N_STATES)]) + if len(P_a.shape) == 3: + mu[s, t + 1] = sum( + [sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in + range(N_STATES)]) + else: + for a1 in range(N_ACTIONS): + mu[P_a[s, a1], t + 1] += mu[s, t] * policy[s, a1] + + print(mu) p = np.sum(mu, 1) print('SUM SVF', p.sum()) diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index e35f257..7ee3c0c 100644 --- a/deep_maxent_irl_gridworld.py +++ 
b/deep_maxent_irl_gridworld.py
@@ -98,8 +98,8 @@ def main():
     P_an = np.argmax(P_a.transpose(0, 2, 1), axis=-1)
     print(P_an)
 
-  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False)
-  values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=False)
+  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
+  values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=True)
 
   # use identity matrix as feature
   #feat_map = np.eye(N_STATES)
@@ -115,6 +115,12 @@ def main():
 
   trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
 
+  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False)
+
+  svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False)
+  svf2 = compute_state_visition_freq_old(P_an, GAMMA, trajs, policy_gt, False)
+
+
   print 'Deep Max Ent IRL training ..'
   t = time.time()

From 1e5fc1516cd9e046ef1c2936f5f21d1c486c9c41 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 1 Dec 2017 18:06:04 -0800
Subject: [PATCH 03/14] Deterministic policy svf deterministic model works now in numpy

---
 deep_maxent_irl.py           | 39 ++++++++++++++++++++++++++++--------
 deep_maxent_irl_gridworld.py |  8 ++++----
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 6c45a8d..6fc87d9 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -239,7 +239,10 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
     p Nx1 vector - state visitation frequencies
   """
   tt = time.time()
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
 
   T = len(trajs[0])
   # mu[s, t] is the prob of visiting state s at time t
@@ -253,16 +256,36 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
   if chunk_size == 0:
     chunk_size = N_STATES
 
-  if deterministic:
-    P_az = P_a[np.arange(0, N_STATES), :, policy]
-  else:
-    P_a = P_a.transpose(0, 2, 1)
-  def step(t, start, end):
+  if len(P_a.shape) == 3:
     if deterministic:
-      mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
+      P_az = P_a[np.arange(0, N_STATES), :, policy]
     else:
-      mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
+      P_a = P_a.transpose(0, 2, 1)
+  else:
+    if deterministic:
+      P_az = P_a[np.arange(N_STATES), policy]
+
+  if len(P_a.shape) == 3:
+    def step(t, start, end):
+      if deterministic:
+        mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
+      else:
+        mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
+  else:
+    def step(t, start, end):
+      print(t)
+      if deterministic:
+        # The following needs to be done using a ufunc
+        # https://stackoverflow.com/questions/41990028/add-multiple-values-to-one-numpy-array-index
+        # P_az[start:end] sometimes points to the same state for multiple values; with the usual fancy indexing only
+        # one addition (the latest) would be executed!
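+        # quick illustration (not part of this function): with a = np.zeros(3),
+        # a[[0, 0]] += 1 leaves a[0] == 1 because the duplicated index is buffered,
+        # whereas np.add.at(a, [0, 0], 1) accumulates both updates and yields a[0] == 2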
+ # https://stackoverflow.com/questions/15973827/handling-of-duplicate-indices-in-numpy-assignments + # mu[P_az[start:end], t + 1] += mu[start:end, t] + np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t]) + else: + mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0) + with ThreadPoolExecutor(max_workers=num_cpus) as e: for t in range(T - 1): diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 7ee3c0c..5515daf 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -8,7 +8,7 @@ from mdp import gridworld from mdp import value_iteration from deep_maxent_irl import * -from maxent_irl import * + from utils import * from lp_irl import * @@ -115,10 +115,10 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) + values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) - svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False) - svf2 = compute_state_visition_freq_old(P_an, GAMMA, trajs, policy_gt, False) + svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, True) + svf2 = compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt.astype(np.int32), True) From 69c97c2da8df94aa513b7777aef2d38bf8d9855c Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 18:21:48 -0800 Subject: [PATCH 04/14] Stochastic policy svf deterministic model works now in numpy --- deep_maxent_irl.py | 5 +++-- deep_maxent_irl_gridworld.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 6fc87d9..cb9ea24 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -274,7 +274,6 @@ def step(t, start, end): mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0) else: def step(t, start, end): - print(t) if deterministic: # The following needs be be done using ufunc # https://stackoverflow.com/questions/41990028/add-multiple-values-to-one-numpy-array-index @@ -284,7 +283,9 @@ def step(t, start, end): # mu[P_az[start:end], t + 1] += mu[start:end, t] np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t]) else: - mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0) + # mu[P_a[start:end, :], t + 1] += mu[start:end, t, np.newaxis] * policy[start:end, :] + val = mu[start:end, t, np.newaxis] * policy[start:end, :] + np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :]) with ThreadPoolExecutor(max_workers=num_cpus) as e: diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 5515daf..5a3642c 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -115,10 +115,10 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) + values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) - svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, True) - svf2 
= compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt.astype(np.int32), True) + svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False) + svf2 = compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt, False) From 19eee2b50040e84abd36ab43e577f3df6d2b9e34 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 18:37:57 -0800 Subject: [PATCH 05/14] cleanup --- deep_maxent_irl.py | 8 +++++--- deep_maxent_irl_gridworld.py | 12 +----------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index cb9ea24..fa66cba 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -284,7 +284,6 @@ def step(t, start, end): np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t]) else: # mu[P_a[start:end, :], t + 1] += mu[start:end, t, np.newaxis] * policy[start:end, :] - val = mu[start:end, t, np.newaxis] * policy[start:end, :] np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :]) @@ -413,8 +412,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): """ # tf.set_random_seed(1) - - N_STATES, _, N_ACTIONS = np.shape(P_a) + + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, conv=conv, sparse=sparse) diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 5a3642c..ed9fd98 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -94,12 +94,9 @@ def main(): P_a = gw.get_transition_mat() if ACT_RAND == 0: - print(P_a.transpose(0, 2, 1)) - P_an = np.argmax(P_a.transpose(0, 2, 1), axis=-1) - print(P_an) + P_a = np.argmax(P_a.transpose(0, 2, 1), axis=-1) values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) - values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=True) # use identity matrix as feature #feat_map = np.eye(N_STATES) @@ -115,13 +112,6 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) - - svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False) - svf2 = compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt, False) - - - print 'Deep Max Ent IRL training ..' 
t = time.time()
 
   rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.conv, ARGS.sparse)

From 6068586d251af75d5a415566229992d7451383ce Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 1 Dec 2017 18:43:55 -0800
Subject: [PATCH 06/14] Use numpy VI

---
 deep_maxent_irl_gridworld.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index ed9fd98..a5813e0 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -96,7 +96,7 @@ def main():
   if ACT_RAND == 0:
     P_a = np.argmax(P_a.transpose(0, 2, 1), axis=-1)
 
-  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
+  values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
 
   # use identity matrix as feature
   #feat_map = np.eye(N_STATES)

From 9e74b90a3565c65e8985a8b70797ea7511e13582 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 1 Dec 2017 19:09:47 -0800
Subject: [PATCH 07/14] Create deterministic gridworld transition matrix on the fly

---
 deep_maxent_irl_gridworld.py |  5 +++--
 mdp/gridworld.py             | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index a5813e0..114cbc0 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -91,10 +91,11 @@ def main():
   gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
 
   rewards_gt = np.reshape(rmap_gt, H*W, order='F')
-  P_a = gw.get_transition_mat()
 
   if ACT_RAND == 0:
-    P_a = np.argmax(P_a.transpose(0, 2, 1), axis=-1)
+    P_a = gw.get_transition_mat_deterministic()
+  else:
+    P_a = gw.get_transition_mat()
 
   values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
 
   # use identity matrix as feature
   #feat_map = np.eye(N_STATES)

diff --git a/mdp/gridworld.py b/mdp/gridworld.py
index e48fa80..40d7a22 100644
--- a/mdp/gridworld.py
+++ b/mdp/gridworld.py
@@ -325,6 +325,34 @@ def get_transition_mat(self):
           P_a[si, sj, a] = prob
     return P_a
 
+  def get_transition_mat_deterministic(self):
+    """
+    get the deterministic transition dynamics of the gridworld
+
+    return:
+      P_a     NxN_ACTIONS matrix -
+                P_a[s0, a] is the index of the state s1
+                reached deterministically when taking
+                action a at state s0
+    """
+    N_STATES = self.height*self.width
+    N_ACTIONS = len(self.actions)
+    P_a = np.zeros((N_STATES, N_ACTIONS), dtype=np.int32)
+    for si in range(N_STATES):
+      posi = self.idx2pos(si)
+      for a in range(N_ACTIONS):
+        probs = self.get_transition_states_and_probs(posi, a)
+
+        for posj, prob in probs:
+          sj = self.pos2idx(posj)
+          # Prob of si to sj given action a
+          prob = int(prob)
+          if prob == 1:
+            P_a[si, a] = sj
+          elif prob != 0:
+            raise ValueError('not a deterministic environment!')
+    return P_a
+
   def get_values_mat(self, values):
     """
     inputs:

From 87013d1d96963582ff957d56710dd65ea1132daa Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Sat, 2 Dec 2017 16:03:34 -0800
Subject: [PATCH 08/14] TF VI deterministic model

---
 deep_maxent_irl.py | 77 +++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 29 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index fa66cba..b9c10b2 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -16,7 +16,7 @@ class DeepIRLFC:
-  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
+  def __init__(self, n_input,
n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic_env=False, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'): if len(n_input) > 1: self.height, self.width = n_input self.n_input = self.height * self.width @@ -26,6 +26,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.n_h1 = n_h1 self.n_h2 = n_h2 self.name = name + self.deterministic_env = deterministic_env self.deterministic = deterministic self.sparse = sparse self.conv = conv @@ -35,16 +36,23 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.sess = tf.Session(config=config) self.input_s, self.reward, self.theta = self._build_network(self.name, conv) + if self.deterministic_env: + p_a_shape = (self.n_input, n_actions) + p_a_dtype = tf.int32 + else: + p_a_shape = (self.n_input, n_actions, self.n_input) + p_a_dtype = tf.float32 + # value iteration if sparse: - self.P_a = tf.sparse_placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input)) + self.P_a = tf.sparse_placeholder(p_a_dtype, shape=p_a_shape) self.reduce_max_sparse = tf.sparse_reduce_max_sparse self.reduce_sum_sparse = tf.sparse_reduce_sum_sparse self.reduce_max = tf.sparse_reduce_max self.reduce_sum = tf.sparse_reduce_sum self.sparse_transpose = tf.sparse_transpose else: - self.P_a = tf.placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input)) + self.P_a = tf.placeholder(p_a_dtype, shape=p_a_shape) self.reduce_max = tf.reduce_max self.reduce_max_sparse = tf.reduce_max self.reduce_sum = tf.reduce_sum @@ -59,7 +67,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.T = T self.mu = tf.placeholder(tf.float32, self.n_input, name='mu_placerholder') - self.svf = self._svf(self.policy) + #self.svf = self._svf(self.policy) self.optimizer = tf.train.GradientDescentOptimizer(lr) @@ -102,13 +110,18 @@ def _vi(self, rewards): rewards_expanded = rewards #tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) - def body(i, c, t): - old_values = t.read(i) + def vi_step(values): + if self.deterministic_env: + new_value = tf.gather(rewards_expanded, self.P_a) + self.gamma * tf.gather(values, self.P_a) + else: + new_value = self.reduce_sum_sparse(self.P_a * (rewards_expanded + self.gamma * values), axis=2) - expected_value = rewards_expanded + self.gamma * old_values - #expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + return new_value - new_values = self.reduce_max(self.reduce_sum_sparse(self.P_a * expected_value, axis=2), axis=1) + def body(i, c, t): + old_values = t.read(i) + new_values = vi_step(old_values) + new_values = self.reduce_max(new_values, axis=1) t = t.write(i + 1, new_values) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon @@ -125,15 +138,12 @@ def condition(i, c, t): i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop') values = values.read(i) - - expected_value = rewards_expanded + self.gamma * values - #expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + new_values = vi_step(values) if self.deterministic: - policy = tf.argmax(self.reduce_sum(self.P_a * expected_value, axis=2), axis=1) + policy = tf.argmax(new_values, axis=1) else: - policy = self.reduce_sum(self.P_a * expected_value, axis=2) - policy = tf.nn.softmax(policy) + policy = tf.nn.softmax(new_values) return values, policy @@ -192,6 +202,10 @@ def get_rewards(self, states): return rewards def 
get_policy(self, states, P_a, gamma, epsilon=0.01): + if self.conv: + states = np.expand_dims(np.expand_dims(states, axis=0), axis=-1) + else: + states = np.expand_dims(states, axis=0) return self.sess.run([self.reward, self.values, self.policy], feed_dict={self.input_s: states, self.P_a: P_a, self.gamma: gamma, self.epsilon: epsilon}) @@ -287,7 +301,7 @@ def step(t, start, end): np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :]) - with ThreadPoolExecutor(max_workers=num_cpus) as e: + with ThreadPoolExecutor(max_workers=1) as e: for t in range(T - 1): futures = list() for i in range(0, N_STATES, chunk_size): @@ -418,18 +432,23 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) + deterministic = True + # init nn model - nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, conv=conv, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) p_start_state = start_state_probs(trajs, N_STATES) - P_a_t = P_a.transpose(0, 2, 1) - if sparse: - mask = P_a_t > 0 - indices = np.argwhere(mask) - P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) + if len(P_a.shape) == 3: + P_a_t = P_a.transpose(0, 2, 1) + if sparse: + mask = P_a_t > 0 + indices = np.argwhere(mask) + P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) + else: + P_a_t = P_a grads = list() @@ -442,20 +461,20 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=deterministic) # compute rewards and policy at the same time #t = time.time() - #rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01) + rewards, values, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.000001) #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) + mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic) - rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) + #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp @@ -477,9 +496,9 @@ def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, deterministic=deterministic) assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=deterministic) - assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) + #assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) - assert (np.abs(assert_values - assert_values2) < 0.0001).all() + 
#assert (np.abs(assert_values - assert_values2) < 0.0001).all() assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) < 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() From 0b8258e11532e0b45133948a748ae674d2b60483 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Sat, 2 Dec 2017 16:16:03 -0800 Subject: [PATCH 09/14] Additional assertion tests --- deep_maxent_irl.py | 59 ++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index b9c10b2..0b6ea09 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -432,7 +432,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) - deterministic = True + deterministic = False # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) @@ -474,7 +474,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp @@ -491,27 +491,40 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): # return sigmoid(normalize(rewards)) return normalize(rewards) -def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic): - assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, - deterministic=deterministic) - assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, - deterministic=deterministic) - #assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) - - #assert (np.abs(assert_values - assert_values2) < 0.0001).all() - assert (np.abs(assert_values - assert_values_old) < 0.0001).all() - assert (np.abs(values - assert_values) < 0.0001).all() - assert (np.abs(values - assert_values_old) < 0.0001).all() - - print(assert_policy) - print(assert_policy_old) - print(policy) - print(values) - print(assert_values) - print(rewards) - assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all() - assert (np.abs(policy - assert_policy) < 0.0001).all() - assert (np.abs(policy - assert_policy_old) < 0.0001).all() +def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic): + + def assert_vi(P_a): + assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) + assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) + + if len(P_a) == 3: + assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) + + assert (np.abs(assert_values - assert_values2) < 0.0001).all() + + assert (np.abs(assert_values - assert_values_old) < 0.0001).all() + assert (np.abs(values - assert_values) < 0.0001).all() + assert 
(np.abs(values - assert_values_old) < 0.0001).all() + + # print(assert_policy) + # print(assert_policy_old) + # print(policy) + # print(values) + # print(assert_values) + # print(rewards) + assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all() + assert (np.abs(policy - assert_policy) < 0.0001).all() + assert (np.abs(policy - assert_policy_old) < 0.0001).all() + + assert_vi(P_a) + if len(P_a.shape) == 2: + print('creating full transistion matrix') + # construct full sparse transisiton matrix and make sure values are the same + P_a_t = np.zeros((N_STATES, N_ACTIONS, N_STATES)) + P_a_t[P_a] = 1 + assert_vi(P_a) assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() assert ( From 155ac2af825dfabc3d14fff088a2b56fc42fc51d Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Sat, 2 Dec 2017 16:41:09 -0800 Subject: [PATCH 10/14] EVD deterministic model dynamics --- deep_maxent_irl.py | 2 +- mdp/value_iteration.py | 48 +++++++++++++++++++++++++++++------------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 0b6ea09..a7a8b0b 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -474,7 +474,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 6e5fed2..2fb8bb8 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -448,8 +448,13 @@ def optimal_value(n_states, n_actions, transition_probabilities, reward, return v def value_parallel(policy, P_a, rewards, gamma, threshold=1e-2): + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) + deterministic = len(policy.shape) == 1 - N_STATES, _, N_ACTIONS = np.shape(P_a) + deterministic_env = len(P_a.shape) == 2 values = np.zeros([N_STATES]) @@ -458,25 +463,40 @@ def value_parallel(policy, P_a, rewards, gamma, threshold=1e-2): if chunk_size == 0: chunk_size = N_STATES - rewards_expanded = rewards[:, np.newaxis].repeat(N_STATES, axis=1) - - if deterministic: - P_az = P_a[np.arange(0, N_STATES), :, policy] + rewards_expanded = rewards + + if not deterministic_env: + if deterministic: + P_az = P_a[np.arange(0, N_STATES), :, policy] + else: + P_a = P_a.transpose(0, 2, 1) else: - P_a = P_a.transpose(0, 2, 1) + if deterministic: + P_az = P_a[np.arange(0, N_STATES), policy] + # estimate values while True: values_tmp = values.copy() def step(start, end): - expected_value = rewards_expanded[start:end, :] + gamma * values_tmp + if deterministic_env: + expected_value = rewards_expanded[P_az[start:end]] + gamma * values_tmp[P_az[start:end]] + else: + expected_value = rewards_expanded[start:end, :] + gamma * values_tmp + if deterministic: - values[start:end] = (P_az[start:end, :] * expected_value).sum(axis=1) + + if deterministic_env: + values[start:end] = expected_value + else: + values[start:end] = (P_az[start:end, :] * expected_value).sum(axis=1) else: - expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - #expected_value = 
np.transpose(expected_value, (0, 2, 1)) - values[start:end] = (P_a[start:end, :, :] * expected_value).sum(axis=2).sum(axis=1) + if deterministic_env: + values[start:end] = (policy * expected_value).sum(axis=1) + else: + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + values[start:end] = (P_a[start:end, :, :] * expected_value * policy).sum(axis=2).sum(axis=1) with ThreadPoolExecutor(max_workers=num_cpus) as e: futures = list() @@ -517,16 +537,14 @@ def value(policy, n_states, transition_probabilities, reward, discount, for s in range(n_states): vs = values_tmp[s] a = policy[s] - v[s] = sum(transition_probabilities[s, a, k] * - (reward[s] + discount * values_tmp[k]) - for k in range(n_states)) + v[s] = reward[transition_probabilities[s, a]] + discount * values_tmp[transition_probabilities[s, a]] diff = max(diff, abs(vs - v[s])) return v def expected_value_diff(P_a, true_rewards, gamma, p_start, optimal_value, policy): v = value_parallel(policy, P_a, true_rewards, gamma) - #v_old = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma) + #v_old = value(policy, P_a.shape[0], P_a, true_rewards, gamma) #if len(policy.shape) == 1: # assert (np.abs(v - v_old) < 0.001).all() From 10727f9ea93f2246d27f149cd8af31eefdfa457c Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Sat, 2 Dec 2017 17:16:27 -0800 Subject: [PATCH 11/14] First try: svf deterministic dynamics model TF --- deep_maxent_irl.py | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index a7a8b0b..74c3d13 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -67,7 +67,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.T = T self.mu = tf.placeholder(tf.float32, self.n_input, name='mu_placerholder') - #self.svf = self._svf(self.policy) + self.svf = self._svf(self.policy) self.optimizer = tf.train.GradientDescentOptimizer(lr) @@ -148,25 +148,39 @@ def condition(i, c, t): return values, policy def _svf(self, policy): - if self.deterministic: - r = tf.range(self.n_input, dtype=tf.int64) - expanded = tf.expand_dims(policy, 1) - tiled = tf.tile(expanded, [1, self.n_input]) - - grid = tf.meshgrid(r, r) - indices = tf.stack([grid[1], grid[0], tiled], axis=2) - - P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices) - P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0)) + if not self.deterministic_env: + if self.deterministic: + r = tf.range(self.n_input, dtype=tf.int64) + expanded = tf.expand_dims(policy, 1) + tiled = tf.tile(expanded, [1, self.n_input]) + + grid = tf.meshgrid(r, r) + indices = tf.stack([grid[1], grid[0], tiled], axis=2) + + P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices) + P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0)) + else: + P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2) else: - P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2) + if self.deterministic: + r = tf.range(self.n_input, dtype=tf.int64) + indices = tf.stack([r, policy], axis=1) + + P_a_cur_policy = tf.gather_nd(self.P_a, indices) + P_a_cur_policy = tf.Print(P_a_cur_policy, [P_a_cur_policy], 'P_a_cur_policy', summarize=500) + else: + P_a_cur_policy = self.P_a mu = list() mu.append(self.mu) with tf.variable_scope('svf'): if self.deterministic: for t in range(self.T - 1): - cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1) + if self.deterministic_env: + cur_mu = 
tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False) + cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, mu[t]) + else: + cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1) mu.append(cur_mu) else: for t in range(self.T - 1): @@ -432,7 +446,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) - deterministic = False + deterministic = True # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) @@ -469,12 +483,12 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic) - #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) + rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp From 2d9b584ada118c1fc926454171545499ecc58f8f Mon Sep 17 00:00:00 2001 From: Magnus Date: Sat, 2 Dec 2017 22:07:51 -0800 Subject: [PATCH 12/14] Fix TF svf --- deep_maxent_irl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 74c3d13..28f40de 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -167,7 +167,6 @@ def _svf(self, policy): indices = tf.stack([r, policy], axis=1) P_a_cur_policy = tf.gather_nd(self.P_a, indices) - P_a_cur_policy = tf.Print(P_a_cur_policy, [P_a_cur_policy], 'P_a_cur_policy', summarize=500) else: P_a_cur_policy = self.P_a @@ -177,7 +176,10 @@ def _svf(self, policy): if self.deterministic: for t in range(self.T - 1): if self.deterministic_env: + # TODO using a variable here seems a little hacky + # https://github.com/tensorflow/tensorflow/issues/2358 cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False) + cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,))) cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, mu[t]) else: cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1) From 09b02d9d0f992260c0c84da3a0bc736cc523365b Mon Sep 17 00:00:00 2001 From: Magnus Date: Sat, 2 Dec 2017 22:14:16 -0800 Subject: [PATCH 13/14] stochastic policy deterministic model TF SVF --- deep_maxent_irl.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 28f40de..70426c2 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -186,7 +186,12 @@ def _svf(self, policy): mu.append(cur_mu) else: for t in range(self.T - 1): - cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2), + if self.deterministic_env: + cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False) + cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,))) + cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, tf.expand_dims(mu[t], axis=1) * policy) + else: + cur_mu = 
self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2), [1, tf.shape(policy)[1], self.n_input]) * P_a_cur_policy, axis=1), axis=0) @@ -448,7 +453,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) - deterministic = True + deterministic = False # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) From 2db0a12033298a356f83e9e50dcfaa932403c5bb Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Mon, 4 Dec 2017 14:56:54 -0800 Subject: [PATCH 14/14] deeper convnet with 3x3 filters first layer --- deep_maxent_irl.py | 10 +++++----- deep_maxent_irl_gridworld.py | 10 ++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 70426c2..c4a263a 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -89,10 +89,10 @@ def _build_network(self, name, conv): if conv: input_s = tf.placeholder(tf.float32, [None, self.width, self.height, 1]) with tf.variable_scope(name): - #conv1 = tf_utils.conv2d(input_s, 64, (1, 1), 1) - #conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1) - #conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1) - reward = tf_utils.conv2d(input_s, 1, (1, 1), 1) + conv1 = tf_utils.conv2d(input_s, 64, (3, 3), 1) + conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1) + conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1) + reward = tf_utils.conv2d(conv3, 1, (1, 1), 1) theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) return input_s, tf.squeeze(tf.reshape(reward, (-1, self.n_input))), theta else: @@ -495,7 +495,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 114cbc0..fe05ef1 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -101,12 +101,14 @@ def main(): # use identity matrix as feature #feat_map = np.eye(N_STATES) - feat_map = np.zeros(N_STATES).reshape((H, W)) + # feat_map = np.zeros(N_STATES).reshape((H, W)) + feat_map = np.random.rand(N_STATES).reshape((H, W)) #feat_map = np.arange(N_STATES).reshape((H, W)) if ARGS.conv: - feat_map[H-1, W-1] = -5 - feat_map[0, W-1] = -5 - feat_map[H-1, 0] = -5 + #feat_map[H-1, W-1] = -5 + #feat_map[0, W-1] = -5 + #feat_map[H-1, 0] = -5 + pass else: feat_map = feat_map.reshape(N_STATES) #feat_map = rmap_gt
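
Reference sketch (not part of the patch series): the commits above switch the deterministic gridworld to an N_STATES x N_ACTIONS matrix of successor-state indices and then index rewards and values by P_a[s, a] instead of summing over transition probabilities. The following minimal, self-contained NumPy sketch shows that value-iteration step and the np.add.at-based state-visitation-frequency propagation on a hypothetical 3-state chain; all variable names and numbers here are illustrative only.

    import numpy as np

    # toy deterministic chain: action 0 stays put, action 1 moves right (last state absorbing)
    P_a = np.array([[0, 1],
                    [1, 2],
                    [2, 2]])          # P_a[s, a] = successor state index
    rewards = np.array([0.0, 0.0, 1.0])
    gamma, eps = 0.9, 1e-6
    n_states, n_actions = P_a.shape

    # value iteration with the deterministic model: Q[s, a] = r(s') + gamma * V(s'), s' = P_a[s, a]
    values = np.zeros(n_states)
    while True:
        new_values = (rewards[P_a] + gamma * values[P_a]).max(axis=1)   # fancy indexing yields an (S, A) array
        delta = np.abs(new_values - values).max()
        values = new_values
        if delta < eps:
            break
    policy = (rewards[P_a] + gamma * values[P_a]).argmax(axis=1)

    # state-visitation frequencies for the deterministic policy over a horizon T;
    # np.add.at accumulates correctly even when several states share the same successor
    T = 5
    mu = np.zeros((n_states, T))
    mu[:, 0] = [1.0, 0.0, 0.0]        # start-state distribution
    succ = P_a[np.arange(n_states), policy]
    for t in range(T - 1):
        np.add.at(mu[:, t + 1], succ, mu[:, t])
    svf = mu.sum(axis=1)
    print(values, policy, svf)

This roughly mirrors what value_iteration and compute_state_visition_freq do for the ACT_RAND == 0 case, leaving out the thread-pool chunking and the TensorFlow port.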