Feature/deterministic model dynamics #6

Merged · 14 commits · Dec 4, 2017
deep_maxent_irl.py: 247 changes (169 additions, 78 deletions)
@@ -16,7 +16,7 @@
class DeepIRLFC:


def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic_env=False, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
if len(n_input) > 1:
self.height, self.width = n_input
self.n_input = self.height * self.width
@@ -26,6 +26,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi
self.n_h1 = n_h1
self.n_h2 = n_h2
self.name = name
self.deterministic_env = deterministic_env
self.deterministic = deterministic
self.sparse = sparse
self.conv = conv
@@ -35,16 +36,23 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi
self.sess = tf.Session(config=config)
self.input_s, self.reward, self.theta = self._build_network(self.name, conv)

if self.deterministic_env:
p_a_shape = (self.n_input, n_actions)
p_a_dtype = tf.int32
else:
p_a_shape = (self.n_input, n_actions, self.n_input)
p_a_dtype = tf.float32

# value iteration
if sparse:
self.P_a = tf.sparse_placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input))
self.P_a = tf.sparse_placeholder(p_a_dtype, shape=p_a_shape)
self.reduce_max_sparse = tf.sparse_reduce_max_sparse
self.reduce_sum_sparse = tf.sparse_reduce_sum_sparse
self.reduce_max = tf.sparse_reduce_max
self.reduce_sum = tf.sparse_reduce_sum
self.sparse_transpose = tf.sparse_transpose
else:
self.P_a = tf.placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input))
self.P_a = tf.placeholder(p_a_dtype, shape=p_a_shape)
self.reduce_max = tf.reduce_max
self.reduce_max_sparse = tf.reduce_max
self.reduce_sum = tf.reduce_sum
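For reference, the new `deterministic_env` flag swaps the `(n_states, n_actions, n_states)` probability tensor for an integer next-state table of shape `(n_states, n_actions)`. A minimal NumPy sketch of the two representations (toy 3-state MDP, names are illustrative):

```python
import numpy as np

n_states, n_actions = 3, 2

# Deterministic dynamics: integer table of next states, shape (n_states, n_actions).
P_det = np.array([[1, 2],
                  [2, 0],
                  [0, 1]], dtype=np.int32)   # P_det[s, a] = s'

# Equivalent stochastic form: one-hot probabilities, shape (n_states, n_actions, n_states).
P_full = np.zeros((n_states, n_actions, n_states), dtype=np.float32)
P_full[np.arange(n_states)[:, None], np.arange(n_actions)[None, :], P_det] = 1.0

assert np.array_equal(P_full.argmax(axis=2), P_det)
```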
@@ -81,10 +89,10 @@ def _build_network(self, name, conv):
if conv:
input_s = tf.placeholder(tf.float32, [None, self.width, self.height, 1])
with tf.variable_scope(name):
#conv1 = tf_utils.conv2d(input_s, 64, (1, 1), 1)
#conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1)
#conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1)
reward = tf_utils.conv2d(input_s, 1, (1, 1), 1)
conv1 = tf_utils.conv2d(input_s, 64, (3, 3), 1)
conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1)
conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1)
reward = tf_utils.conv2d(conv3, 1, (1, 1), 1)
theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
return input_s, tf.squeeze(tf.reshape(reward, (-1, self.n_input))), theta
else:
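As a rough sketch of what the enabled convolutional branch computes: a small conv stack that ends in a 1x1 convolution, producing one reward value per grid cell, which is then flattened to a per-state reward vector. The stand-in below uses `tf.layers.conv2d` with an assumed ELU activation and 'same' padding, since the exact behavior of `tf_utils.conv2d` is not shown in this diff:

```python
import tensorflow as tf  # TF 1.x

def conv_reward_head(input_s):
    # input_s: (batch, height, width, 1) state feature grid (hypothetical stand-in)
    h = tf.layers.conv2d(input_s, filters=64, kernel_size=(3, 3), padding='same', activation=tf.nn.elu)
    h = tf.layers.conv2d(h, filters=32, kernel_size=(1, 1), padding='same', activation=tf.nn.elu)
    h = tf.layers.conv2d(h, filters=32, kernel_size=(1, 1), padding='same', activation=tf.nn.elu)
    reward = tf.layers.conv2d(h, filters=1, kernel_size=(1, 1), padding='same')  # one reward per cell
    return tf.reshape(reward, (-1,))  # flatten to one reward per state (batch size 1 assumed)
```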
@@ -102,13 +110,18 @@ def _vi(self, rewards):

rewards_expanded = rewards #tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input])

def body(i, c, t):
old_values = t.read(i)
def vi_step(values):
if self.deterministic_env:
new_value = tf.gather(rewards_expanded, self.P_a) + self.gamma * tf.gather(values, self.P_a)
else:
new_value = self.reduce_sum_sparse(self.P_a * (rewards_expanded + self.gamma * values), axis=2)

expected_value = rewards_expanded + self.gamma * old_values
#expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
return new_value

new_values = self.reduce_max(self.reduce_sum_sparse(self.P_a * expected_value, axis=2), axis=1)
def body(i, c, t):
old_values = t.read(i)
new_values = vi_step(old_values)
new_values = self.reduce_max(new_values, axis=1)
t = t.write(i + 1, new_values)

c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
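For clarity, `vi_step` is the usual Bellman backup with the expectation over next states replaced by an index lookup when the environment is deterministic. A NumPy sketch of the same computation (illustrative names):

```python
import numpy as np

def vi_step(values, rewards, P_a, gamma):
    # One Bellman backup; returns Q with shape (n_states, n_actions).
    if P_a.ndim == 2:
        # deterministic env: P_a[s, a] is the next-state index
        return rewards[P_a] + gamma * values[P_a]
    # stochastic env: P_a[s, a, s'] is a transition probability
    return (P_a * (rewards + gamma * values)).sum(axis=2)

# one sweep of value iteration, mirroring the while_loop body: V <- max_a Q(s, a)
# values = vi_step(values, rewards, P_a, gamma).max(axis=1)
```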
@@ -125,42 +138,60 @@ def condition(i, c, t):
i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False,
name='VI_loop')
values = values.read(i)

expected_value = rewards_expanded + self.gamma * values
#expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
new_values = vi_step(values)

if self.deterministic:
policy = tf.argmax(self.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
policy = tf.argmax(new_values, axis=1)
else:
policy = self.reduce_sum(self.P_a * expected_value, axis=2)
policy = tf.nn.softmax(policy)
policy = tf.nn.softmax(new_values)

return values, policy

def _svf(self, policy):
if self.deterministic:
r = tf.range(self.n_input, dtype=tf.int64)
expanded = tf.expand_dims(policy, 1)
tiled = tf.tile(expanded, [1, self.n_input])

grid = tf.meshgrid(r, r)
indices = tf.stack([grid[1], grid[0], tiled], axis=2)

P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices)
P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0))
if not self.deterministic_env:
if self.deterministic:
r = tf.range(self.n_input, dtype=tf.int64)
expanded = tf.expand_dims(policy, 1)
tiled = tf.tile(expanded, [1, self.n_input])

grid = tf.meshgrid(r, r)
indices = tf.stack([grid[1], grid[0], tiled], axis=2)

P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices)
P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0))
else:
P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2)
else:
P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2)
if self.deterministic:
r = tf.range(self.n_input, dtype=tf.int64)
indices = tf.stack([r, policy], axis=1)

P_a_cur_policy = tf.gather_nd(self.P_a, indices)
else:
P_a_cur_policy = self.P_a

mu = list()
mu.append(self.mu)
with tf.variable_scope('svf'):
if self.deterministic:
for t in range(self.T - 1):
cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1)
if self.deterministic_env:
# TODO using a variable here seems a little hacky
# https://github.com/tensorflow/tensorflow/issues/2358
cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False)
cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,)))
cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, mu[t])
else:
cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1)
mu.append(cur_mu)
else:
for t in range(self.T - 1):
cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2),
if self.deterministic_env:
cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False)
cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,)))
cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, tf.expand_dims(mu[t], axis=1) * policy)
else:
cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2),
[1, tf.shape(policy)[1],
self.n_input]) * P_a_cur_policy, axis=1),
axis=0)
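The reworked `_svf` propagates visitation mass along deterministic transitions with a scatter-add instead of a dense reduction. A NumPy sketch of a single propagation step for the `deterministic_env` branch (illustrative names; `tf.scatter_add` plays the role of `np.add.at` in the graph):

```python
import numpy as np

def svf_step(mu_t, P_det, policy, deterministic_policy):
    # mu_t: (n_states,) visitation mass at time t; P_det[s, a] = next state.
    n_states, n_actions = P_det.shape
    mu_next = np.zeros(n_states)
    if deterministic_policy:
        next_states = P_det[np.arange(n_states), policy]      # (n_states,)
        np.add.at(mu_next, next_states, mu_t)                  # duplicates accumulate
    else:
        np.add.at(mu_next, P_det, mu_t[:, None] * policy)      # weight mass by action probs
    return mu_next
```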
@@ -192,6 +223,10 @@ def get_rewards(self, states):
return rewards

def get_policy(self, states, P_a, gamma, epsilon=0.01):
if self.conv:
states = np.expand_dims(np.expand_dims(states, axis=0), axis=-1)
else:
states = np.expand_dims(states, axis=0)
return self.sess.run([self.reward, self.values, self.policy],
feed_dict={self.input_s: states, self.P_a: P_a, self.gamma: gamma, self.epsilon: epsilon})

@@ -239,7 +274,10 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
p Nx1 vector - state visitation frequencies
"""
tt = time.time()
N_STATES, _, N_ACTIONS = np.shape(P_a)
if len(P_a.shape) == 3:
N_STATES, _, N_ACTIONS = np.shape(P_a)
else:
N_STATES, N_ACTIONS = np.shape(P_a)

T = len(trajs[0])
# mu[s, t] is the prob of visiting state s at time t
@@ -253,18 +291,38 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
if chunk_size == 0:
chunk_size = N_STATES

if deterministic:
P_az = P_a[np.arange(0, N_STATES), :, policy]
else:
P_a = P_a.transpose(0, 2, 1)

def step(t, start, end):
if len(P_a.shape) == 3:
if deterministic:
mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
P_az = P_a[np.arange(0, N_STATES), :, policy]
else:
mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
P_a = P_a.transpose(0, 2, 1)
else:
if deterministic:
P_az = P_a[np.arange(N_STATES), policy]

if len(P_a.shape) == 3:
def step(t, start, end):
if deterministic:
mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
else:
mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
else:
def step(t, start, end):
if deterministic:
# The following needs to be done with an unbuffered ufunc (np.add.at):
# https://stackoverflow.com/questions/41990028/add-multiple-values-to-one-numpy-array-index
# P_az[start:end] can map several source states to the same target state; with ordinary fancy
# indexing only the last addition would take effect!
# https://stackoverflow.com/questions/15973827/handling-of-duplicate-indices-in-numpy-assignments
# mu[P_az[start:end], t + 1] += mu[start:end, t]
np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t])
else:
# mu[P_a[start:end, :], t + 1] += mu[start:end, t, np.newaxis] * policy[start:end, :]
np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :])

with ThreadPoolExecutor(max_workers=num_cpus) as e:

with ThreadPoolExecutor(max_workers=1) as e:
for t in range(T - 1):
futures = list()
for i in range(0, N_STATES, chunk_size):
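The comment above is the key subtlety: plain fancy-index assignment is buffered, so when several source states map to the same target only the last contribution survives, whereas `np.add.at` applies the addition unbuffered and accumulates them all. A small demonstration:

```python
import numpy as np

mu_next = np.zeros(3)
next_states = np.array([2, 2, 0])        # two source states land in state 2
mass = np.array([0.4, 0.5, 0.1])

mu_next[next_states] += mass             # buffered: state 2 keeps only the last write
print(mu_next)                           # [0.1 0.  0.5]

mu_next[:] = 0
np.add.at(mu_next, next_states, mass)    # unbuffered: both contributions accumulate
print(mu_next)                           # [0.1 0.  0.9]
```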
@@ -324,7 +382,10 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
returns:
p Nx1 vector - state visitation frequencies
"""
N_STATES, _, N_ACTIONS = np.shape(P_a)
if len(P_a.shape) == 3:
N_STATES, _, N_ACTIONS = np.shape(P_a)
else:
N_STATES, N_ACTIONS = np.shape(P_a)

T = len(trajs[0])
# mu[s, t] is the prob of visiting state s at time t
@@ -336,12 +397,21 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
for t in range(T - 1):
for s in range(N_STATES):
if deterministic:
mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)])
if len(P_a.shape) == 3:
mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)])
else:
mu[P_a[s, int(policy[s])], t + 1] += mu[s, t]
else:
mu[s, t + 1] = sum(
[sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in
range(N_STATES)])
if len(P_a.shape) == 3:
mu[s, t + 1] = sum(
[sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in
range(N_STATES)])
else:
for a1 in range(N_ACTIONS):
mu[P_a[s, a1], t + 1] += mu[s, t] * policy[s, a1]


print(mu)
p = np.sum(mu, 1)
print('SUM SVF', p.sum())

@@ -377,21 +447,29 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
"""

# tf.set_random_seed(1)

N_STATES, _, N_ACTIONS = np.shape(P_a)

if len(P_a.shape) == 3:
N_STATES, _, N_ACTIONS = np.shape(P_a)
else:
N_STATES, N_ACTIONS = np.shape(P_a)

deterministic = False

# init nn model
nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, conv=conv, sparse=sparse)
nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse)

# find state visitation frequencies using demonstrations
mu_D = demo_svf(trajs, N_STATES)
p_start_state = start_state_probs(trajs, N_STATES)

P_a_t = P_a.transpose(0, 2, 1)
if sparse:
mask = P_a_t > 0
indices = np.argwhere(mask)
P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
if len(P_a.shape) == 3:
P_a_t = P_a.transpose(0, 2, 1)
if sparse:
mask = P_a_t > 0
indices = np.argwhere(mask)
P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
else:
P_a_t = P_a

grads = list()
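For context on what a 2-D `P_a` looks like in practice, here is a hypothetical helper that builds the `(n_states, n_actions)` next-state table for a deterministic grid environment (the action ordering and stay-put behavior at walls are assumptions, not taken from this repo):

```python
import numpy as np

def build_deterministic_P(height, width):
    # Next-state table for a grid; actions: 0=up, 1=down, 2=left, 3=right.
    n_states, n_actions = height * width, 4
    P = np.zeros((n_states, n_actions), dtype=np.int32)
    for s in range(n_states):
        r, c = divmod(s, width)
        moves = [(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)]
        for a, (nr, nc) in enumerate(moves):
            # stay in place when the move would leave the grid
            P[s, a] = nr * width + nc if 0 <= nr < height and 0 <= nc < width else s
    return P
```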

@@ -404,20 +482,20 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
# rewards = nn_r.get_rewards(feat_map)

# compute policy
#_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)
#_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=deterministic)

# compute rewards and policy at the same time
#t = time.time()
#rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01)
rewards, values, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.000001)
#print('tensorflow VI', time.time() - t)

# compute expected svf
#mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)
#mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)

rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001)
#print(rewards)

#assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False)
#assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic)

# compute gradients on rewards:
grad_r = mu_D - mu_exp
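For reference, `grad_r` is the standard MaxEnt IRL gradient of the demonstration log-likelihood with respect to the per-state reward, and the network weights are then updated through the chain rule (up to the optimizer's sign convention):

$$\frac{\partial L}{\partial r(s)} = \mu_D(s) - \mu_E(s), \qquad \frac{\partial L}{\partial \theta} = \sum_s \big(\mu_D(s) - \mu_E(s)\big)\,\frac{\partial r(s)}{\partial \theta}$$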
Expand All @@ -434,27 +512,40 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
# return sigmoid(normalize(rewards))
return normalize(rewards)

def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic):
assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001)

assert (np.abs(assert_values - assert_values2) < 0.0001).all()
assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
assert (np.abs(values - assert_values) < 0.0001).all()
assert (np.abs(values - assert_values_old) < 0.0001).all()

print(assert_policy)
print(assert_policy_old)
print(policy)
print(values)
print(assert_values)
print(rewards)
assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
assert (np.abs(policy - assert_policy) < 0.0001).all()
assert (np.abs(policy - assert_policy_old) < 0.0001).all()
def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic):

def assert_vi(P_a):
assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)

if len(P_a.shape) == 3:
assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a.transpose(0, 2, 1), rewards, gamma, threshold=0.000001)

assert (np.abs(assert_values - assert_values2) < 0.0001).all()

assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
assert (np.abs(values - assert_values) < 0.0001).all()
assert (np.abs(values - assert_values_old) < 0.0001).all()

# print(assert_policy)
# print(assert_policy_old)
# print(policy)
# print(values)
# print(assert_values)
# print(rewards)
assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
assert (np.abs(policy - assert_policy) < 0.0001).all()
assert (np.abs(policy - assert_policy_old) < 0.0001).all()

assert_vi(P_a)
if len(P_a.shape) == 2:
print('creating full transition matrix')
# construct the full transition matrix and make sure the values are the same
P_a_t = np.zeros((N_STATES, N_ACTIONS, N_STATES))
P_a_t[np.arange(N_STATES)[:, None], np.arange(N_ACTIONS)[None, :], P_a] = 1
assert_vi(P_a_t.transpose(0, 2, 1))

assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
assert (