From 861e83e0952efdc16a355a989915a08c8e00e182 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Tue, 28 Nov 2017 20:45:36 -0800 Subject: [PATCH 01/10] Debug to fix tensorflow VI --- deep_maxent_irl.py | 24 ++++++++++--- deep_maxent_irl_gridworld.py | 2 +- maxent_irl.py | 36 ------------------- mdp/value_iteration.py | 70 ++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 42 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 5f23563..95e126d 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -68,19 +68,31 @@ def _build_network(self, name): initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) reward = tf_utils.fc(fc2, 1, scope="reward") theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) - return input_s, reward, theta + return input_s, tf.squeeze(reward), theta def _vi(self, rewards): - rewards = tf.squeeze(rewards) - + rewards = tf.Print(rewards, [rewards], 'rewards: ', summarize=500) + P_a = tf.Print(self.P_a, [self.P_a], 'P_a: ', summarize=500) def body(i, c, t): old_values = t.read(i) if self.sparse: new_values = tf.sparse_reduce_max( tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) else: - new_values = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) + old_values = tf.Print(old_values, [old_values], 'old_values', summarize=500) + rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) + tmp = (rewards_expanded + self.gamma * old_values) + tmp = tf.Print(tmp, [tmp], 'tmp: ', summarize=500) + tmp = tf.tile(tf.expand_dims(tmp, 1), [1, tf.shape(P_a)[1], 1]) + mm = P_a * tmp + mm = tf.Print(mm, [mm], 'mm', summarize=500) + + ss = tf.reduce_sum(mm, axis=2) + ss = tf.Print(ss, [ss], 'ss', summarize=500) + new_values = tf.reduce_max(ss, axis=1) + + new_values = tf.Print(new_values, [new_values], 'new_values', summarize=500) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon c.set_shape(()) @@ -95,7 +107,7 @@ def condition(i, c, t): i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop') - values = values.read(i) + values = values.read(tf.Print(i, [i], 'i: ')) if self.deterministic: if self.sparse: @@ -346,7 +358,9 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=True) assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=True) + assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) + assert (np.abs(assert_values - assert_values2) < 0.0001).all() assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) < 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index e1c62e6..44108e8 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -112,7 +112,7 @@ def main(): values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True) - print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy)) + #print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy)) # plots plt.figure(figsize=(20,4)) diff --git 
a/maxent_irl.py b/maxent_irl.py index e528006..20559e5 100644 --- a/maxent_irl.py +++ b/maxent_irl.py @@ -104,39 +104,3 @@ def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters): rewards = np.dot(feat_map, theta) # return sigmoid(normalize(rewards)) return normalize(rewards) - -def value(policy, n_states, transition_probabilities, reward, discount, - threshold=1e-2): - """ - FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10 - - Find the value function associated with a policy. - - policy: List of action ints for each state. - n_states: Number of states. int. - transition_probabilities: Function taking (state, action, state) to - transition probabilities. - reward: Vector of rewards for each state. - discount: MDP discount factor. float. - threshold: Convergence threshold, default 1e-2. float. - -> Array of values for each state - """ - v = np.zeros(n_states) - - diff = float("inf") - while diff > threshold: - diff = 0 - for s in range(n_states): - vs = v[s] - a = policy[s] - v[s] = sum(transition_probabilities[s, a, k] * - (reward[k] + discount * v[k]) - for k in range(n_states)) - diff = max(diff, abs(vs - v[s])) - - return v - -def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True): - v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma) - - return optimal_value.dot(p_start) - v.dot(p_start) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 5a42296..c81e12b 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -50,6 +50,9 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): v_s = [] values[s] = max([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + print([P_a[s, s1, :] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) + print([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in + range(N_ACTIONS)]) if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: break @@ -347,8 +350,75 @@ def get_action(self, state): return actions[a_id] +def optimal_value(n_states, n_actions, transition_probabilities, reward, + discount, threshold=1e-2): + """ + FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py + Find the optimal value function. + n_states: Number of states. int. + n_actions: Number of actions. int. + transition_probabilities: Function taking (state, action, state) to + transition probabilities. + reward: Vector of rewards for each state. + discount: MDP discount factor. float. + threshold: Convergence threshold, default 1e-2. float. 
+ -> Array of values for each state + """ + + v = np.zeros(n_states) + new_v = np.zeros(n_states) + + diff = float("inf") + while diff > threshold: + diff = 0 + v = new_v.copy() + for s in range(n_states): + max_v = float("-inf") + for a in range(n_actions): + tp = transition_probabilities[s, a, :] + max_v = max(max_v, np.dot(tp, reward[s] + discount * v)) + max_v = max(max_v, np.sum(tp * (reward[s] + discount*v))) + new_diff = abs(v[s] - max_v) + if new_diff > diff: + diff = new_diff + new_v[s] = max_v + + return v + + +def value(policy, n_states, transition_probabilities, reward, discount, + threshold=1e-2): + """ + FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10 + + Find the value function associated with a policy. + + policy: List of action ints for each state. + n_states: Number of states. int. + transition_probabilities: Function taking (state, action, state) to + transition probabilities. + reward: Vector of rewards for each state. + discount: MDP discount factor. float. + threshold: Convergence threshold, default 1e-2. float. + -> Array of values for each state + """ + v = np.zeros(n_states) + diff = float("inf") + while diff > threshold: + diff = 0 + for s in range(n_states): + vs = v[s] + a = policy[s] + v[s] = sum(transition_probabilities[s, a, k] * + (reward[k] + discount * v[k]) + for k in range(n_states)) + diff = max(diff, abs(vs - v[s])) + return v +def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True): + v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma) + return optimal_value.dot(p_start) - v.dot(p_start) \ No newline at end of file From 2fa875eaeccbf827368309fde86449afa57d3f37 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Tue, 28 Nov 2017 21:43:04 -0800 Subject: [PATCH 02/10] Fix tf policy --- deep_maxent_irl.py | 34 ++++++++++++++++++++-------------- mdp/value_iteration.py | 28 ++++++++++++++++++---------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 95e126d..c26f789 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -73,6 +73,8 @@ def _build_network(self, name): def _vi(self, rewards): rewards = tf.Print(rewards, [rewards], 'rewards: ', summarize=500) + rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) + P_a = tf.Print(self.P_a, [self.P_a], 'P_a: ', summarize=500) def body(i, c, t): old_values = t.read(i) @@ -81,11 +83,10 @@ def body(i, c, t): tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) else: old_values = tf.Print(old_values, [old_values], 'old_values', summarize=500) - rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) - tmp = (rewards_expanded + self.gamma * old_values) - tmp = tf.Print(tmp, [tmp], 'tmp: ', summarize=500) - tmp = tf.tile(tf.expand_dims(tmp, 1), [1, tf.shape(P_a)[1], 1]) - mm = P_a * tmp + expected_value = rewards_expanded + self.gamma * old_values + expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) + mm = P_a * expected_value mm = tf.Print(mm, [mm], 'mm', summarize=500) ss = tf.reduce_sum(mm, axis=2) @@ -109,15 +110,20 @@ def condition(i, c, t): name='VI_loop') values = values.read(tf.Print(i, [i], 'i: ')) + expected_value = (rewards_expanded + self.gamma * values) + expected_value = 
tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) + if self.deterministic: if self.sparse: policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1) else: - policy = tf.argmax(tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2), axis=1) + + policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) else: if self.sparse: policy = tf.sparse_tensor_to_dense( - tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)) + tf.sparse_reduce_sum_sparse(self.P_a * expected_value, axis=2)) else: policy = tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2) @@ -321,7 +327,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) @@ -342,7 +348,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) # compute rewards and policy at the same time #t = time.time() @@ -350,14 +356,14 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) # compute gradients on rewards: grad_r = mu_D - mu_exp - assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=True) - assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=True) + assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=False) + assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=False) assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) assert (np.abs(assert_values - assert_values2) < 0.0001).all() @@ -369,8 +375,8 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert (np.abs(policy - assert_policy) < 0.001).all() assert (np.abs(policy - assert_policy_old) < 0.001).all() - assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all() - assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all() + assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() + assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() # apply 
gradients to the neural network grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index c81e12b..27382b6 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -17,8 +17,12 @@ def softmax(x): """Compute softmax values for each sets of scores in x.""" - e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis]) - return e_x / e_x.sum(axis=-1)[:, np.newaxis] + if len(x.shape) > 1: + e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis]) + return e_x / e_x.sum(axis=-1)[:, np.newaxis] + else: + e_x = np.exp(x - np.max(x)) + return e_x / e_x.sum() def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): """ @@ -109,6 +113,7 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): if chunk_size == 0: chunk_size = N_STATES + rewards_expanded = rewards[:, np.newaxis].repeat(N_STATES, axis=1) count = 0 # estimate values while True: @@ -116,10 +121,10 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): values_tmp = values.copy() def step(start, end): - tmp = rewards[start:end, np.newaxis].repeat(N_STATES, axis=1) + gamma * values_tmp - tmp = tmp[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - tmp = np.transpose(tmp, (0, 2, 1)) - values[start:end] = (P[start:end, :, :] * tmp).sum(axis=2).max(axis=1) + expected_value = rewards_expanded[start:end, :] + gamma * values_tmp + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + expected_value = np.transpose(expected_value, (0, 2, 1)) + values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1) with ThreadPoolExecutor(max_workers=num_cpus) as e: futures = list() @@ -137,17 +142,20 @@ def step(start, end): print('VI', count) break + expected_value = rewards_expanded + gamma * values_tmp + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + expected_value = np.transpose(expected_value, (0, 2, 1)) + + if deterministic: # generate deterministic policy - policy = np.argmax((P * (rewards + gamma * values_tmp)).sum(axis=2), axis=1) - + policy = np.argmax((P * expected_value).sum(axis=2), axis=1) print(time.time() - t) return values, policy else: - # generate stochastic policy - policy = (P * (rewards + gamma * values_tmp)).sum(axis=2) + policy = (P * (rewards + expected_value)).sum(axis=2) policy = softmax(policy) print(time.time() - t) From 202d39417d46f92d16f7d27d41987a9680557905 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Tue, 28 Nov 2017 21:50:05 -0800 Subject: [PATCH 03/10] Fix stochastic policy (tf and numpy) --- deep_maxent_irl.py | 2 +- mdp/value_iteration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index c26f789..8db5a8e 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -125,7 +125,7 @@ def condition(i, c, t): policy = tf.sparse_tensor_to_dense( tf.sparse_reduce_sum_sparse(self.P_a * expected_value, axis=2)) else: - policy = tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2) + policy = tf.reduce_sum(self.P_a * expected_value, axis=2) policy = tf.nn.softmax(policy) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 27382b6..a631bfc 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -155,7 +155,7 @@ def step(start, end): return values, policy else: # generate stochastic policy - policy = (P * (rewards + expected_value)).sum(axis=2) + policy = (P * expected_value).sum(axis=2) policy = softmax(policy) 
print(time.time() - t) From 76291c4e01657490c0268140a252c3ab642f3c3a Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 10:36:17 -0800 Subject: [PATCH 04/10] minor things --- deep_maxent_irl.py | 22 +++++++++++++++------- mdp/value_iteration.py | 3 --- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 8db5a8e..95f533e 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -110,7 +110,7 @@ def condition(i, c, t): name='VI_loop') values = values.read(tf.Print(i, [i], 'i: ')) - expected_value = (rewards_expanded + self.gamma * values) + expected_value = rewards_expanded + self.gamma * values expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) @@ -345,7 +345,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): print 'iteration: {}'.format(iteration) # compute the reward matrix - rewards = nn_r.get_rewards(feat_map) + # rewards = nn_r.get_rewards(feat_map) # compute policy #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) @@ -359,8 +359,6 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - # compute gradients on rewards: - grad_r = mu_D - mu_exp assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=False) assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=False) @@ -370,14 +368,24 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) < 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() - + print 'iteration: {}'.format(iteration) + + print(assert_policy) + print(assert_policy_old) + print(policy) + print(values) + print(assert_values) + print(rewards) assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all() - assert (np.abs(policy - assert_policy) < 0.001).all() - assert (np.abs(policy - assert_policy_old) < 0.001).all() + assert (np.abs(policy - assert_policy) < 0.0001).all() + assert (np.abs(policy - assert_policy_old) < 0.0001).all() assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() + # compute gradients on rewards: + grad_r = mu_D - mu_exp + # apply gradients to the neural network grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index a631bfc..d6c2298 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -54,9 +54,6 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): v_s = [] values[s] = max([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) - print([P_a[s, s1, :] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) - print([sum([P_a[s, s1, a] * (rewards[s] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in - 
range(N_ACTIONS)]) if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: break From 328f2d7b4bf277f187ce5f46ad9645625bacbf8b Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 13:56:18 -0800 Subject: [PATCH 05/10] Debug gradients --- deep_maxent_irl.py | 91 ++++++++++++++++++++++++++++++++---------- mdp/value_iteration.py | 4 +- 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 95f533e..4df3960 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -158,7 +158,20 @@ def _svf(self, policy): mu.append(cur_mu) mu = tf.stack(mu) - return tf.reduce_sum(mu, axis=0) + mu = tf.reduce_sum(mu, axis=0) + if self.deterministic: + # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly + # I noticed that if it is not scaled by T then the recovered reward and the resulting value function + # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a + # difference in the value of states (i.e. if only the last few digits after the comma differ). + # One intuition why scaling by T is useful is to stabilize the gradients and avoid that the gradients + # are getting too high + # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well + # As a side note: This is not mentioned somewhere in the pulications, but for me this countermeasure works + # pretty well (on the other hand using a deterministic policy is anyways never meantioned in one of the + # publications, they always describe optimizing with stochastic policies) + mu /= self.T + return mu def get_theta(self): @@ -248,6 +261,19 @@ def step(t, start, end): p = np.sum(mu, 1) + if deterministic: + # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly + # I noticed that if it is not scaled by T then the recovered reward and the resulting value function + # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a + # difference in the value of states (i.e. if only the last few digits after the comma differ). + # One intuition why scaling by T is useful is to stabilize the gradients and avoid that the gradients + # are getting too high + # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well + # As a side note: This is not mentioned somewhere in the pulications, but for me this countermeasure works + # pretty well (on the other hand using a deterministic policy is anyways never meantioned in one of the + # publications, they always describe optimizing with stochastic policies) + p /= T + print(time.time() - tt) return p @@ -301,6 +327,18 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru range(N_STATES)]) p = np.sum(mu, 1) + if deterministic: + # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly + # I noticed that if it is not scaled by T then the recovered reward and the resulting value function + # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a + # difference in the value of states (i.e. if only the last few digits after the comma differ). 
+ # One intuition why scaling by T is useful is to stabilize the gradients and avoid that the gradients + # are getting too high + # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well + # As a side note: This is not mentioned somewhere in the pulications, but for me this countermeasure works + # pretty well (on the other hand using a deterministic policy is anyways never meantioned in one of the + # publications, they always describe optimizing with stochastic policies) + p /= T return p @@ -327,7 +365,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) @@ -339,6 +377,8 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): indices = np.argwhere(mask) P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) + grads = list() + # training for iteration in range(n_iters): if iteration % (n_iters/10) == 0: @@ -348,7 +388,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) # compute rewards and policy at the same time #t = time.time() @@ -356,19 +396,40 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=False) - assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=False) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) + + + # compute gradients on rewards: + grad_r = mu_D - mu_exp + grads.append(grad_r) + + # apply gradients to the neural network + grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) + + + print(np.mean(grads, axis=0)) + print(np.std(grads, axis=0)) + + + rewards = nn_r.get_rewards(feat_map) + # return sigmoid(normalize(rewards)) + return normalize(rewards) + +def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic): + assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) + assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) assert (np.abs(assert_values - assert_values2) < 0.0001).all() assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) 
< 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() - print 'iteration: {}'.format(iteration) print(assert_policy) print(assert_policy_old) @@ -380,19 +441,9 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): assert (np.abs(policy - assert_policy) < 0.0001).all() assert (np.abs(policy - assert_policy_old) < 0.0001).all() - assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() - assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=False)) < 0.00001).all() - - # compute gradients on rewards: - grad_r = mu_D - mu_exp - - # apply gradients to the neural network - grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) - - - rewards = nn_r.get_rewards(feat_map) - # return sigmoid(normalize(rewards)) - return normalize(rewards) + assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() + assert ( + np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index d6c2298..e090b35 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -139,7 +139,9 @@ def step(start, end): print('VI', count) break - expected_value = rewards_expanded + gamma * values_tmp + expected_value = rewards_expanded + gamma * values + print(expected_value) + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) expected_value = np.transpose(expected_value, (0, 2, 1)) From 7f98f664424374cb86a7cdb1f3882fff4cccfae0 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 14:48:27 -0800 Subject: [PATCH 06/10] Debug gradients --- deep_maxent_irl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 4df3960..50c40e0 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -327,6 +327,7 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru range(N_STATES)]) p = np.sum(mu, 1) + print('SUM SVF', p.sum()) if deterministic: # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly # I noticed that if it is not scaled by T then the recovered reward and the resulting value function @@ -365,7 +366,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) @@ -388,7 +389,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) # compute rewards and policy at the same time #t = time.time() @@ -396,11 +397,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) + #mu_exp = 
compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) # compute gradients on rewards: @@ -411,9 +412,8 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r) - print(np.mean(grads, axis=0)) - print(np.std(grads, axis=0)) - + print('grad mean', np.mean(grads, axis=0)) + print('grad std', np.std(grads, axis=0)) rewards = nn_r.get_rewards(feat_map) # return sigmoid(normalize(rewards)) From 206375192113a8f711fc7c72be13162b548007e8 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 14:57:16 -0800 Subject: [PATCH 07/10] Remove tensorflow prints --- deep_maxent_irl.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 50c40e0..1a500c8 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -72,28 +72,21 @@ def _build_network(self, name): def _vi(self, rewards): - rewards = tf.Print(rewards, [rewards], 'rewards: ', summarize=500) rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) - P_a = tf.Print(self.P_a, [self.P_a], 'P_a: ', summarize=500) def body(i, c, t): old_values = t.read(i) if self.sparse: new_values = tf.sparse_reduce_max( tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) else: - old_values = tf.Print(old_values, [old_values], 'old_values', summarize=500) expected_value = rewards_expanded + self.gamma * old_values - expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) - expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) - mm = P_a * expected_value - mm = tf.Print(mm, [mm], 'mm', summarize=500) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + mm = self.P_a * expected_value ss = tf.reduce_sum(mm, axis=2) - ss = tf.Print(ss, [ss], 'ss', summarize=500) new_values = tf.reduce_max(ss, axis=1) - new_values = tf.Print(new_values, [new_values], 'new_values', summarize=500) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon c.set_shape(()) @@ -108,11 +101,10 @@ def condition(i, c, t): i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop') - values = values.read(tf.Print(i, [i], 'i: ')) + values = values.read(i) expected_value = rewards_expanded + self.gamma * values - expected_value = tf.Print(expected_value, [expected_value], 'expected_value: ', summarize=500) - expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(P_a)[1], 1]) + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) if self.deterministic: if self.sparse: @@ -366,7 +358,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = 
demo_svf(trajs, N_STATES) @@ -389,7 +381,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) # compute rewards and policy at the same time #t = time.time() @@ -397,11 +389,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) # compute gradients on rewards: @@ -444,6 +436,8 @@ def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() assert ( np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() + + print('tf sum SVF', mu_exp.sum()) From 205cf45411166833008cda3510d91e5832a46c4c Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 14:58:58 -0800 Subject: [PATCH 08/10] Remove some python prints --- deep_maxent_irl.py | 3 +-- deep_maxent_irl_gridworld.py | 5 ----- mdp/value_iteration.py | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 1a500c8..e849b51 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -393,8 +393,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) - + # assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 44108e8..f867329 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -99,11 +99,6 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - mu = np.zeros([N_STATES]) - - for traj in trajs: - mu[traj[0].cur_state] += 1 - mu = mu / len(trajs) print 'Deep Max Ent IRL training ..' 
t = time.time() diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index e090b35..718957a 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -140,8 +140,6 @@ def step(start, end): break expected_value = rewards_expanded + gamma * values - print(expected_value) - expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) expected_value = np.transpose(expected_value, (0, 2, 1)) From 521d15476e82e301eedf655c92b02350c32509a0 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 15:16:24 -0800 Subject: [PATCH 09/10] Remove sparse tensor support --- deep_maxent_irl.py | 50 ++++++++++-------------------------- deep_maxent_irl_gridworld.py | 3 +-- 2 files changed, 15 insertions(+), 38 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index e849b51..8d369d4 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -16,23 +16,19 @@ class DeepIRLFC: - def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, name='deep_irl_fc'): + def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, name='deep_irl_fc'): self.n_input = n_input self.lr = lr self.n_h1 = n_h1 self.n_h2 = n_h2 self.name = name - self.sparse = sparse self.deterministic = deterministic self.sess = tf.Session() self.input_s, self.reward, self.theta = self._build_network(self.name) # value iteration - if sparse: - self.P_a = tf.sparse_placeholder(tf.float32, shape=(n_input, n_actions, n_input)) - else: - self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input)) + self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input)) self.gamma = tf.placeholder(tf.float32) self.epsilon = tf.placeholder(tf.float32) self.values, self.policy = self._vi(self.reward) @@ -76,21 +72,16 @@ def _vi(self, rewards): def body(i, c, t): old_values = t.read(i) - if self.sparse: - new_values = tf.sparse_reduce_max( - tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1) - else: - expected_value = rewards_expanded + self.gamma * old_values - expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) - mm = self.P_a * expected_value - ss = tf.reduce_sum(mm, axis=2) - new_values = tf.reduce_max(ss, axis=1) + expected_value = rewards_expanded + self.gamma * old_values + expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + new_values = tf.reduce_max(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) + t = t.write(i + 1, new_values) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon c.set_shape(()) - t = t.write(i + 1, new_values) + return i + 1, c, t def condition(i, c, t): @@ -107,18 +98,9 @@ def condition(i, c, t): expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) if self.deterministic: - if self.sparse: - policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1) - else: - - policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) + policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1) else: - if self.sparse: - policy = tf.sparse_tensor_to_dense( - tf.sparse_reduce_sum_sparse(self.P_a * expected_value, axis=2)) - else: - policy = tf.reduce_sum(self.P_a * expected_value, axis=2) - + policy = tf.reduce_sum(self.P_a * expected_value, axis=2) policy = tf.nn.softmax(policy) return values, 
policy @@ -335,7 +317,7 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru return p -def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): +def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters): """ Maximum Entropy Inverse Reinforcement Learning (Maxent IRL) @@ -358,17 +340,13 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): N_STATES, _, N_ACTIONS = np.shape(P_a) # init nn model - nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) p_start_state = start_state_probs(trajs, N_STATES) P_a_t = P_a.transpose(0, 2, 1) - if sparse: - mask = P_a_t > 0 - indices = np.argwhere(mask) - P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) grads = list() @@ -381,7 +359,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) # compute rewards and policy at the same time #t = time.time() @@ -389,11 +367,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - # assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, True) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index f867329..413ed3f 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -27,7 +27,6 @@ PARSER.set_defaults(rand_start=True) PARSER.add_argument('-lr', '--learning_rate', default=0.02, type=float, help='learning rate') PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations') -PARSER.add_argument('-s', '--sparse', default=False, action='store_true', help='flag to use sparse tensors in tf') ARGS = PARSER.parse_args() print ARGS @@ -102,7 +101,7 @@ def main(): print 'Deep Max Ent IRL training ..' 
t = time.time() - rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.sparse) + rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS) print('time for dirl', time.time() - t) values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True) From 3e77ea8ea4b0fd7a3fef4e066b2a968f864a9407 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Wed, 29 Nov 2017 15:20:34 -0800 Subject: [PATCH 10/10] no assert --- deep_maxent_irl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 8d369d4..9f0946a 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -371,7 +371,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters): rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) + #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) # compute gradients on rewards: grad_r = mu_D - mu_exp
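---

Note (not part of the patches above): the series repeatedly cross-checks the TensorFlow value-iteration graph against NumPy implementations before removing the debug prints and asserts. The sketch below is a minimal, illustrative NumPy reference for that check, assuming the (state, action, next_state) transition layout used via P_a_t = P_a.transpose(0, 2, 1) in the patches; the function names and the einsum-based propagation are illustrative, not the repository's exact code.

import numpy as np

def softmax(x, axis=-1):
    # row-wise softmax over actions, numerically stabilized
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def value_iteration(P, rewards, gamma, eps=1e-6):
    """P: (S, A, S) transition probabilities, rewards: (S,) reward of the current state."""
    n_states, n_actions, _ = P.shape
    values = np.zeros(n_states)
    while True:
        # Q[s, a] = sum_{s'} P[s, a, s'] * (r(s) + gamma * V(s'))
        q = (P * (rewards[:, None, None] + gamma * values[None, None, :])).sum(axis=2)
        new_values = q.max(axis=1)
        converged = np.abs(new_values - values).max() < eps
        values = new_values
        if converged:
            break
    q = (P * (rewards[:, None, None] + gamma * values[None, None, :])).sum(axis=2)
    return values, softmax(q)  # stochastic policy pi(a|s)

def expected_svf(P, policy, p_start, T):
    """Propagate the start-state distribution under the stochastic policy for T steps."""
    mu = np.zeros((T, P.shape[0]))
    mu[0] = p_start
    for t in range(T - 1):
        # mu[t+1][s'] = sum_{s, a} mu[t][s] * pi(a|s) * P[s, a, s']
        mu[t + 1] = np.einsum('s,sa,sak->k', mu[t], policy, P)
    # (the patches additionally divide by T in the deterministic-policy case
    # as a gradient-stabilization heuristic; omitted here)
    return mu.sum(axis=0)

def expected_value_diff(p_start, optimal_values, policy_values):
    # EVD = V*(start) - V^pi(start), averaged over the start-state distribution
    return optimal_values.dot(p_start) - policy_values.dot(p_start)

In the series, such NumPy paths serve only as assertions that the TensorFlow graph's values, policy, and expected SVF (mu_exp) agree within a small tolerance (assert_all_the_stuff in patch 05); once the graph is trusted, the asserts are commented out in patch 10.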