diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 5f23563..9f0946a 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -16,23 +16,19 @@ class DeepIRLFC:
-  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, name='deep_irl_fc'):
+  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, name='deep_irl_fc'):
     self.n_input = n_input
     self.lr = lr
     self.n_h1 = n_h1
     self.n_h2 = n_h2
     self.name = name
-    self.sparse = sparse
     self.deterministic = deterministic
 
     self.sess = tf.Session()
     self.input_s, self.reward, self.theta = self._build_network(self.name)
 
     # value iteration
-    if sparse:
-      self.P_a = tf.sparse_placeholder(tf.float32, shape=(n_input, n_actions, n_input))
-    else:
-      self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
+    self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
     self.gamma = tf.placeholder(tf.float32)
     self.epsilon = tf.placeholder(tf.float32)
     self.values, self.policy = self._vi(self.reward)
@@ -68,23 +64,24 @@ def _build_network(self, name):
                            initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
     reward = tf_utils.fc(fc2, 1, scope="reward")
     theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
-    return input_s, reward, theta
+    return input_s, tf.squeeze(reward), theta
 
   def _vi(self, rewards):
-    rewards = tf.squeeze(rewards)
+    rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input])
 
     def body(i, c, t):
       old_values = t.read(i)
-      if self.sparse:
-        new_values = tf.sparse_reduce_max(
-          tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
-      else:
-        new_values = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
+
+      expected_value = rewards_expanded + self.gamma * old_values
+      expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
+
+      new_values = tf.reduce_max(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
+
+      t = t.write(i + 1, new_values)
       c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
       c.set_shape(())
-      t = t.write(i + 1, new_values)
+
       return i + 1, c, t
 
     def condition(i, c, t):
@@ -97,18 +94,13 @@ def condition(i, c, t):
                                     name='VI_loop')
 
     values = values.read(i)
+    expected_value = rewards_expanded + self.gamma * values
+    expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
+
     if self.deterministic:
-      if self.sparse:
-        policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1)
-      else:
-        policy = tf.argmax(tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2), axis=1)
+      policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
     else:
-      if self.sparse:
-        policy = tf.sparse_tensor_to_dense(
-          tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2))
-      else:
-        policy = tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2)
-
+      policy = tf.reduce_sum(self.P_a * expected_value, axis=2)
       policy = tf.nn.softmax(policy)
 
     return values, policy
@@ -140,7 +132,20 @@ def _svf(self, policy):
       mu.append(cur_mu)
 
     mu = tf.stack(mu)
-    return tf.reduce_sum(mu, axis=0)
+    mu = tf.reduce_sum(mu, axis=0)
+    if self.deterministic:
+      # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly
+      # I noticed that if it is not scaled by T then the recovered reward and the resulting value function
+      # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a
+      # difference in the value of states (i.e. only the last few digits after the decimal point differ).
+      # One intuition for why scaling by T is useful is that it stabilizes the gradients and prevents them
+      # from getting too large.
+      # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well
+      # As a side note: this is not mentioned anywhere in the publications, but for me this countermeasure works
+      # pretty well (on the other hand, using a deterministic policy is never mentioned in any of the
+      # publications; they always describe optimizing with stochastic policies)
+      mu /= self.T
+    return mu
 
   def get_theta(self):
@@ -230,6 +235,19 @@ def step(t, start, end):
   p = np.sum(mu, 1)
+  if deterministic:
+    # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly
+    # I noticed that if it is not scaled by T then the recovered reward and the resulting value function
+    # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a
+    # difference in the value of states (i.e. only the last few digits after the decimal point differ).
+    # One intuition for why scaling by T is useful is that it stabilizes the gradients and prevents them
+    # from getting too large.
+    # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well
+    # As a side note: this is not mentioned anywhere in the publications, but for me this countermeasure works
+    # pretty well (on the other hand, using a deterministic policy is never mentioned in any of the
+    # publications; they always describe optimizing with stochastic policies)
+    p /= T
 
   print(time.time() - tt)
   return p
 
@@ -283,10 +301,23 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
                     range(N_STATES)])
   p = np.sum(mu, 1)
+  print('SUM SVF', p.sum())
+  if deterministic:
+    # NOTE: In the deterministic case it helps to scale the svf by T to recover the reward properly
+    # I noticed that if it is not scaled by T then the recovered reward and the resulting value function
+    # have extremely low values (usually < 0.01). With such low values it is hard to actually recover a
+    # difference in the value of states (i.e. only the last few digits after the decimal point differ).
+    # One intuition for why scaling by T is useful is that it stabilizes the gradients and prevents them
+    # from getting too large.
+    # TODO: maybe gradient clipping and normalizing the svf of demonstrations and the policy might help as well
+    # As a side note: this is not mentioned anywhere in the publications, but for me this countermeasure works
+    # pretty well (on the other hand, using a deterministic policy is never mentioned in any of the
+    # publications; they always describe optimizing with stochastic policies)
+    p /= T
   return p
 
 
-def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
+def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   """
   Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)
 
@@ -309,17 +340,15 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
   N_STATES, _, N_ACTIONS = np.shape(P_a)
 
   # init nn model
-  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse)
+  nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False)
 
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
   p_start_state = start_state_probs(trajs, N_STATES)
 
   P_a_t = P_a.transpose(0, 2, 1)
-  if sparse:
-    mask = P_a_t > 0
-    indices = np.argwhere(mask)
-    P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+
+  grads = list()
 
   # training
   for iteration in range(n_iters):
@@ -327,10 +356,10 @@
     print 'iteration: {}'.format(iteration)
 
     # compute the reward matrix
-    rewards = nn_r.get_rewards(feat_map)
+    # rewards = nn_r.get_rewards(feat_map)
 
     # compute policy
-    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
+    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)
 
     # compute rewards and policy at the same time
     #t = time.time()
@@ -338,33 +367,54 @@
     #print('tensorflow VI', time.time() - t)
 
     # compute expected svf
-    #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)
+    #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)
 
     rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001)
+
+    #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False)
+
     # compute gradients on rewards:
     grad_r = mu_D - mu_exp
+    grads.append(grad_r)
 
-    assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=True)
-    assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=True)
+    # apply gradients to the neural network
+    grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)
+
+
+  print('grad mean', np.mean(grads, axis=0))
+  print('grad std', np.std(grads, axis=0))
+
+  rewards = nn_r.get_rewards(feat_map)
+  # return sigmoid(normalize(rewards))
+  return normalize(rewards)
+def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic):
+  assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
+                                                                 deterministic=deterministic)
+  assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
+                                                                             deterministic=deterministic)
+  assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001)
+
+  assert (np.abs(assert_values - assert_values2) < 0.0001).all()
   assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
   assert (np.abs(values - assert_values) < 0.0001).all()
   assert (np.abs(values - assert_values_old) < 0.0001).all()
+  print(assert_policy)
+  print(assert_policy_old)
+  print(policy)
+  print(values)
+  print(assert_values)
+  print(rewards)
   assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
-  assert (np.abs(policy - assert_policy) < 0.001).all()
-  assert (np.abs(policy - assert_policy_old) < 0.001).all()
-
-  assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all()
-  assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all()
+  assert (np.abs(policy - assert_policy) < 0.0001).all()
+  assert (np.abs(policy - assert_policy_old) < 0.0001).all()
 
-  # apply gradients to the neural network
-  grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)
+  assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
+  assert (
+    np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
 
-
-  rewards = nn_r.get_rewards(feat_map)
-  # return sigmoid(normalize(rewards))
-  return normalize(rewards)
+  print('tf sum SVF', mu_exp.sum())
diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index e1c62e6..413ed3f 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -27,7 +27,6 @@ PARSER.set_defaults(rand_start=True)
 PARSER.add_argument('-lr', '--learning_rate', default=0.02, type=float, help='learning rate')
 PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations')
-PARSER.add_argument('-s', '--sparse', default=False, action='store_true', help='flag to use sparse tensors in tf')
 ARGS = PARSER.parse_args()
 print ARGS
 
@@ -99,20 +98,15 @@ def main():
 
   trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
 
-  mu = np.zeros([N_STATES])
-
-  for traj in trajs:
-    mu[traj[0].cur_state] += 1
-  mu = mu / len(trajs)
 
   print 'Deep Max Ent IRL training ..'
   t = time.time()
-  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.sparse)
+  rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
  print('time for dirl', time.time() - t)
 
   values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
-  print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy))
+  #print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy))
 
   # plots
   plt.figure(figsize=(20,4))
diff --git a/maxent_irl.py b/maxent_irl.py
index e528006..20559e5 100644
--- a/maxent_irl.py
+++ b/maxent_irl.py
@@ -104,39 +104,3 @@ def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
   rewards = np.dot(feat_map, theta)
   # return sigmoid(normalize(rewards))
   return normalize(rewards)
-
-def value(policy, n_states, transition_probabilities, reward, discount,
-          threshold=1e-2):
-  """
-  FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10
-
-  Find the value function associated with a policy.
-
-  policy: List of action ints for each state.
-  n_states: Number of states. int.
-  transition_probabilities: Function taking (state, action, state) to
-    transition probabilities.
-  reward: Vector of rewards for each state.
-  discount: MDP discount factor. float.
-  threshold: Convergence threshold, default 1e-2. float.
-  -> Array of values for each state
-  """
-  v = np.zeros(n_states)
-
-  diff = float("inf")
-  while diff > threshold:
-    diff = 0
-    for s in range(n_states):
-      vs = v[s]
-      a = policy[s]
-      v[s] = sum(transition_probabilities[s, a, k] *
-                 (reward[k] + discount * v[k])
-                 for k in range(n_states))
-      diff = max(diff, abs(vs - v[s]))
-
-  return v
-
-def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True):
-  v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma)
-
-  return optimal_value.dot(p_start) - v.dot(p_start)
diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py
index 5a42296..718957a 100644
--- a/mdp/value_iteration.py
+++ b/mdp/value_iteration.py
@@ -17,8 +17,12 @@ def softmax(x):
   """Compute softmax values for each sets of scores in x."""
-  e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis])
-  return e_x / e_x.sum(axis=-1)[:, np.newaxis]
+  if len(x.shape) > 1:
+    e_x = np.exp(x - np.max(x, axis=-1)[:, np.newaxis])
+    return e_x / e_x.sum(axis=-1)[:, np.newaxis]
+  else:
+    e_x = np.exp(x - np.max(x))
+    return e_x / e_x.sum()
 
 
 def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True):
   """
@@ -106,6 +110,7 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True):
   if chunk_size == 0:
     chunk_size = N_STATES
+  rewards_expanded = rewards[:, np.newaxis].repeat(N_STATES, axis=1)
 
   count = 0
   # estimate values
   while True:
@@ -113,10 +118,10 @@
     values_tmp = values.copy()
 
     def step(start, end):
-      tmp = rewards[start:end, np.newaxis].repeat(N_STATES, axis=1) + gamma * values_tmp
-      tmp = tmp[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
-      tmp = np.transpose(tmp, (0, 2, 1))
-      values[start:end] = (P[start:end, :, :] * tmp).sum(axis=2).max(axis=1)
+      expected_value = rewards_expanded[start:end, :] + gamma * values_tmp
+      expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+      expected_value = np.transpose(expected_value, (0, 2, 1))
+      values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1)
 
     with ThreadPoolExecutor(max_workers=num_cpus) as e:
       futures = list()
@@ -134,17 +139,20 @@ def step(start, end):
       print('VI', count)
       break
 
+  expected_value = rewards_expanded + gamma * values
+  expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+  expected_value = np.transpose(expected_value, (0, 2, 1))
+
+
   if deterministic:
     # generate deterministic policy
-    policy = np.argmax((P * (rewards + gamma * values_tmp)).sum(axis=2), axis=1)
-
+    policy = np.argmax((P * expected_value).sum(axis=2), axis=1)
     print(time.time() - t)
 
     return values, policy
   else:
-    # generate stochastic policy
-    policy = (P * (rewards + gamma * values_tmp)).sum(axis=2)
+    policy = (P * expected_value).sum(axis=2)
     policy = softmax(policy)
 
     print(time.time() - t)
@@ -347,8 +355,75 @@ def get_action(self, state):
     return actions[a_id]
 
+def optimal_value(n_states, n_actions, transition_probabilities, reward,
+                  discount, threshold=1e-2):
+  """
+  FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py
+  Find the optimal value function.
+  n_states: Number of states. int.
+  n_actions: Number of actions. int.
+  transition_probabilities: Function taking (state, action, state) to
+    transition probabilities.
+  reward: Vector of rewards for each state.
+  discount: MDP discount factor. float.
+  threshold: Convergence threshold, default 1e-2. float.
+  -> Array of values for each state
+  """
+
+  v = np.zeros(n_states)
+  new_v = np.zeros(n_states)
+
+  diff = float("inf")
+  while diff > threshold:
+    diff = 0
+    v = new_v.copy()
+    for s in range(n_states):
+      max_v = float("-inf")
+      for a in range(n_actions):
+        tp = transition_probabilities[s, a, :]
+        max_v = max(max_v, np.dot(tp, reward[s] + discount * v))
+        max_v = max(max_v, np.sum(tp * (reward[s] + discount*v)))
+
+      new_diff = abs(v[s] - max_v)
+      if new_diff > diff:
+        diff = new_diff
+      new_v[s] = max_v
+
+  return v
+def value(policy, n_states, transition_probabilities, reward, discount,
+          threshold=1e-2):
+  """
+  FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10
+
+  Find the value function associated with a policy.
+
+  policy: List of action ints for each state.
+  n_states: Number of states. int.
+  transition_probabilities: Function taking (state, action, state) to
+    transition probabilities.
+  reward: Vector of rewards for each state.
+  discount: MDP discount factor. float.
+  threshold: Convergence threshold, default 1e-2. float.
+  -> Array of values for each state
+  """
+  v = np.zeros(n_states)
+
+  diff = float("inf")
+  while diff > threshold:
+    diff = 0
+    for s in range(n_states):
+      vs = v[s]
+      a = policy[s]
+      v[s] = sum(transition_probabilities[s, a, k] *
+                 (reward[k] + discount * v[k])
+                 for k in range(n_states))
+      diff = max(diff, abs(vs - v[s]))
+  return v
+def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True):
+  v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma)
+  return optimal_value.dot(p_start) - v.dot(p_start)
\ No newline at end of file
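For reference when reviewing the value-iteration changes above: the tiled expected_value tensors in _vi and the rewards_expanded array in value_iteration both implement the backup V(s) = max_a sum_s' P[s, a, s'] * (R(s) + gamma * V(s')), with the reward indexed by the current state and the transition matrix fed in (state, action, next_state) order (P_a_t = P_a.transpose(0, 2, 1)). The plain-NumPy sketch below is illustrative only and not part of the patch (the name bellman_backup_sketch is invented here); it mirrors that backup plus the argmax/softmax policy extraction, which is essentially the cross-check assert_all_the_stuff performs against value_iteration.optimal_value.

import numpy as np


def bellman_backup_sketch(P, rewards, gamma, epsilon=1e-6, deterministic=False):
  """Illustrative sketch only. P: (S, A, S) with P[s, a, s_next]; rewards: (S,) indexed by the current state."""
  n_states = P.shape[0]
  values = np.zeros(n_states)
  while True:
    # expected_value[s, s_next] = R(s) + gamma * V(s_next) -- the quantity the patch builds
    # by tiling rewards_expanded and broadcasting the old values over next states.
    expected_value = rewards[:, None] + gamma * values[None, :]
    # Q(s, a) = sum_s' P[s, a, s'] * expected_value[s, s']
    q = np.einsum('sat,st->sa', P, expected_value)
    new_values = q.max(axis=1)
    converged = np.max(np.abs(new_values - values)) <= epsilon
    values = new_values
    if converged:
      break

  expected_value = rewards[:, None] + gamma * values[None, :]
  q = np.einsum('sat,st->sa', P, expected_value)
  if deterministic:
    policy = q.argmax(axis=1)                   # mirrors the tf.argmax branch
  else:
    e = np.exp(q - q.max(axis=1, keepdims=True))
    policy = e / e.sum(axis=1, keepdims=True)   # row-wise softmax, as in mdp/value_iteration.softmax
  return values, policy

The T-scaling discussed in the NOTE comments (mu /= self.T, p /= T) is applied afterwards, to the visitation frequencies propagated from the start-state distribution, not inside this backup.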