Merge pull request #2 from magnusja/bugfix/tf-vi-debug-cleanup
Bugfix/tf vi debug cleanup
magnusja authored Nov 29, 2017
2 parents 49ee38a + 3e77ea8 commit c049c20
Showing 4 changed files with 184 additions and 101 deletions.
144 changes: 97 additions & 47 deletions deep_maxent_irl.py
@@ -16,23 +16,19 @@
class DeepIRLFC:


def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, name='deep_irl_fc'):
def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, name='deep_irl_fc'):
self.n_input = n_input
self.lr = lr
self.n_h1 = n_h1
self.n_h2 = n_h2
self.name = name
self.sparse = sparse
self.deterministic = deterministic

self.sess = tf.Session()
self.input_s, self.reward, self.theta = self._build_network(self.name)

# value iteration
if sparse:
self.P_a = tf.sparse_placeholder(tf.float32, shape=(n_input, n_actions, n_input))
else:
self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
self.P_a = tf.placeholder(tf.float32, shape=(n_input, n_actions, n_input))
self.gamma = tf.placeholder(tf.float32)
self.epsilon = tf.placeholder(tf.float32)
self.values, self.policy = self._vi(self.reward)
@@ -68,23 +64,24 @@ def _build_network(self, name):
initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
reward = tf_utils.fc(fc2, 1, scope="reward")
theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
return input_s, reward, theta
return input_s, tf.squeeze(reward), theta
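The squeeze drops the trailing unit dimension of the fc output, so the reward enters the value-iteration graph with shape (n_states,) instead of (n_states, 1). A minimal NumPy illustration (not part of the diff) of how that trailing dimension would otherwise change broadcasting:

import numpy as np

r = np.zeros((5, 1))                # raw fc output: one reward per state, trailing unit dim
v = np.zeros(5)                     # a value vector of shape (n_states,)
print((r + v).shape)                # (5, 5) -- unintended pairwise broadcast
print((np.squeeze(r) + v).shape)    # (5,)   -- the shape the VI graph works with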

def _vi(self, rewards):

rewards = tf.squeeze(rewards)
rewards_expanded = tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input])

def body(i, c, t):
old_values = t.read(i)
if self.sparse:
new_values = tf.sparse_reduce_max(
tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)
else:
new_values = tf.reduce_max(tf.reduce_sum(self.P_a * (rewards + self.gamma * old_values), axis=2), axis=1)

expected_value = rewards_expanded + self.gamma * old_values
expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])

new_values = tf.reduce_max(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
t = t.write(i + 1, new_values)

c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
c.set_shape(())
t = t.write(i + 1, new_values)

return i + 1, c, t

def condition(i, c, t):
@@ -97,18 +94,13 @@ def condition(i, c, t):
name='VI_loop')
values = values.read(i)

expected_value = rewards_expanded + self.gamma * values
expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])

if self.deterministic:
if self.sparse:
policy = tf.argmax(tf.sparse_tensor_to_dense(tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2)), axis=1)
else:
policy = tf.argmax(tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2), axis=1)
policy = tf.argmax(tf.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
else:
if self.sparse:
policy = tf.sparse_tensor_to_dense(
tf.sparse_reduce_sum_sparse(self.P_a * (rewards + self.gamma * values), axis=2))
else:
policy = tf.reduce_sum(self.P_a * (rewards + self.gamma * values), axis=2)

policy = tf.reduce_sum(self.P_a * expected_value, axis=2)
policy = tf.nn.softmax(policy)

return values, policy
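For reference, a minimal NumPy sketch (not part of the diff) of the Bellman backup the tf.while_loop above encodes, assuming a dense P_a of shape (n_states, n_actions, n_states) with P_a[s, a, s2] = P(s2 | s, a):

import numpy as np

def vi_backup_sketch(P_a, rewards, gamma, epsilon):
    # Iterates V(s) = max_a sum_s2 P(s2 | s, a) * (r(s) + gamma * V(s2)) until the
    # largest per-state change falls below epsilon, mirroring the loop condition above.
    n_states = P_a.shape[0]
    values = np.zeros(n_states)
    while True:
        expected = rewards[:, None, None] + gamma * values[None, None, :]   # broadcasts to (S, A, S)
        new_values = np.max(np.sum(P_a * expected, axis=2), axis=1)
        if np.max(np.abs(new_values - values)) <= epsilon:
            return new_values
        values = new_values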
@@ -140,7 +132,20 @@ def _svf(self, policy):
mu.append(cur_mu)

mu = tf.stack(mu)
return tf.reduce_sum(mu, axis=0)
mu = tf.reduce_sum(mu, axis=0)
if self.deterministic:
# NOTE: In the deterministic case it helps to scale the SVF by T to recover the reward properly.
# I noticed that if it is not scaled by T, the recovered reward and the resulting value function
# have extremely low values (usually < 0.01). With such low values it is hard to recover any
# difference in the value of states (often only the last few digits after the decimal point differ).
# One intuition for why scaling by T helps is that it stabilizes the gradients and keeps them
# from getting too large.
# TODO: gradient clipping and normalizing the SVF of the demonstrations and of the policy might help as well.
# As a side note: this countermeasure is not mentioned in any of the publications, but it works
# pretty well for me (then again, using a deterministic policy is never mentioned in the
# publications either; they always describe optimizing with stochastic policies).
mu /= self.T
return mu


def get_theta(self):
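Regarding the mu /= self.T scaling in _svf above, a quick numerical sanity check (made-up shapes, not part of the diff): each propagated per-timestep state distribution sums to 1, so the unnormalized SVF sums to roughly T, and dividing by T brings it back to the scale of a single distribution over states:

import numpy as np

T, S = 20, 25
mu_t = np.full((S, T), 1.0 / S)   # stand-in for the T propagated state distributions
p = mu_t.sum(axis=1)              # unnormalized SVF
print(p.sum())                    # ~T
print((p / T).sum())              # ~1.0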
@@ -230,6 +235,19 @@ def step(t, start, end):

p = np.sum(mu, 1)

if deterministic:
# NOTE: In the deterministic case it helps to scale the SVF by T to recover the reward properly.
# I noticed that if it is not scaled by T, the recovered reward and the resulting value function
# have extremely low values (usually < 0.01). With such low values it is hard to recover any
# difference in the value of states (often only the last few digits after the decimal point differ).
# One intuition for why scaling by T helps is that it stabilizes the gradients and keeps them
# from getting too large.
# TODO: gradient clipping and normalizing the SVF of the demonstrations and of the policy might help as well.
# As a side note: this countermeasure is not mentioned in any of the publications, but it works
# pretty well for me (then again, using a deterministic policy is never mentioned in the
# publications either; they always describe optimizing with stochastic policies).
p /= T

print(time.time() - tt)
return p
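A rough NumPy sketch (hypothetical helper, not part of the diff) of the dynamic-programming propagation this function performs, assuming P_a[s, a, s2] = P(s2 | s, a), a start-state distribution p_start, and a horizon T:

import numpy as np

def expected_svf_sketch(P_a, policy, p_start, T, deterministic=True):
    # policy: int array of greedy actions (deterministic) or an (S, A) matrix of
    # action probabilities (stochastic).
    S = P_a.shape[0]
    mu = np.zeros((S, T))
    mu[:, 0] = p_start
    for t in range(T - 1):
        if deterministic:
            step_matrix = P_a[np.arange(S), policy, :]           # (S, S): P(s2 | s, policy[s])
        else:
            step_matrix = np.einsum('sa,sak->sk', policy, P_a)   # marginalize over actions
        mu[:, t + 1] = mu[:, t] @ step_matrix
    p = mu.sum(axis=1)
    return p / T if deterministic else p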

@@ -283,10 +301,23 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
range(N_STATES)])

p = np.sum(mu, 1)
print('SUM SVF', p.sum())
if deterministic:
# NOTE: In the deterministic case it helps to scale the SVF by T to recover the reward properly.
# I noticed that if it is not scaled by T, the recovered reward and the resulting value function
# have extremely low values (usually < 0.01). With such low values it is hard to recover any
# difference in the value of states (often only the last few digits after the decimal point differ).
# One intuition for why scaling by T helps is that it stabilizes the gradients and keeps them
# from getting too large.
# TODO: gradient clipping and normalizing the SVF of the demonstrations and of the policy might help as well.
# As a side note: this countermeasure is not mentioned in any of the publications, but it works
# pretty well for me (then again, using a deterministic policy is never mentioned in the
# publications either; they always describe optimizing with stochastic policies).
p /= T
return p


def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
"""
Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)
@@ -309,62 +340,81 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, sparse):
N_STATES, _, N_ACTIONS = np.shape(P_a)

# init nn model
nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=True, sparse=sparse)
nn_r = DeepIRLFC(feat_map.shape[1], N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False)

# find state visitation frequencies using demonstrations
mu_D = demo_svf(trajs, N_STATES)
p_start_state = start_state_probs(trajs, N_STATES)

P_a_t = P_a.transpose(0, 2, 1)
if sparse:
mask = P_a_t > 0
indices = np.argwhere(mask)
P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)

grads = list()

# training
for iteration in range(n_iters):
if iteration % (n_iters/10) == 0:
print 'iteration: {}'.format(iteration)

# compute the reward matrix
rewards = nn_r.get_rewards(feat_map)
# rewards = nn_r.get_rewards(feat_map)

# compute policy
#_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
#_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)

# compute rewards and policy at the same time
#t = time.time()
#rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01)
#print('tensorflow VI', time.time() - t)

# compute expected svf
#mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)
#mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)

rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001)

#assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False)

# compute gradients on rewards:
grad_r = mu_D - mu_exp
grads.append(grad_r)

assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, deterministic=True)
assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=True)
# apply gradients to the neural network
grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)


print('grad mean', np.mean(grads, axis=0))
print('grad std', np.std(grads, axis=0))

rewards = nn_r.get_rewards(feat_map)
# return sigmoid(normalize(rewards))
return normalize(rewards)
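The sign convention of grad_r = mu_D - mu_exp in the loop above is worth spelling out; a toy illustration with made-up numbers (not part of the diff):

import numpy as np

mu_D   = np.array([0.5, 0.3, 0.2])   # state visitation frequencies of the demonstrations
mu_exp = np.array([0.2, 0.3, 0.5])   # expected SVF under the current policy
grad_r = mu_D - mu_exp               # [ 0.3,  0.0, -0.3]
# apply_grads propagates this through the reward network, nudging rewards up for states the
# demonstrations visit more often than the current policy, and down for the opposite case.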

def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic):
assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001)

assert (np.abs(assert_values - assert_values2) < 0.0001).all()
assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
assert (np.abs(values - assert_values) < 0.0001).all()
assert (np.abs(values - assert_values_old) < 0.0001).all()

print(assert_policy)
print(assert_policy_old)
print(policy)
print(values)
print(assert_values)
print(rewards)
assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
assert (np.abs(policy - assert_policy) < 0.001).all()
assert (np.abs(policy - assert_policy_old) < 0.001).all()

assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all()
assert (np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=True)) < 0.00001).all()
assert (np.abs(policy - assert_policy) < 0.0001).all()
assert (np.abs(policy - assert_policy_old) < 0.0001).all()

# apply gradients to the neural network
grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)
assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
assert (
np.abs(mu_exp - compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()


rewards = nn_r.get_rewards(feat_map)
# return sigmoid(normalize(rewards))
return normalize(rewards)
print('tf sum SVF', mu_exp.sum())



10 changes: 2 additions & 8 deletions deep_maxent_irl_gridworld.py
@@ -27,7 +27,6 @@
PARSER.set_defaults(rand_start=True)
PARSER.add_argument('-lr', '--learning_rate', default=0.02, type=float, help='learning rate')
PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations')
PARSER.add_argument('-s', '--sparse', default=False, action='store_true', help='flag to use sparse tensors in tf')
ARGS = PARSER.parse_args()
print ARGS

@@ -99,20 +98,15 @@ def main():

trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

mu = np.zeros([N_STATES])

for traj in trajs:
mu[traj[0].cur_state] += 1
mu = mu / len(trajs)

print 'Deep Max Ent IRL training ..'
t = time.time()
rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.sparse)
rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
print('time for dirl', time.time() - t)

values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy))
#print('evd', expected_value_diff(P_a, rewards, rewards_gt, GAMMA, mu, values_gt, policy))

# plots
plt.figure(figsize=(20,4))
36 changes: 0 additions & 36 deletions maxent_irl.py
@@ -104,39 +104,3 @@ def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
rewards = np.dot(feat_map, theta)
# return sigmoid(normalize(rewards))
return normalize(rewards)

def value(policy, n_states, transition_probabilities, reward, discount,
threshold=1e-2):
"""
FROM https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/value_iteration.py#L10
Find the value function associated with a policy.
policy: List of action ints for each state.
n_states: Number of states. int.
transition_probabilities: Function taking (state, action, state) to
transition probabilities.
reward: Vector of rewards for each state.
discount: MDP discount factor. float.
threshold: Convergence threshold, default 1e-2. float.
-> Array of values for each state
"""
v = np.zeros(n_states)

diff = float("inf")
while diff > threshold:
diff = 0
for s in range(n_states):
vs = v[s]
a = policy[s]
v[s] = sum(transition_probabilities[s, a, k] *
(reward[k] + discount * v[k])
for k in range(n_states))
diff = max(diff, abs(vs - v[s]))

return v

def expected_value_diff(P_a, rewards, true_rewards, gamma, p_start, optimal_value, policy, error=0.01, deterministic=True):
v = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma)

return optimal_value.dot(p_start) - v.dot(p_start)