Feature/deterministic model dynamics #6

Merged · 14 commits · Dec 4, 2017
deep_maxent_irl.py: 247 changes (169 additions, 78 deletions)
@@ -16,7 +16,7 @@
class DeepIRLFC:


def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic_env=False, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
if len(n_input) > 1:
self.height, self.width = n_input
self.n_input = self.height * self.width
@@ -26,6 +26,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi
self.n_h1 = n_h1
self.n_h2 = n_h2
self.name = name
self.deterministic_env = deterministic_env
self.deterministic = deterministic
self.sparse = sparse
self.conv = conv
@@ -35,16 +36,23 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi
self.sess = tf.Session(config=config)
self.input_s, self.reward, self.theta = self._build_network(self.name, conv)

if self.deterministic_env:
p_a_shape = (self.n_input, n_actions)
p_a_dtype = tf.int32
else:
p_a_shape = (self.n_input, n_actions, self.n_input)
p_a_dtype = tf.float32

# value iteration
if sparse:
self.P_a = tf.sparse_placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input))
self.P_a = tf.sparse_placeholder(p_a_dtype, shape=p_a_shape)
self.reduce_max_sparse = tf.sparse_reduce_max_sparse
self.reduce_sum_sparse = tf.sparse_reduce_sum_sparse
self.reduce_max = tf.sparse_reduce_max
self.reduce_sum = tf.sparse_reduce_sum
self.sparse_transpose = tf.sparse_transpose
else:
self.P_a = tf.placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input))
self.P_a = tf.placeholder(p_a_dtype, shape=p_a_shape)
self.reduce_max = tf.reduce_max
self.reduce_max_sparse = tf.reduce_max
self.reduce_sum = tf.reduce_sum
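For reference, the new `deterministic_env` flag swaps the `(n_states, n_actions, n_states)` probability tensor for an integer next-state table of shape `(n_states, n_actions)`. A minimal NumPy sketch of the two representations (toy 3-state MDP, names are illustrative):

```python
import numpy as np

n_states, n_actions = 3, 2

# Deterministic dynamics: integer table of next states, shape (n_states, n_actions).
P_det = np.array([[1, 2],
                  [2, 0],
                  [0, 1]], dtype=np.int32)   # P_det[s, a] = s'

# Equivalent stochastic form: one-hot probabilities, shape (n_states, n_actions, n_states).
P_full = np.zeros((n_states, n_actions, n_states), dtype=np.float32)
P_full[np.arange(n_states)[:, None], np.arange(n_actions)[None, :], P_det] = 1.0

assert np.array_equal(P_full.argmax(axis=2), P_det)
```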
@@ -81,10 +89,10 @@ def _build_network(self, name, conv):
if conv:
input_s = tf.placeholder(tf.float32, [None, self.width, self.height, 1])
with tf.variable_scope(name):
#conv1 = tf_utils.conv2d(input_s, 64, (1, 1), 1)
#conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1)
#conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1)
reward = tf_utils.conv2d(input_s, 1, (1, 1), 1)
conv1 = tf_utils.conv2d(input_s, 64, (3, 3), 1)
conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1)
conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1)
reward = tf_utils.conv2d(conv3, 1, (1, 1), 1)
theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
return input_s, tf.squeeze(tf.reshape(reward, (-1, self.n_input))), theta
else:
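As a rough sketch of what the enabled convolutional branch computes: a small conv stack that ends in a 1x1 convolution, producing one reward value per grid cell, which is then flattened to a per-state reward vector. The stand-in below uses `tf.layers.conv2d` with an assumed ELU activation and 'same' padding, since the exact behavior of `tf_utils.conv2d` is not shown in this diff:

```python
import tensorflow as tf  # TF 1.x

def conv_reward_head(input_s):
    # input_s: (batch, height, width, 1) state feature grid (hypothetical stand-in)
    h = tf.layers.conv2d(input_s, filters=64, kernel_size=(3, 3), padding='same', activation=tf.nn.elu)
    h = tf.layers.conv2d(h, filters=32, kernel_size=(1, 1), padding='same', activation=tf.nn.elu)
    h = tf.layers.conv2d(h, filters=32, kernel_size=(1, 1), padding='same', activation=tf.nn.elu)
    reward = tf.layers.conv2d(h, filters=1, kernel_size=(1, 1), padding='same')  # one reward per cell
    return tf.reshape(reward, (-1,))  # flatten to one reward per state (batch size 1 assumed)
```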
@@ -102,13 +110,18 @@ def _vi(self, rewards):

rewards_expanded = rewards #tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input])

def body(i, c, t):
old_values = t.read(i)
def vi_step(values):
if self.deterministic_env:
new_value = tf.gather(rewards_expanded, self.P_a) + self.gamma * tf.gather(values, self.P_a)
else:
new_value = self.reduce_sum_sparse(self.P_a * (rewards_expanded + self.gamma * values), axis=2)

expected_value = rewards_expanded + self.gamma * old_values
#expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
return new_value

new_values = self.reduce_max(self.reduce_sum_sparse(self.P_a * expected_value, axis=2), axis=1)
def body(i, c, t):
old_values = t.read(i)
new_values = vi_step(old_values)
new_values = self.reduce_max(new_values, axis=1)
t = t.write(i + 1, new_values)

c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
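For clarity, `vi_step` is the usual Bellman backup with the expectation over next states replaced by an index lookup when the environment is deterministic. A NumPy sketch of the same computation (illustrative names):

```python
import numpy as np

def vi_step(values, rewards, P_a, gamma):
    # One Bellman backup; returns Q with shape (n_states, n_actions).
    if P_a.ndim == 2:
        # deterministic env: P_a[s, a] is the next-state index
        return rewards[P_a] + gamma * values[P_a]
    # stochastic env: P_a[s, a, s'] is a transition probability
    return (P_a * (rewards + gamma * values)).sum(axis=2)

# one sweep of value iteration, mirroring the while_loop body: V <- max_a Q(s, a)
# values = vi_step(values, rewards, P_a, gamma).max(axis=1)
```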
@@ -125,42 +138,60 @@ def condition(i, c, t):
i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False,
name='VI_loop')
values = values.read(i)

expected_value = rewards_expanded + self.gamma * values
#expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
new_values = vi_step(values)

if self.deterministic:
policy = tf.argmax(self.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
policy = tf.argmax(new_values, axis=1)
else:
policy = self.reduce_sum(self.P_a * expected_value, axis=2)
policy = tf.nn.softmax(policy)
policy = tf.nn.softmax(new_values)

return values, policy

def _svf(self, policy):
if self.deterministic:
r = tf.range(self.n_input, dtype=tf.int64)
expanded = tf.expand_dims(policy, 1)
tiled = tf.tile(expanded, [1, self.n_input])

grid = tf.meshgrid(r, r)
indices = tf.stack([grid[1], grid[0], tiled], axis=2)

P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices)
P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0))
if not self.deterministic_env:
if self.deterministic:
r = tf.range(self.n_input, dtype=tf.int64)
expanded = tf.expand_dims(policy, 1)
tiled = tf.tile(expanded, [1, self.n_input])

grid = tf.meshgrid(r, r)
indices = tf.stack([grid[1], grid[0], tiled], axis=2)

P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices)
P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0))
else:
P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2)
else:
P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2)
if self.deterministic:
r = tf.range(self.n_input, dtype=tf.int64)
indices = tf.stack([r, policy], axis=1)

P_a_cur_policy = tf.gather_nd(self.P_a, indices)
else:
P_a_cur_policy = self.P_a

mu = list()
mu.append(self.mu)
with tf.variable_scope('svf'):
if self.deterministic:
for t in range(self.T - 1):
cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1)
if self.deterministic_env:
# TODO using a variable here seems a little hacky
# https://github.com/tensorflow/tensorflow/issues/2358
cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False)
cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,)))
cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, mu[t])
else:
cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1)
mu.append(cur_mu)
else:
for t in range(self.T - 1):
cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2),
if self.deterministic_env:
cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False)
cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,)))
cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, tf.expand_dims(mu[t], axis=1) * policy)
else:
cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2),
[1, tf.shape(policy)[1],
self.n_input]) * P_a_cur_policy, axis=1),
axis=0)
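The reworked `_svf` propagates visitation mass along deterministic transitions with a scatter-add instead of a dense reduction. A NumPy sketch of a single propagation step for the `deterministic_env` branch (illustrative names; `tf.scatter_add` plays the role of `np.add.at` in the graph):

```python
import numpy as np

def svf_step(mu_t, P_det, policy, deterministic_policy):
    # mu_t: (n_states,) visitation mass at time t; P_det[s, a] = next state.
    n_states, n_actions = P_det.shape
    mu_next = np.zeros(n_states)
    if deterministic_policy:
        next_states = P_det[np.arange(n_states), policy]      # (n_states,)
        np.add.at(mu_next, next_states, mu_t)                  # duplicates accumulate
    else:
        np.add.at(mu_next, P_det, mu_t[:, None] * policy)      # weight mass by action probs
    return mu_next
```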
@@ -192,6 +223,10 @@ def get_rewards(self, states):
return rewards

def get_policy(self, states, P_a, gamma, epsilon=0.01):
if self.conv:
states = np.expand_dims(np.expand_dims(states, axis=0), axis=-1)
else:
states = np.expand_dims(states, axis=0)
return self.sess.run([self.reward, self.values, self.policy],
feed_dict={self.input_s: states, self.P_a: P_a, self.gamma: gamma, self.epsilon: epsilon})

@@ -239,7 +274,10 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
p Nx1 vector - state visitation frequencies
"""
tt = time.time()
N_STATES, _, N_ACTIONS = np.shape(P_a)
if len(P_a.shape) == 3:
N_STATES, _, N_ACTIONS = np.shape(P_a)
else:
N_STATES, N_ACTIONS = np.shape(P_a)

T = len(trajs[0])
# mu[s, t] is the prob of visiting state s at time t
@@ -253,18 +291,38 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
if chunk_size == 0:
chunk_size = N_STATES

if deterministic:
P_az = P_a[np.arange(0, N_STATES), :, policy]
else:
P_a = P_a.transpose(0, 2, 1)

def step(t, start, end):
if len(P_a.shape) == 3:
if deterministic:
mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
P_az = P_a[np.arange(0, N_STATES), :, policy]
else:
mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
P_a = P_a.transpose(0, 2, 1)
else:
if deterministic:
P_az = P_a[np.arange(N_STATES), policy]

if len(P_a.shape) == 3:
def step(t, start, end):
if deterministic:
mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
else:
mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
else:
def step(t, start, end):
if deterministic:
# The following needs to be done with an unbuffered ufunc (np.add.at):
# https://stackoverflow.com/questions/41990028/add-multiple-values-to-one-numpy-array-index
# P_az[start:end] can map several source states to the same target state; with ordinary fancy
# indexing only the last addition would take effect!
# https://stackoverflow.com/questions/15973827/handling-of-duplicate-indices-in-numpy-assignments
# mu[P_az[start:end], t + 1] += mu[start:end, t]
np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t])
else:
# mu[P_a[start:end, :], t + 1] += mu[start:end, t, np.newaxis] * policy[start:end, :]
np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :])

with ThreadPoolExecutor(max_workers=num_cpus) as e:

with ThreadPoolExecutor(max_workers=1) as e:
for t in range(T - 1):
futures = list()
for i in range(0, N_STATES, chunk_size):
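The comment above is the key subtlety: plain fancy-index assignment is buffered, so when several source states map to the same target only the last contribution survives, whereas `np.add.at` applies the addition unbuffered and accumulates them all. A small demonstration:

```python
import numpy as np

mu_next = np.zeros(3)
next_states = np.array([2, 2, 0])        # two source states land in state 2
mass = np.array([0.4, 0.5, 0.1])

mu_next[next_states] += mass             # buffered: state 2 keeps only the last write
print(mu_next)                           # [0.1 0.  0.5]

mu_next[:] = 0
np.add.at(mu_next, next_states, mass)    # unbuffered: both contributions accumulate
print(mu_next)                           # [0.1 0.  0.9]
```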
@@ -324,7 +382,10 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
returns:
p Nx1 vector - state visitation frequencies
"""
N_STATES, _, N_ACTIONS = np.shape(P_a)
if len(P_a.shape) == 3:
N_STATES, _, N_ACTIONS = np.shape(P_a)
else:
N_STATES, N_ACTIONS = np.shape(P_a)

T = len(trajs[0])
# mu[s, t] is the prob of visiting state s at time t
@@ -336,12 +397,21 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
for t in range(T - 1):
for s in range(N_STATES):
if deterministic:
mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)])
if len(P_a.shape) == 3:
mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)])
else:
mu[P_a[s, int(policy[s])], t + 1] += mu[s, t]
else:
mu[s, t + 1] = sum(
[sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in
range(N_STATES)])
if len(P_a.shape) == 3:
mu[s, t + 1] = sum(
[sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in
range(N_STATES)])
else:
for a1 in range(N_ACTIONS):
mu[P_a[s, a1], t + 1] += mu[s, t] * policy[s, a1]


print(mu)
p = np.sum(mu, 1)
print('SUM SVF', p.sum())

@@ -377,21 +447,29 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
"""

# tf.set_random_seed(1)

N_STATES, _, N_ACTIONS = np.shape(P_a)

if len(P_a.shape) == 3:
N_STATES, _, N_ACTIONS = np.shape(P_a)
else:
N_STATES, N_ACTIONS = np.shape(P_a)

deterministic = False

# init nn model
nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, conv=conv, sparse=sparse)
nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse)

# find state visitation frequencies using demonstrations
mu_D = demo_svf(trajs, N_STATES)
p_start_state = start_state_probs(trajs, N_STATES)

P_a_t = P_a.transpose(0, 2, 1)
if sparse:
mask = P_a_t > 0
indices = np.argwhere(mask)
P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
if len(P_a.shape) == 3:
P_a_t = P_a.transpose(0, 2, 1)
if sparse:
mask = P_a_t > 0
indices = np.argwhere(mask)
P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
else:
P_a_t = P_a

grads = list()
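For context on what a 2-D `P_a` looks like in practice, here is a hypothetical helper that builds the `(n_states, n_actions)` next-state table for a deterministic grid environment (the action ordering and stay-put behavior at walls are assumptions, not taken from this repo):

```python
import numpy as np

def build_deterministic_P(height, width):
    # Next-state table for a grid; actions: 0=up, 1=down, 2=left, 3=right.
    n_states, n_actions = height * width, 4
    P = np.zeros((n_states, n_actions), dtype=np.int32)
    for s in range(n_states):
        r, c = divmod(s, width)
        moves = [(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)]
        for a, (nr, nc) in enumerate(moves):
            # stay in place when the move would leave the grid
            P[s, a] = nr * width + nc if 0 <= nr < height and 0 <= nc < width else s
    return P
```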

@@ -404,20 +482,20 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
# rewards = nn_r.get_rewards(feat_map)

# compute policy
#_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)
#_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=deterministic)

# compute rewards and policy at the same time
#t = time.time()
#rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01)
rewards, values, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.000001)
#print('tensorflow VI', time.time() - t)

# compute expected svf
#mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)
#mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)

rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001)
#print(rewards)

#assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False)
#assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic)

# compute gradients on rewards:
grad_r = mu_D - mu_exp
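For reference, `grad_r` is the standard MaxEnt IRL gradient of the demonstration log-likelihood with respect to the per-state reward, and the network weights are then updated through the chain rule (up to the optimizer's sign convention):

$$\frac{\partial L}{\partial r(s)} = \mu_D(s) - \mu_E(s), \qquad \frac{\partial L}{\partial \theta} = \sum_s \big(\mu_D(s) - \mu_E(s)\big)\,\frac{\partial r(s)}{\partial \theta}$$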
Expand All @@ -434,27 +512,40 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
# return sigmoid(normalize(rewards))
return normalize(rewards)

def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic):
assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001)

assert (np.abs(assert_values - assert_values2) < 0.0001).all()
assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
assert (np.abs(values - assert_values) < 0.0001).all()
assert (np.abs(values - assert_values_old) < 0.0001).all()

print(assert_policy)
print(assert_policy_old)
print(policy)
print(values)
print(assert_values)
print(rewards)
assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
assert (np.abs(policy - assert_policy) < 0.0001).all()
assert (np.abs(policy - assert_policy_old) < 0.0001).all()
def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic):

def assert_vi(P_a):
assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)
assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
deterministic=deterministic)

if len(P_a.shape) == 3:
assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a.transpose(0, 2, 1), rewards, gamma, threshold=0.000001)

assert (np.abs(assert_values - assert_values2) < 0.0001).all()

assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
assert (np.abs(values - assert_values) < 0.0001).all()
assert (np.abs(values - assert_values_old) < 0.0001).all()

# print(assert_policy)
# print(assert_policy_old)
# print(policy)
# print(values)
# print(assert_values)
# print(rewards)
assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
assert (np.abs(policy - assert_policy) < 0.0001).all()
assert (np.abs(policy - assert_policy_old) < 0.0001).all()

assert_vi(P_a)
if len(P_a.shape) == 2:
print('creating full transition matrix')
# construct the full transition matrix and make sure the values are the same
P_a_t = np.zeros((N_STATES, N_ACTIONS, N_STATES))
P_a_t[np.arange(N_STATES)[:, None], np.arange(N_ACTIONS)[None, :], P_a] = 1
assert_vi(P_a_t.transpose(0, 2, 1))

assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
assert (