From b0f887bc7d50c33330209b851dc70927ba3840dd Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 16:48:48 -0800 Subject: [PATCH 01/14] Numpy VI deterministic model --- deep_maxent_irl.py | 5 +- deep_maxent_irl_gridworld.py | 10 +++- mdp/value_iteration.py | 112 ++++++++++++++++++++++++++--------- 3 files changed, 95 insertions(+), 32 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 6888538..243e2d7 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -324,7 +324,10 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru returns: p Nx1 vector - state visitation frequencies """ - N_STATES, _, N_ACTIONS = np.shape(P_a) + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) T = len(trajs[0]) # mu[s, t] is the prob of visiting state s at time t diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 0420b75..e35f257 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -93,8 +93,14 @@ def main(): rewards_gt = np.reshape(rmap_gt, H*W, order='F') P_a = gw.get_transition_mat() - values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) - + if ACT_RAND == 0: + print(P_a.transpose(0, 2, 1)) + P_an = np.argmax(P_a.transpose(0, 2, 1), axis=-1) + print(P_an) + + values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) + values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=False) + # use identity matrix as feature #feat_map = np.eye(N_STATES) feat_map = np.zeros(N_STATES).reshape((H, W)) diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 30ac6e6..6e5fed2 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -42,7 +42,11 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): values Nx1 matrix - estimated values policy Nx1 (NxN_ACTIONS if non-det) matrix - policy """ - N_STATES, _, N_ACTIONS = np.shape(P_a) + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) + values = np.zeros([N_STATES]) @@ -51,9 +55,10 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): values_tmp = values.copy() for s in range(N_STATES): - v_s = [] - values[s] = max([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in - range(N_ACTIONS)]) + if len(P_a.shape) == 3: + values[s] = max([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + values[s] = max([rewards[P_a[s, a]] + gamma * values_tmp[P_a[s, a]] for a in range(N_ACTIONS)]) if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: break @@ -62,17 +67,25 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True): # generate deterministic policy policy = np.zeros([N_STATES]) for s in range(N_STATES): - policy[s] = np.argmax([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) + if len(P_a.shape) == 3: + policy[s] = np.argmax([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + policy[s] = np.argmax([rewards[P_a[s, a]] + gamma * values[P_a[s, a]] for a in range(N_ACTIONS)]) + return values, policy else: # generate stochastic policy policy = np.zeros([N_STATES, N_ACTIONS]) 
for s in range(N_STATES): - v_s = np.array( - [sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + if len(P_a.shape) == 3: + v_s = np.array( + [sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + v_s = np.array([rewards[P_a[s, a]] + gamma * values[P_a[s, a]] for a in range(N_ACTIONS)]) + policy[s, :] = softmax(v_s).squeeze() @@ -97,13 +110,19 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): values Nx1 matrix - estimated values policy Nx1 (NxN_ACTIONS if non-det) matrix - policy """ - N_STATES, _, N_ACTIONS = np.shape(P_a) + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) values = np.zeros([N_STATES]) t = time.time() rewards = rewards.squeeze() - P = P_a.transpose(0, 2, 1) + if len(P_a.shape) == 3: + P = P_a.transpose(0, 2, 1) + else: + P = P_a num_cpus = multiprocessing.cpu_count() chunk_size = N_STATES // num_cpus @@ -117,12 +136,20 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True): count += 1 values_tmp = values.copy() - def step(start, end): - expected_value = rewards + gamma * values_tmp - #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) - #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - #expected_value = np.transpose(expected_value, (0, 2, 1)) - values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1) + if len(P.shape) == 3: + def step(start, end): + expected_value = rewards + gamma * values_tmp + #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) + #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + #expected_value = np.transpose(expected_value, (0, 2, 1)) + values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1) + else: + def step(start, end): + expected_value = rewards[P[start:end, :]] + gamma * values_tmp[P[start:end, :]] + #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) + #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + #expected_value = np.transpose(expected_value, (0, 2, 1)) + values[start:end] = expected_value.max(axis=1) with ThreadPoolExecutor(max_workers=num_cpus) as e: futures = list() @@ -140,21 +167,32 @@ def step(start, end): print('VI', count) break - expected_value = rewards + gamma * values_tmp - #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) - #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - #expected_value = np.transpose(expected_value, (0, 2, 1)) + + if len(P.shape) == 3: + expected_value = rewards + gamma * values_tmp + #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1) + #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + #expected_value = np.transpose(expected_value, (0, 2, 1)) + else: + expected_value = rewards[P] + gamma * values_tmp[P] if deterministic: # generate deterministic policy - policy = np.argmax((P * expected_value).sum(axis=2), axis=1) + if len(P.shape) == 3: + policy = np.argmax((P * expected_value).sum(axis=2), axis=1) + else: + policy = np.argmax(expected_value, axis=1) print(time.time() - t) return values, policy else: # generate stochastic policy - policy = (P * expected_value).sum(axis=2) + if len(P.shape) == 3: + policy = (P * expected_value).sum(axis=2) + else: + policy = expected_value 
+ policy = softmax(policy) print(time.time() - t) @@ -195,29 +233,45 @@ def softmax(x): while True: values_tmp = values.copy() - for s in range(N_STATES): - v_s = [] - q = [sum([P_a[s, s1, a]*(rewards[s] + gamma*values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)] - values[s] = softmax(q) + if len(P_a.shape) == 3: + for s in range(N_STATES): + q = [sum([P_a[s, s1, a]*(rewards[s] + gamma*values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)] + values[s] = softmax(q) - if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: - break + if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: + break + else: + for s in range(N_STATES): + q = [sum([(rewards[P_a[:, a]] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in + range(N_ACTIONS)] + values[s] = softmax(q) + if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error: + break if deterministic: # generate deterministic policy policy = np.zeros([N_STATES]) for s in range(N_STATES): - policy[s] = np.argmax([sum([P_a[s, s1, a]*(rewards[s]+gamma*values[s1]) + if len(P_a.shape) == 3: + policy[s] = np.argmax([sum([P_a[s, s1, a]*(rewards[s]+gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + policy[s] = np.argmax([sum([(rewards[P_a[:, a]] + gamma * values[s1]) + for s1 in range(N_STATES)]) + for a in range(N_ACTIONS)]) return values, policy else: # generate stochastic policy policy = np.zeros([N_STATES, N_ACTIONS]) for s in range(N_STATES): - v_s = np.asarray([sum([P_a[s, s1, a]*(rewards[s] + gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + if len(P_a.shape) == 3: + v_s = np.asarray([sum([P_a[s, s1, a]*(rewards[s] + gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]) + else: + v_s = np.asarray([sum([(rewards[P_a[:, a]] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in + range(N_ACTIONS)]) policy[s, :] = np.exp(v_s.squeeze() - values[s]) return values, policy From bb12fe1e5135eec3fb78abff9c799c38f7200d7a Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 17:17:55 -0800 Subject: [PATCH 02/14] Numpy SVF deterministic model --- deep_maxent_irl.py | 17 +++++++++++++---- deep_maxent_irl_gridworld.py | 10 ++++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 243e2d7..6c45a8d 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -339,12 +339,21 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru for t in range(T - 1): for s in range(N_STATES): if deterministic: - mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)]) + if len(P_a.shape) == 3: + mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)]) + else: + mu[P_a[s, int(policy[s])], t + 1] += mu[s, t] else: - mu[s, t + 1] = sum( - [sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in - range(N_STATES)]) + if len(P_a.shape) == 3: + mu[s, t + 1] = sum( + [sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in + range(N_STATES)]) + else: + for a1 in range(N_ACTIONS): + mu[P_a[s, a1], t + 1] += mu[s, t] * policy[s, a1] + + print(mu) p = np.sum(mu, 1) print('SUM SVF', p.sum()) diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index e35f257..7ee3c0c 100644 --- a/deep_maxent_irl_gridworld.py +++ 
b/deep_maxent_irl_gridworld.py
@@ -98,8 +98,8 @@ def main():
     P_an = np.argmax(P_a.transpose(0, 2, 1), axis=-1)
     print(P_an)
 
-  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False)
-  values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=False)
+  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
+  values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=True)
 
   # use identity matrix as feature
   #feat_map = np.eye(N_STATES)
@@ -115,6 +115,12 @@ def main():
 
   trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
 
+  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False)
+
+  svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False)
+  svf2 = compute_state_visition_freq_old(P_an, GAMMA, trajs, policy_gt, False)
+
+
   print 'Deep Max Ent IRL training ..'
   t = time.time()

From 1e5fc1516cd9e046ef1c2936f5f21d1c486c9c41 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 1 Dec 2017 18:06:04 -0800
Subject: [PATCH 03/14] Deterministic policy svf deterministic model works now in numpy

---
 deep_maxent_irl.py           | 39 ++++++++++++++++++++++++++++--------
 deep_maxent_irl_gridworld.py |  8 ++++----
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 6c45a8d..6fc87d9 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -239,7 +239,10 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
     p Nx1 vector - state visitation frequencies
   """
   tt = time.time()
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
 
   T = len(trajs[0])
   # mu[s, t] is the prob of visiting state s at time t
@@ -253,16 +256,36 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
   if chunk_size == 0:
     chunk_size = N_STATES
 
-  if deterministic:
-    P_az = P_a[np.arange(0, N_STATES), :, policy]
-  else:
-    P_a = P_a.transpose(0, 2, 1)
-  def step(t, start, end):
+  if len(P_a.shape) == 3:
     if deterministic:
-      mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
+      P_az = P_a[np.arange(0, N_STATES), :, policy]
     else:
-      mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
+      P_a = P_a.transpose(0, 2, 1)
+  else:
+    if deterministic:
+      P_az = P_a[np.arange(N_STATES), policy]
+
+  if len(P_a.shape) == 3:
+    def step(t, start, end):
+      if deterministic:
+        mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
+      else:
+        mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
+  else:
+    def step(t, start, end):
+      print(t)
+      if deterministic:
+        # The following needs to be done using a ufunc
+        # https://stackoverflow.com/questions/41990028/add-multiple-values-to-one-numpy-array-index
+        # P_az[start:end] sometimes points to the same state for multiple values; with the usual fancy indexing only
+        # one addition (the latest) would be executed!
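+        # quick illustration (not part of this function): with a = np.zeros(3),
+        # a[[0, 0]] += 1 leaves a[0] == 1 because the duplicated index is buffered,
+        # whereas np.add.at(a, [0, 0], 1) accumulates both updates and yields a[0] == 2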
+ # https://stackoverflow.com/questions/15973827/handling-of-duplicate-indices-in-numpy-assignments + # mu[P_az[start:end], t + 1] += mu[start:end, t] + np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t]) + else: + mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0) + with ThreadPoolExecutor(max_workers=num_cpus) as e: for t in range(T - 1): diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 7ee3c0c..5515daf 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -8,7 +8,7 @@ from mdp import gridworld from mdp import value_iteration from deep_maxent_irl import * -from maxent_irl import * + from utils import * from lp_irl import * @@ -115,10 +115,10 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) + values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) - svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False) - svf2 = compute_state_visition_freq_old(P_an, GAMMA, trajs, policy_gt, False) + svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, True) + svf2 = compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt.astype(np.int32), True) From 69c97c2da8df94aa513b7777aef2d38bf8d9855c Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 18:21:48 -0800 Subject: [PATCH 04/14] Stochastic policy svf deterministic model works now in numpy --- deep_maxent_irl.py | 5 +++-- deep_maxent_irl_gridworld.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 6fc87d9..cb9ea24 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -274,7 +274,6 @@ def step(t, start, end): mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0) else: def step(t, start, end): - print(t) if deterministic: # The following needs be be done using ufunc # https://stackoverflow.com/questions/41990028/add-multiple-values-to-one-numpy-array-index @@ -284,7 +283,9 @@ def step(t, start, end): # mu[P_az[start:end], t + 1] += mu[start:end, t] np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t]) else: - mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0) + # mu[P_a[start:end, :], t + 1] += mu[start:end, t, np.newaxis] * policy[start:end, :] + val = mu[start:end, t, np.newaxis] * policy[start:end, :] + np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :]) with ThreadPoolExecutor(max_workers=num_cpus) as e: diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 5515daf..5a3642c 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -115,10 +115,10 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) + values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) - svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, True) - svf2 
= compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt.astype(np.int32), True) + svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False) + svf2 = compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt, False) From 19eee2b50040e84abd36ab43e577f3df6d2b9e34 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Fri, 1 Dec 2017 18:37:57 -0800 Subject: [PATCH 05/14] cleanup --- deep_maxent_irl.py | 8 +++++--- deep_maxent_irl_gridworld.py | 12 +----------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index cb9ea24..fa66cba 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -284,7 +284,6 @@ def step(t, start, end): np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t]) else: # mu[P_a[start:end, :], t + 1] += mu[start:end, t, np.newaxis] * policy[start:end, :] - val = mu[start:end, t, np.newaxis] * policy[start:end, :] np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :]) @@ -413,8 +412,11 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): """ # tf.set_random_seed(1) - - N_STATES, _, N_ACTIONS = np.shape(P_a) + + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, conv=conv, sparse=sparse) diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 5a3642c..ed9fd98 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -94,12 +94,9 @@ def main(): P_a = gw.get_transition_mat() if ACT_RAND == 0: - print(P_a.transpose(0, 2, 1)) - P_an = np.argmax(P_a.transpose(0, 2, 1), axis=-1) - print(P_an) + P_a = np.argmax(P_a.transpose(0, 2, 1), axis=-1) values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) - values_gt2, policy_gt2 = value_iteration.value_iteration(P_an, rewards_gt, GAMMA, error=0.01, deterministic=True) # use identity matrix as feature #feat_map = np.eye(N_STATES) @@ -115,13 +112,6 @@ def main(): trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) - values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=False) - - svf = compute_state_visition_freq_old(P_a, GAMMA, trajs, policy_gt, False) - svf2 = compute_state_visition_freq(P_an, GAMMA, trajs, policy_gt, False) - - - print 'Deep Max Ent IRL training ..' 
t = time.time()
 
   rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.conv, ARGS.sparse)

From 6068586d251af75d5a415566229992d7451383ce Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 1 Dec 2017 18:43:55 -0800
Subject: [PATCH 06/14] Use numpy VI

---
 deep_maxent_irl_gridworld.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index ed9fd98..a5813e0 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -96,7 +96,7 @@ def main():
   if ACT_RAND == 0:
     P_a = np.argmax(P_a.transpose(0, 2, 1), axis=-1)
 
-  values_gt, policy_gt = value_iteration.value_iteration_old(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
+  values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
 
   # use identity matrix as feature
   #feat_map = np.eye(N_STATES)

From 9e74b90a3565c65e8985a8b70797ea7511e13582 Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Fri, 1 Dec 2017 19:09:47 -0800
Subject: [PATCH 07/14] Create deterministic gridworld transition matrix on the fly

---
 deep_maxent_irl_gridworld.py |  5 +++--
 mdp/gridworld.py             | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index a5813e0..114cbc0 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -91,10 +91,11 @@ def main():
   gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
 
   rewards_gt = np.reshape(rmap_gt, H*W, order='F')
-  P_a = gw.get_transition_mat()
 
   if ACT_RAND == 0:
-    P_a = np.argmax(P_a.transpose(0, 2, 1), axis=-1)
+    P_a = gw.get_transition_mat_deterministic()
+  else:
+    P_a = gw.get_transition_mat()
 
   values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
 
   # use identity matrix as feature
   #feat_map = np.eye(N_STATES)

diff --git a/mdp/gridworld.py b/mdp/gridworld.py
index e48fa80..40d7a22 100644
--- a/mdp/gridworld.py
+++ b/mdp/gridworld.py
@@ -325,6 +325,34 @@ def get_transition_mat(self):
           P_a[si, sj, a] = prob
     return P_a
 
+  def get_transition_mat_deterministic(self):
+    """
+    get the deterministic transition dynamics of the gridworld
+
+    return:
+      P_a     NxN_ACTIONS matrix -
+                P_a[s0, a] is the index of the state s1
+                reached deterministically when taking
+                action a at state s0
+    """
+    N_STATES = self.height*self.width
+    N_ACTIONS = len(self.actions)
+    P_a = np.zeros((N_STATES, N_ACTIONS), dtype=np.int32)
+    for si in range(N_STATES):
+      posi = self.idx2pos(si)
+      for a in range(N_ACTIONS):
+        probs = self.get_transition_states_and_probs(posi, a)
+
+        for posj, prob in probs:
+          sj = self.pos2idx(posj)
+          # Prob of si to sj given action a
+          prob = int(prob)
+          if prob == 1:
+            P_a[si, a] = sj
+          elif prob != 0:
+            raise ValueError('not a deterministic environment!')
+    return P_a
+
   def get_values_mat(self, values):
     """
     inputs:

From 87013d1d96963582ff957d56710dd65ea1132daa Mon Sep 17 00:00:00 2001
From: Magnus Jahnen
Date: Sat, 2 Dec 2017 16:03:34 -0800
Subject: [PATCH 08/14] TF VI deterministic model

---
 deep_maxent_irl.py | 77 +++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 29 deletions(-)

diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index fa66cba..b9c10b2 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -16,7 +16,7 @@ class DeepIRLFC:
-  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
+  def __init__(self, n_input,
n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic_env=False, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'): if len(n_input) > 1: self.height, self.width = n_input self.n_input = self.height * self.width @@ -26,6 +26,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.n_h1 = n_h1 self.n_h2 = n_h2 self.name = name + self.deterministic_env = deterministic_env self.deterministic = deterministic self.sparse = sparse self.conv = conv @@ -35,16 +36,23 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.sess = tf.Session(config=config) self.input_s, self.reward, self.theta = self._build_network(self.name, conv) + if self.deterministic_env: + p_a_shape = (self.n_input, n_actions) + p_a_dtype = tf.int32 + else: + p_a_shape = (self.n_input, n_actions, self.n_input) + p_a_dtype = tf.float32 + # value iteration if sparse: - self.P_a = tf.sparse_placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input)) + self.P_a = tf.sparse_placeholder(p_a_dtype, shape=p_a_shape) self.reduce_max_sparse = tf.sparse_reduce_max_sparse self.reduce_sum_sparse = tf.sparse_reduce_sum_sparse self.reduce_max = tf.sparse_reduce_max self.reduce_sum = tf.sparse_reduce_sum self.sparse_transpose = tf.sparse_transpose else: - self.P_a = tf.placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input)) + self.P_a = tf.placeholder(p_a_dtype, shape=p_a_shape) self.reduce_max = tf.reduce_max self.reduce_max_sparse = tf.reduce_max self.reduce_sum = tf.reduce_sum @@ -59,7 +67,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.T = T self.mu = tf.placeholder(tf.float32, self.n_input, name='mu_placerholder') - self.svf = self._svf(self.policy) + #self.svf = self._svf(self.policy) self.optimizer = tf.train.GradientDescentOptimizer(lr) @@ -102,13 +110,18 @@ def _vi(self, rewards): rewards_expanded = rewards #tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input]) - def body(i, c, t): - old_values = t.read(i) + def vi_step(values): + if self.deterministic_env: + new_value = tf.gather(rewards_expanded, self.P_a) + self.gamma * tf.gather(values, self.P_a) + else: + new_value = self.reduce_sum_sparse(self.P_a * (rewards_expanded + self.gamma * values), axis=2) - expected_value = rewards_expanded + self.gamma * old_values - #expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + return new_value - new_values = self.reduce_max(self.reduce_sum_sparse(self.P_a * expected_value, axis=2), axis=1) + def body(i, c, t): + old_values = t.read(i) + new_values = vi_step(old_values) + new_values = self.reduce_max(new_values, axis=1) t = t.write(i + 1, new_values) c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon @@ -125,15 +138,12 @@ def condition(i, c, t): i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop') values = values.read(i) - - expected_value = rewards_expanded + self.gamma * values - #expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1]) + new_values = vi_step(values) if self.deterministic: - policy = tf.argmax(self.reduce_sum(self.P_a * expected_value, axis=2), axis=1) + policy = tf.argmax(new_values, axis=1) else: - policy = self.reduce_sum(self.P_a * expected_value, axis=2) - policy = tf.nn.softmax(policy) + policy = tf.nn.softmax(new_values) return values, policy @@ -192,6 +202,10 @@ def get_rewards(self, states): return rewards def 
get_policy(self, states, P_a, gamma, epsilon=0.01): + if self.conv: + states = np.expand_dims(np.expand_dims(states, axis=0), axis=-1) + else: + states = np.expand_dims(states, axis=0) return self.sess.run([self.reward, self.values, self.policy], feed_dict={self.input_s: states, self.P_a: P_a, self.gamma: gamma, self.epsilon: epsilon}) @@ -287,7 +301,7 @@ def step(t, start, end): np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :]) - with ThreadPoolExecutor(max_workers=num_cpus) as e: + with ThreadPoolExecutor(max_workers=1) as e: for t in range(T - 1): futures = list() for i in range(0, N_STATES, chunk_size): @@ -418,18 +432,23 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) + deterministic = True + # init nn model - nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, conv=conv, sparse=sparse) + nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) # find state visitation frequencies using demonstrations mu_D = demo_svf(trajs, N_STATES) p_start_state = start_state_probs(trajs, N_STATES) - P_a_t = P_a.transpose(0, 2, 1) - if sparse: - mask = P_a_t > 0 - indices = np.argwhere(mask) - P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) + if len(P_a.shape) == 3: + P_a_t = P_a.transpose(0, 2, 1) + if sparse: + mask = P_a_t > 0 + indices = np.argwhere(mask) + P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape) + else: + P_a_t = P_a grads = list() @@ -442,20 +461,20 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): # rewards = nn_r.get_rewards(feat_map) # compute policy - #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False) + #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=deterministic) # compute rewards and policy at the same time #t = time.time() - #rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01) + rewards, values, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.000001) #print('tensorflow VI', time.time() - t) # compute expected svf - #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False) + mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic) - rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) + #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp @@ -477,9 +496,9 @@ def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, deterministic=deterministic) assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, deterministic=deterministic) - assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) + #assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) - assert (np.abs(assert_values - assert_values2) < 0.0001).all() + 
#assert (np.abs(assert_values - assert_values2) < 0.0001).all() assert (np.abs(assert_values - assert_values_old) < 0.0001).all() assert (np.abs(values - assert_values) < 0.0001).all() assert (np.abs(values - assert_values_old) < 0.0001).all() From 0b8258e11532e0b45133948a748ae674d2b60483 Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Sat, 2 Dec 2017 16:16:03 -0800 Subject: [PATCH 09/14] Additional assertion tests --- deep_maxent_irl.py | 59 ++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index b9c10b2..0b6ea09 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -432,7 +432,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) - deterministic = True + deterministic = False # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) @@ -474,7 +474,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp @@ -491,27 +491,40 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): # return sigmoid(normalize(rewards)) return normalize(rewards) -def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic): - assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, - deterministic=deterministic) - assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, - deterministic=deterministic) - #assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) - - #assert (np.abs(assert_values - assert_values2) < 0.0001).all() - assert (np.abs(assert_values - assert_values_old) < 0.0001).all() - assert (np.abs(values - assert_values) < 0.0001).all() - assert (np.abs(values - assert_values_old) < 0.0001).all() - - print(assert_policy) - print(assert_policy_old) - print(policy) - print(values) - print(assert_values) - print(rewards) - assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all() - assert (np.abs(policy - assert_policy) < 0.0001).all() - assert (np.abs(policy - assert_policy_old) < 0.0001).all() +def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic): + + def assert_vi(P_a): + assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) + assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001, + deterministic=deterministic) + + if len(P_a) == 3: + assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001) + + assert (np.abs(assert_values - assert_values2) < 0.0001).all() + + assert (np.abs(assert_values - assert_values_old) < 0.0001).all() + assert (np.abs(values - assert_values) < 0.0001).all() + assert 
(np.abs(values - assert_values_old) < 0.0001).all() + + # print(assert_policy) + # print(assert_policy_old) + # print(policy) + # print(values) + # print(assert_values) + # print(rewards) + assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all() + assert (np.abs(policy - assert_policy) < 0.0001).all() + assert (np.abs(policy - assert_policy_old) < 0.0001).all() + + assert_vi(P_a) + if len(P_a.shape) == 2: + print('creating full transistion matrix') + # construct full sparse transisiton matrix and make sure values are the same + P_a_t = np.zeros((N_STATES, N_ACTIONS, N_STATES)) + P_a_t[P_a] = 1 + assert_vi(P_a) assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all() assert ( From 155ac2af825dfabc3d14fff088a2b56fc42fc51d Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Sat, 2 Dec 2017 16:41:09 -0800 Subject: [PATCH 10/14] EVD deterministic model dynamics --- deep_maxent_irl.py | 2 +- mdp/value_iteration.py | 48 +++++++++++++++++++++++++++++------------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 0b6ea09..a7a8b0b 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -474,7 +474,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py index 6e5fed2..2fb8bb8 100644 --- a/mdp/value_iteration.py +++ b/mdp/value_iteration.py @@ -448,8 +448,13 @@ def optimal_value(n_states, n_actions, transition_probabilities, reward, return v def value_parallel(policy, P_a, rewards, gamma, threshold=1e-2): + if len(P_a.shape) == 3: + N_STATES, _, N_ACTIONS = np.shape(P_a) + else: + N_STATES, N_ACTIONS = np.shape(P_a) + deterministic = len(policy.shape) == 1 - N_STATES, _, N_ACTIONS = np.shape(P_a) + deterministic_env = len(P_a.shape) == 2 values = np.zeros([N_STATES]) @@ -458,25 +463,40 @@ def value_parallel(policy, P_a, rewards, gamma, threshold=1e-2): if chunk_size == 0: chunk_size = N_STATES - rewards_expanded = rewards[:, np.newaxis].repeat(N_STATES, axis=1) - - if deterministic: - P_az = P_a[np.arange(0, N_STATES), :, policy] + rewards_expanded = rewards + + if not deterministic_env: + if deterministic: + P_az = P_a[np.arange(0, N_STATES), :, policy] + else: + P_a = P_a.transpose(0, 2, 1) else: - P_a = P_a.transpose(0, 2, 1) + if deterministic: + P_az = P_a[np.arange(0, N_STATES), policy] + # estimate values while True: values_tmp = values.copy() def step(start, end): - expected_value = rewards_expanded[start:end, :] + gamma * values_tmp + if deterministic_env: + expected_value = rewards_expanded[P_az[start:end]] + gamma * values_tmp[P_az[start:end]] + else: + expected_value = rewards_expanded[start:end, :] + gamma * values_tmp + if deterministic: - values[start:end] = (P_az[start:end, :] * expected_value).sum(axis=1) + + if deterministic_env: + values[start:end] = expected_value + else: + values[start:end] = (P_az[start:end, :] * expected_value).sum(axis=1) else: - expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) - #expected_value = 
np.transpose(expected_value, (0, 2, 1)) - values[start:end] = (P_a[start:end, :, :] * expected_value).sum(axis=2).sum(axis=1) + if deterministic_env: + values[start:end] = (policy * expected_value).sum(axis=1) + else: + expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2) + values[start:end] = (P_a[start:end, :, :] * expected_value * policy).sum(axis=2).sum(axis=1) with ThreadPoolExecutor(max_workers=num_cpus) as e: futures = list() @@ -517,16 +537,14 @@ def value(policy, n_states, transition_probabilities, reward, discount, for s in range(n_states): vs = values_tmp[s] a = policy[s] - v[s] = sum(transition_probabilities[s, a, k] * - (reward[s] + discount * values_tmp[k]) - for k in range(n_states)) + v[s] = reward[transition_probabilities[s, a]] + discount * values_tmp[transition_probabilities[s, a]] diff = max(diff, abs(vs - v[s])) return v def expected_value_diff(P_a, true_rewards, gamma, p_start, optimal_value, policy): v = value_parallel(policy, P_a, true_rewards, gamma) - #v_old = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma) + #v_old = value(policy, P_a.shape[0], P_a, true_rewards, gamma) #if len(policy.shape) == 1: # assert (np.abs(v - v_old) < 0.001).all() From 10727f9ea93f2246d27f149cd8af31eefdfa457c Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Sat, 2 Dec 2017 17:16:27 -0800 Subject: [PATCH 11/14] First try: svf deterministic dynamics model TF --- deep_maxent_irl.py | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index a7a8b0b..74c3d13 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -67,7 +67,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi self.T = T self.mu = tf.placeholder(tf.float32, self.n_input, name='mu_placerholder') - #self.svf = self._svf(self.policy) + self.svf = self._svf(self.policy) self.optimizer = tf.train.GradientDescentOptimizer(lr) @@ -148,25 +148,39 @@ def condition(i, c, t): return values, policy def _svf(self, policy): - if self.deterministic: - r = tf.range(self.n_input, dtype=tf.int64) - expanded = tf.expand_dims(policy, 1) - tiled = tf.tile(expanded, [1, self.n_input]) - - grid = tf.meshgrid(r, r) - indices = tf.stack([grid[1], grid[0], tiled], axis=2) - - P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices) - P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0)) + if not self.deterministic_env: + if self.deterministic: + r = tf.range(self.n_input, dtype=tf.int64) + expanded = tf.expand_dims(policy, 1) + tiled = tf.tile(expanded, [1, self.n_input]) + + grid = tf.meshgrid(r, r) + indices = tf.stack([grid[1], grid[0], tiled], axis=2) + + P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices) + P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0)) + else: + P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2) else: - P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2) + if self.deterministic: + r = tf.range(self.n_input, dtype=tf.int64) + indices = tf.stack([r, policy], axis=1) + + P_a_cur_policy = tf.gather_nd(self.P_a, indices) + P_a_cur_policy = tf.Print(P_a_cur_policy, [P_a_cur_policy], 'P_a_cur_policy', summarize=500) + else: + P_a_cur_policy = self.P_a mu = list() mu.append(self.mu) with tf.variable_scope('svf'): if self.deterministic: for t in range(self.T - 1): - cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1) + if self.deterministic_env: + cur_mu = 
tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False) + cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, mu[t]) + else: + cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1) mu.append(cur_mu) else: for t in range(self.T - 1): @@ -432,7 +446,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) - deterministic = False + deterministic = True # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) @@ -469,12 +483,12 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): #print('tensorflow VI', time.time() - t) # compute expected svf - mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic) + #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic) - #rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) + rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp From 2d9b584ada118c1fc926454171545499ecc58f8f Mon Sep 17 00:00:00 2001 From: Magnus Date: Sat, 2 Dec 2017 22:07:51 -0800 Subject: [PATCH 12/14] Fix TF svf --- deep_maxent_irl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 74c3d13..28f40de 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -167,7 +167,6 @@ def _svf(self, policy): indices = tf.stack([r, policy], axis=1) P_a_cur_policy = tf.gather_nd(self.P_a, indices) - P_a_cur_policy = tf.Print(P_a_cur_policy, [P_a_cur_policy], 'P_a_cur_policy', summarize=500) else: P_a_cur_policy = self.P_a @@ -177,7 +176,10 @@ def _svf(self, policy): if self.deterministic: for t in range(self.T - 1): if self.deterministic_env: + # TODO using a variable here seems a little hacky + # https://github.com/tensorflow/tensorflow/issues/2358 cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False) + cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,))) cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, mu[t]) else: cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1) From 09b02d9d0f992260c0c84da3a0bc736cc523365b Mon Sep 17 00:00:00 2001 From: Magnus Date: Sat, 2 Dec 2017 22:14:16 -0800 Subject: [PATCH 13/14] stochastic policy deterministic model TF SVF --- deep_maxent_irl.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 28f40de..70426c2 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -186,7 +186,12 @@ def _svf(self, policy): mu.append(cur_mu) else: for t in range(self.T - 1): - cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2), + if self.deterministic_env: + cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False) + cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,))) + cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, tf.expand_dims(mu[t], axis=1) * policy) + else: + cur_mu = 
self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2), [1, tf.shape(policy)[1], self.n_input]) * P_a_cur_policy, axis=1), axis=0) @@ -448,7 +453,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): else: N_STATES, N_ACTIONS = np.shape(P_a) - deterministic = True + deterministic = False # init nn model nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse) From 2db0a12033298a356f83e9e50dcfaa932403c5bb Mon Sep 17 00:00:00 2001 From: Magnus Jahnen Date: Mon, 4 Dec 2017 14:56:54 -0800 Subject: [PATCH 14/14] deeper convnet with 3x3 filters first layer --- deep_maxent_irl.py | 10 +++++----- deep_maxent_irl_gridworld.py | 10 ++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py index 70426c2..c4a263a 100644 --- a/deep_maxent_irl.py +++ b/deep_maxent_irl.py @@ -89,10 +89,10 @@ def _build_network(self, name, conv): if conv: input_s = tf.placeholder(tf.float32, [None, self.width, self.height, 1]) with tf.variable_scope(name): - #conv1 = tf_utils.conv2d(input_s, 64, (1, 1), 1) - #conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1) - #conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1) - reward = tf_utils.conv2d(input_s, 1, (1, 1), 1) + conv1 = tf_utils.conv2d(input_s, 64, (3, 3), 1) + conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1) + conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1) + reward = tf_utils.conv2d(conv3, 1, (1, 1), 1) theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) return input_s, tf.squeeze(tf.reshape(reward, (-1, self.n_input))), theta else: @@ -495,7 +495,7 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse): rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001) #print(rewards) - assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) + #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic) # compute gradients on rewards: grad_r = mu_D - mu_exp diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py index 114cbc0..fe05ef1 100644 --- a/deep_maxent_irl_gridworld.py +++ b/deep_maxent_irl_gridworld.py @@ -101,12 +101,14 @@ def main(): # use identity matrix as feature #feat_map = np.eye(N_STATES) - feat_map = np.zeros(N_STATES).reshape((H, W)) + # feat_map = np.zeros(N_STATES).reshape((H, W)) + feat_map = np.random.rand(N_STATES).reshape((H, W)) #feat_map = np.arange(N_STATES).reshape((H, W)) if ARGS.conv: - feat_map[H-1, W-1] = -5 - feat_map[0, W-1] = -5 - feat_map[H-1, 0] = -5 + #feat_map[H-1, W-1] = -5 + #feat_map[0, W-1] = -5 + #feat_map[H-1, 0] = -5 + pass else: feat_map = feat_map.reshape(N_STATES) #feat_map = rmap_gt
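
Reference sketch (not part of the patch series): the commits above switch the deterministic gridworld to an N_STATES x N_ACTIONS matrix of successor-state indices and then index rewards and values by P_a[s, a] instead of summing over transition probabilities. The following minimal, self-contained NumPy sketch shows that value-iteration step and the np.add.at-based state-visitation-frequency propagation on a hypothetical 3-state chain; all variable names and numbers here are illustrative only.

    import numpy as np

    # toy deterministic chain: action 0 stays put, action 1 moves right (last state absorbing)
    P_a = np.array([[0, 1],
                    [1, 2],
                    [2, 2]])          # P_a[s, a] = successor state index
    rewards = np.array([0.0, 0.0, 1.0])
    gamma, eps = 0.9, 1e-6
    n_states, n_actions = P_a.shape

    # value iteration with the deterministic model: Q[s, a] = r(s') + gamma * V(s'), s' = P_a[s, a]
    values = np.zeros(n_states)
    while True:
        new_values = (rewards[P_a] + gamma * values[P_a]).max(axis=1)   # fancy indexing yields an (S, A) array
        delta = np.abs(new_values - values).max()
        values = new_values
        if delta < eps:
            break
    policy = (rewards[P_a] + gamma * values[P_a]).argmax(axis=1)

    # state-visitation frequencies for the deterministic policy over a horizon T;
    # np.add.at accumulates correctly even when several states share the same successor
    T = 5
    mu = np.zeros((n_states, T))
    mu[:, 0] = [1.0, 0.0, 0.0]        # start-state distribution
    succ = P_a[np.arange(n_states), policy]
    for t in range(T - 1):
        np.add.at(mu[:, t + 1], succ, mu[:, t])
    svf = mu.sum(axis=1)
    print(values, policy, svf)

This roughly mirrors what value_iteration and compute_state_visition_freq do for the ACT_RAND == 0 case, leaving out the thread-pool chunking and the TensorFlow port.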