diff --git a/deep_maxent_irl.py b/deep_maxent_irl.py
index 6888538..c4a263a 100644
--- a/deep_maxent_irl.py
+++ b/deep_maxent_irl.py
@@ -16,7 +16,7 @@ class DeepIRLFC:
 
-  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
+  def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, deterministic_env=False, deterministic=False, sparse=False, conv=False, name='deep_irl_fc'):
     if len(n_input) > 1:
       self.height, self.width = n_input
       self.n_input = self.height * self.width
@@ -26,6 +26,7 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi
     self.n_h1 = n_h1
     self.n_h2 = n_h2
     self.name = name
+    self.deterministic_env = deterministic_env
     self.deterministic = deterministic
     self.sparse = sparse
     self.conv = conv
@@ -35,16 +36,23 @@ def __init__(self, n_input, n_actions, lr, T, n_h1=400, n_h2=300, l2=10, determi
     self.sess = tf.Session(config=config)
     self.input_s, self.reward, self.theta = self._build_network(self.name, conv)
 
+    if self.deterministic_env:
+      p_a_shape = (self.n_input, n_actions)
+      p_a_dtype = tf.int32
+    else:
+      p_a_shape = (self.n_input, n_actions, self.n_input)
+      p_a_dtype = tf.float32
+
     # value iteration
     if sparse:
-      self.P_a = tf.sparse_placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input))
+      self.P_a = tf.sparse_placeholder(p_a_dtype, shape=p_a_shape)
       self.reduce_max_sparse = tf.sparse_reduce_max_sparse
      self.reduce_sum_sparse = tf.sparse_reduce_sum_sparse
       self.reduce_max = tf.sparse_reduce_max
       self.reduce_sum = tf.sparse_reduce_sum
       self.sparse_transpose = tf.sparse_transpose
     else:
-      self.P_a = tf.placeholder(tf.float32, shape=(self.n_input, n_actions, self.n_input))
+      self.P_a = tf.placeholder(p_a_dtype, shape=p_a_shape)
       self.reduce_max = tf.reduce_max
       self.reduce_max_sparse = tf.reduce_max
       self.reduce_sum = tf.reduce_sum
@@ -81,10 +89,10 @@ def _build_network(self, name, conv):
     if conv:
       input_s = tf.placeholder(tf.float32, [None, self.width, self.height, 1])
       with tf.variable_scope(name):
-        #conv1 = tf_utils.conv2d(input_s, 64, (1, 1), 1)
-        #conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1)
-        #conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1)
-        reward = tf_utils.conv2d(input_s, 1, (1, 1), 1)
+        conv1 = tf_utils.conv2d(input_s, 64, (3, 3), 1)
+        conv2 = tf_utils.conv2d(conv1, 32, (1, 1), 1)
+        conv3 = tf_utils.conv2d(conv2, 32, (1, 1), 1)
+        reward = tf_utils.conv2d(conv3, 1, (1, 1), 1)
         theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
       return input_s, tf.squeeze(tf.reshape(reward, (-1, self.n_input))), theta
     else:
@@ -102,13 +110,18 @@ def _vi(self, rewards):
 
     rewards_expanded = rewards #tf.tile(tf.expand_dims(rewards, 1), [1, self.n_input])
 
-    def body(i, c, t):
-      old_values = t.read(i)
+    def vi_step(values):
+      if self.deterministic_env:
+        new_value = tf.gather(rewards_expanded, self.P_a) + self.gamma * tf.gather(values, self.P_a)
+      else:
+        new_value = self.reduce_sum_sparse(self.P_a * (rewards_expanded + self.gamma * values), axis=2)
 
-      expected_value = rewards_expanded + self.gamma * old_values
-      #expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
+      return new_value
 
-      new_values = self.reduce_max(self.reduce_sum_sparse(self.P_a * expected_value, axis=2), axis=1)
+    def body(i, c, t):
+      old_values = t.read(i)
+      new_values = vi_step(old_values)
+      new_values = self.reduce_max(new_values, axis=1)
       t = t.write(i + 1, new_values)
 
       c = tf.reduce_max(tf.abs(new_values - old_values)) > self.epsilon
@@ -125,42 +138,60 @@ def condition(i, c, t):
     i, _, values = tf.while_loop(condition, body, [0, True, t], parallel_iterations=1, back_prop=False, name='VI_loop')
     values = values.read(i)
-
-    expected_value = rewards_expanded + self.gamma * values
-    #expected_value = tf.tile(tf.expand_dims(expected_value, 1), [1, tf.shape(self.P_a)[1], 1])
+    new_values = vi_step(values)
 
     if self.deterministic:
-      policy = tf.argmax(self.reduce_sum(self.P_a * expected_value, axis=2), axis=1)
+      policy = tf.argmax(new_values, axis=1)
     else:
-      policy = self.reduce_sum(self.P_a * expected_value, axis=2)
-      policy = tf.nn.softmax(policy)
+      policy = tf.nn.softmax(new_values)
 
     return values, policy
 
   def _svf(self, policy):
-    if self.deterministic:
-      r = tf.range(self.n_input, dtype=tf.int64)
-      expanded = tf.expand_dims(policy, 1)
-      tiled = tf.tile(expanded, [1, self.n_input])
-
-      grid = tf.meshgrid(r, r)
-      indices = tf.stack([grid[1], grid[0], tiled], axis=2)
-
-      P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices)
-      P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0))
+    if not self.deterministic_env:
+      if self.deterministic:
+        r = tf.range(self.n_input, dtype=tf.int64)
+        expanded = tf.expand_dims(policy, 1)
+        tiled = tf.tile(expanded, [1, self.n_input])
+
+        grid = tf.meshgrid(r, r)
+        indices = tf.stack([grid[1], grid[0], tiled], axis=2)
+
+        P_a_cur_policy = tf.gather_nd(self.sparse_transpose(self.P_a, (0, 2, 1)), indices)
+        P_a_cur_policy = tf.transpose(P_a_cur_policy, (1, 0))
+      else:
+        P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2)
     else:
-      P_a_cur_policy = self.P_a * tf.expand_dims(policy, 2)
+      if self.deterministic:
+        r = tf.range(self.n_input, dtype=tf.int64)
+        indices = tf.stack([r, policy], axis=1)
+
+        P_a_cur_policy = tf.gather_nd(self.P_a, indices)
+      else:
+        P_a_cur_policy = self.P_a
 
     mu = list()
     mu.append(self.mu)
     with tf.variable_scope('svf'):
       if self.deterministic:
         for t in range(self.T - 1):
-          cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1)
+          if self.deterministic_env:
+            # TODO using a variable here seems a little hacky
+            # https://github.com/tensorflow/tensorflow/issues/2358
+            cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False)
+            cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,)))
+            cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, mu[t])
+          else:
+            cur_mu = self.reduce_sum(mu[t] * P_a_cur_policy, axis=1)
           mu.append(cur_mu)
       else:
         for t in range(self.T - 1):
-          cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2),
+          if self.deterministic_env:
+            cur_mu = tf.Variable(tf.constant(0, dtype=tf.float32, shape=(self.n_input,)), trainable=False)
+            cur_mu = cur_mu.assign(tf.zeros(shape=(self.n_input,)))
+            cur_mu = tf.scatter_add(cur_mu, P_a_cur_policy, tf.expand_dims(mu[t], axis=1) * policy)
+          else:
+            cur_mu = self.reduce_sum(self.reduce_sum_sparse(tf.tile(tf.expand_dims(tf.expand_dims(mu[t], 1), 2),
                                                                     [1, tf.shape(policy)[1], self.n_input]) * P_a_cur_policy,
                                                             axis=1), axis=0)
 
@@ -192,6 +223,10 @@ def get_rewards(self, states):
     return rewards
 
   def get_policy(self, states, P_a, gamma, epsilon=0.01):
+    if self.conv:
+      states = np.expand_dims(np.expand_dims(states, axis=0), axis=-1)
+    else:
+      states = np.expand_dims(states, axis=0)
     return self.sess.run([self.reward, self.values, self.policy],
                          feed_dict={self.input_s: states, self.P_a: P_a, self.gamma: gamma, self.epsilon: epsilon})
 
@@ -239,7 +274,10 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
     p       Nx1 vector - state visitation frequencies
   """
   tt = time.time()
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
 
   T = len(trajs[0])
   # mu[s, t] is the prob of visiting state s at time t
@@ -253,18 +291,38 @@ def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
   if chunk_size == 0:
     chunk_size = N_STATES
 
-  if deterministic:
-    P_az = P_a[np.arange(0, N_STATES), :, policy]
-  else:
-    P_a = P_a.transpose(0, 2, 1)
 
-  def step(t, start, end):
+  if len(P_a.shape) == 3:
     if deterministic:
-      mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
+      P_az = P_a[np.arange(0, N_STATES), :, policy]
     else:
-      mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
+      P_a = P_a.transpose(0, 2, 1)
+  else:
+    if deterministic:
+      P_az = P_a[np.arange(N_STATES), policy]
+
+  if len(P_a.shape) == 3:
+    def step(t, start, end):
+      if deterministic:
+        mu[start:end, t + 1] = np.sum(mu[:, t, np.newaxis] * P_az[:, start:end], axis=0)
+      else:
+        mu[start:end, t + 1] = np.sum(np.sum(mu[:, t, np.newaxis, np.newaxis] * (P_a[:, :, start:end] * policy[:, :, np.newaxis]), axis=1), axis=0)
+  else:
+    def step(t, start, end):
+      if deterministic:
+        # The following needs to be done with an unbuffered ufunc
+        # https://stackoverflow.com/questions/41990028/add-multiple-values-to-one-numpy-array-index
+        # P_az[start:end] sometimes points to the same state for multiple entries; with the usual fancy
+        # indexing only one addition (the last one) would be executed!
+        # https://stackoverflow.com/questions/15973827/handling-of-duplicate-indices-in-numpy-assignments
+        # mu[P_az[start:end], t + 1] += mu[start:end, t]
+        np.add.at(mu, [P_az[start:end], t + 1], mu[start:end, t])
+      else:
+        # mu[P_a[start:end, :], t + 1] += mu[start:end, t, np.newaxis] * policy[start:end, :]
+        np.add.at(mu, [P_a[start:end, :], t + 1], mu[start:end, t, np.newaxis] * policy[start:end, :])
 
-  with ThreadPoolExecutor(max_workers=num_cpus) as e:
+
+  with ThreadPoolExecutor(max_workers=1) as e:
     for t in range(T - 1):
       futures = list()
       for i in range(0, N_STATES, chunk_size):
@@ -324,7 +382,10 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
   returns:
     p       Nx1 vector - state visitation frequencies
   """
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
 
   T = len(trajs[0])
   # mu[s, t] is the prob of visiting state s at time t
@@ -336,12 +397,21 @@ def compute_state_visition_freq_old(P_a, gamma, trajs, policy, deterministic=Tru
   for t in range(T - 1):
     for s in range(N_STATES):
       if deterministic:
-        mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)])
+        if len(P_a.shape) == 3:
+          mu[s, t + 1] = sum([mu[pre_s, t] * P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)])
+        else:
+          mu[P_a[s, int(policy[s])], t + 1] += mu[s, t]
       else:
-        mu[s, t + 1] = sum(
-          [sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in
-           range(N_STATES)])
+        if len(P_a.shape) == 3:
+          mu[s, t + 1] = sum(
+            [sum([mu[pre_s, t] * P_a[pre_s, s, a1] * policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in
+             range(N_STATES)])
+        else:
+          for a1 in range(N_ACTIONS):
+            mu[P_a[s, a1], t + 1] += mu[s, t] * policy[s, a1]
+
+  # print(mu)
 
   p = np.sum(mu, 1)
   print('SUM SVF', p.sum())
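Aside on the np.add.at calls above: with duplicate target indices, plain fancy-indexed += applies only one of the updates, while the unbuffered ufunc accumulates all of them. A tiny standalone illustration (not part of the patch):

import numpy as np

idx = np.array([0, 0, 2])           # index 0 appears twice
vals = np.array([1.0, 2.0, 3.0])

mu = np.zeros(3)
mu[idx] += vals                     # buffered: only the last write to index 0 sticks
print(mu)                           # [2. 0. 3.]

mu2 = np.zeros(3)
np.add.at(mu2, idx, vals)           # unbuffered: both contributions to index 0 are summed
print(mu2)                          # [3. 0. 3.]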
@@ -377,21 +447,29 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
   """
 
   # tf.set_random_seed(1)
-
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
+
+  deterministic = False
 
   # init nn model
-  nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic=False, conv=conv, sparse=sparse)
+  nn_r = DeepIRLFC(feat_map.shape, N_ACTIONS, lr, len(trajs[0]), 3, 3, deterministic_env=len(P_a.shape) == 2, deterministic=deterministic, conv=conv, sparse=sparse)
 
   # find state visitation frequencies using demonstrations
   mu_D = demo_svf(trajs, N_STATES)
   p_start_state = start_state_probs(trajs, N_STATES)
 
-  P_a_t = P_a.transpose(0, 2, 1)
-  if sparse:
-    mask = P_a_t > 0
-    indices = np.argwhere(mask)
-    P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+  if len(P_a.shape) == 3:
+    P_a_t = P_a.transpose(0, 2, 1)
+    if sparse:
+      mask = P_a_t > 0
+      indices = np.argwhere(mask)
+      P_a_t = tf.SparseTensorValue(indices, P_a_t[mask], P_a_t.shape)
+  else:
+    P_a_t = P_a
 
   grads = list()
 
@@ -404,20 +482,20 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
     # rewards = nn_r.get_rewards(feat_map)
 
     # compute policy
-    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)
+    #_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=deterministic)
 
     # compute rewards and policy at the same time
     #t = time.time()
-    #rewards, _, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.01)
+    rewards, values, policy = nn_r.get_policy(feat_map, P_a_t, gamma, 0.000001)
     #print('tensorflow VI', time.time() - t)
 
     # compute expected svf
-    #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)
+    #mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)
 
     rewards, values, policy, mu_exp = nn_r.get_policy_svf(feat_map, P_a_t, gamma, p_start_state, 0.000001)
 
     #print(rewards)
-    #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, False)
+    #assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic)
 
     # compute gradients on rewards:
     grad_r = mu_D - mu_exp
@@ -434,27 +512,40 @@ def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, conv, sparse):
   # return sigmoid(normalize(rewards))
   return normalize(rewards)
 
-def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, P_a_t, N_ACTIONS, N_STATES, trajs, gamma, deterministic):
-  assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
-                                                                 deterministic=deterministic)
-  assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
-                                                                             deterministic=deterministic)
-  assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001)
-
-  assert (np.abs(assert_values - assert_values2) < 0.0001).all()
-  assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
-  assert (np.abs(values - assert_values) < 0.0001).all()
-  assert (np.abs(values - assert_values_old) < 0.0001).all()
-
-  print(assert_policy)
-  print(assert_policy_old)
-  print(policy)
-  print(values)
-  print(assert_values)
-  print(rewards)
-  assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
-  assert (np.abs(policy - assert_policy) < 0.0001).all()
-  assert (np.abs(policy - assert_policy_old) < 0.0001).all()
+def assert_all_the_stuff(rewards, policy, values, mu_exp, P_a, N_ACTIONS, N_STATES, trajs, gamma, deterministic):
+
+  def assert_vi(P_a):
+    assert_values, assert_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.000001,
+                                                                   deterministic=deterministic)
+    assert_values_old, assert_policy_old = value_iteration.value_iteration_old(P_a, rewards, gamma, error=0.000001,
+                                                                               deterministic=deterministic)
+
+    if len(P_a.shape) == 3:
+      assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a.transpose(0, 2, 1), rewards, gamma, threshold=0.000001)
+
+      assert (np.abs(assert_values - assert_values2) < 0.0001).all()
+
+    assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
+    assert (np.abs(values - assert_values) < 0.0001).all()
+    assert (np.abs(values - assert_values_old) < 0.0001).all()
+
+    # print(assert_policy)
+    # print(assert_policy_old)
+    # print(policy)
+    # print(values)
+    # print(assert_values)
+    # print(rewards)
+    assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
+    assert (np.abs(policy - assert_policy) < 0.0001).all()
+    assert (np.abs(policy - assert_policy_old) < 0.0001).all()
+
+  assert_vi(P_a)
+  if len(P_a.shape) == 2:
+    print('creating full transition matrix')
+    # construct the full transition matrix and make sure the values are the same
+    P_a_full = np.zeros((N_STATES, N_STATES, N_ACTIONS))
+    P_a_full[np.arange(N_STATES)[:, None], P_a, np.arange(N_ACTIONS)[None, :]] = 1
+    assert_vi(P_a_full)
 
   assert (np.abs(mu_exp - compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=deterministic)) < 0.00001).all()
   assert (
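This patch switches between two transition layouts: the original (N_STATES, N_STATES, N_ACTIONS) probability matrix and, for deterministic environments, an (N_STATES, N_ACTIONS) array of successor states. A minimal NumPy sketch of the conversion between them, as rebuilt in assert_all_the_stuff above (toy sizes, illustrative names):

import numpy as np

N_STATES, N_ACTIONS = 4, 2
# P_det[s, a] is the successor state reached from s under action a
P_det = np.array([[1, 2],
                  [1, 3],
                  [3, 0],
                  [3, 3]], dtype=np.int32)

# Dense form: P_full[s0, s1, a] = 1 iff taking a in s0 lands in s1
P_full = np.zeros((N_STATES, N_STATES, N_ACTIONS))
P_full[np.arange(N_STATES)[:, None], P_det, np.arange(N_ACTIONS)[None, :]] = 1

assert (P_full.sum(axis=1) == 1).all()              # exactly one successor per (s, a)
assert (np.argmax(P_full, axis=1) == P_det).all()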
diff --git a/deep_maxent_irl_gridworld.py b/deep_maxent_irl_gridworld.py
index 0420b75..fe05ef1 100644
--- a/deep_maxent_irl_gridworld.py
+++ b/deep_maxent_irl_gridworld.py
@@ -8,7 +8,7 @@
 from mdp import gridworld
 from mdp import value_iteration
 from deep_maxent_irl import *
-from maxent_irl import *
+
 from utils import *
 from lp_irl import *
 
@@ -91,25 +91,30 @@ def main():
   gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
 
   rewards_gt = np.reshape(rmap_gt, H*W, order='F')
-  P_a = gw.get_transition_mat()
+
+  if ACT_RAND == 0:
+    P_a = gw.get_transition_mat_deterministic()
+  else:
+    P_a = gw.get_transition_mat()
 
   values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
-
+
   # use identity matrix as feature
   #feat_map = np.eye(N_STATES)
-  feat_map = np.zeros(N_STATES).reshape((H, W))
+  # feat_map = np.zeros(N_STATES).reshape((H, W))
+  feat_map = np.random.rand(N_STATES).reshape((H, W))
   #feat_map = np.arange(N_STATES).reshape((H, W))
 
   if ARGS.conv:
-    feat_map[H-1, W-1] = -5
-    feat_map[0, W-1] = -5
-    feat_map[H-1, 0] = -5
+    #feat_map[H-1, W-1] = -5
+    #feat_map[0, W-1] = -5
+    #feat_map[H-1, 0] = -5
+    pass
   else:
     feat_map = feat_map.reshape(N_STATES)
     #feat_map = rmap_gt
 
   trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
-
   print 'Deep Max Ent IRL training ..'
   t = time.time()
   rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.conv, ARGS.sparse)
diff --git a/mdp/gridworld.py b/mdp/gridworld.py
index e48fa80..40d7a22 100644
--- a/mdp/gridworld.py
+++ b/mdp/gridworld.py
@@ -325,6 +325,34 @@ def get_transition_mat(self):
           P_a[si, sj, a] = prob
     return P_a
 
+  def get_transition_mat_deterministic(self):
+    """
+    get deterministic transition dynamics of the gridworld
+
+    return:
+      P_a     NxN_ACTIONS matrix of successor states -
+              P_a[s0, a] is the state landed on when
+              taking action a at state s0
+    """
+    N_STATES = self.height*self.width
+    N_ACTIONS = len(self.actions)
+    P_a = np.zeros((N_STATES, N_ACTIONS), dtype=np.int32)
+    for si in range(N_STATES):
+      posi = self.idx2pos(si)
+      for a in range(N_ACTIONS):
+        probs = self.get_transition_states_and_probs(posi, a)
+
+        for posj, prob in probs:
+          sj = self.pos2idx(posj)
+          # Prob of si to sj given action a
+          prob = int(prob)
+          if prob == 1:
+            P_a[si, a] = sj
+          elif prob != 0:
+            raise ValueError('not a deterministic environment!')
+    return P_a
+
   def get_values_mat(self, values):
     """
     inputs:
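The value-iteration changes that follow index rewards and values directly with this successor-state matrix instead of summing over a transition tensor. A condensed, self-contained sketch of that backup (illustrative helper under the same shape assumptions, not part of the patch):

import numpy as np

def vi_deterministic(P_a, rewards, gamma=0.9, error=1e-6):
  # P_a: (N_STATES, N_ACTIONS) successor states; rewards: (N_STATES,)
  values = np.zeros(P_a.shape[0])
  while True:
    values_tmp = values.copy()
    # rewards[P_a] and values_tmp[P_a] broadcast to (N_STATES, N_ACTIONS)
    q = rewards[P_a] + gamma * values_tmp[P_a]
    values = q.max(axis=1)
    if np.abs(values - values_tmp).max() < error:
      return values, q.argmax(axis=1)

P_a = np.array([[1, 2], [1, 3], [3, 0], [3, 3]])
rewards = np.array([0.0, 0.0, 0.0, 1.0])
values, policy = vi_deterministic(P_a, rewards)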
diff --git a/mdp/value_iteration.py b/mdp/value_iteration.py
index 30ac6e6..2fb8bb8 100644
--- a/mdp/value_iteration.py
+++ b/mdp/value_iteration.py
@@ -42,7 +42,11 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True):
     values    Nx1 matrix - estimated values
     policy    Nx1 (NxN_ACTIONS if non-det) matrix - policy
   """
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
+
   values = np.zeros([N_STATES])
 
@@ -51,9 +55,10 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True):
     values_tmp = values.copy()
 
     for s in range(N_STATES):
-      v_s = []
-      values[s] = max([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in
-                       range(N_ACTIONS)])
+      if len(P_a.shape) == 3:
+        values[s] = max([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)])
+      else:
+        values[s] = max([rewards[P_a[s, a]] + gamma * values_tmp[P_a[s, a]] for a in range(N_ACTIONS)])
 
     if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error:
       break
@@ -62,17 +67,25 @@ def value_iteration_old(P_a, rewards, gamma, error=0.01, deterministic=True):
     # generate deterministic policy
     policy = np.zeros([N_STATES])
     for s in range(N_STATES):
-      policy[s] = np.argmax([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1])
+      if len(P_a.shape) == 3:
+        policy[s] = np.argmax([sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1])
                                   for s1 in range(N_STATES)]) for a in range(N_ACTIONS)])
+      else:
+        policy[s] = np.argmax([rewards[P_a[s, a]] + gamma * values[P_a[s, a]] for a in range(N_ACTIONS)])
+
     return values, policy
   else:
     # generate stochastic policy
     policy = np.zeros([N_STATES, N_ACTIONS])
     for s in range(N_STATES):
-      v_s = np.array(
-        [sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)])
+      if len(P_a.shape) == 3:
+        v_s = np.array(
+          [sum([P_a[s, s1, a] * (rewards[s1] + gamma * values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)])
+      else:
+        v_s = np.array([rewards[P_a[s, a]] + gamma * values[P_a[s, a]] for a in range(N_ACTIONS)])
+
       policy[s, :] = softmax(v_s).squeeze()
 
@@ -97,13 +110,19 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True):
     values    Nx1 matrix - estimated values
     policy    Nx1 (NxN_ACTIONS if non-det) matrix - policy
   """
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
 
   values = np.zeros([N_STATES])
 
   t = time.time()
   rewards = rewards.squeeze()
-  P = P_a.transpose(0, 2, 1)
+  if len(P_a.shape) == 3:
+    P = P_a.transpose(0, 2, 1)
+  else:
+    P = P_a
 
   num_cpus = multiprocessing.cpu_count()
   chunk_size = N_STATES // num_cpus
@@ -117,12 +136,20 @@ def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True):
     count += 1
     values_tmp = values.copy()
 
-    def step(start, end):
-      expected_value = rewards + gamma * values_tmp
-      #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1)
-      #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
-      #expected_value = np.transpose(expected_value, (0, 2, 1))
-      values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1)
+    if len(P.shape) == 3:
+      def step(start, end):
+        expected_value = rewards + gamma * values_tmp
+        #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1)
+        #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+        #expected_value = np.transpose(expected_value, (0, 2, 1))
+        values[start:end] = (P[start:end, :, :] * expected_value).sum(axis=2).max(axis=1)
+    else:
+      def step(start, end):
+        expected_value = rewards[P[start:end, :]] + gamma * values_tmp[P[start:end, :]]
+        #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1)
+        #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+        #expected_value = np.transpose(expected_value, (0, 2, 1))
+        values[start:end] = expected_value.max(axis=1)
 
     with ThreadPoolExecutor(max_workers=num_cpus) as e:
       futures = list()
@@ -140,21 +167,32 @@ def step(start, end):
       print('VI', count)
       break
 
-  expected_value = rewards + gamma * values_tmp
-  #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1)
-  #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
-  #expected_value = np.transpose(expected_value, (0, 2, 1))
+
+  if len(P.shape) == 3:
+    expected_value = rewards + gamma * values_tmp
+    #expected_value = expected_value[:, np.newaxis].repeat(N_STATES, axis=1)
+    #expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+    #expected_value = np.transpose(expected_value, (0, 2, 1))
+  else:
+    expected_value = rewards[P] + gamma * values_tmp[P]
 
   if deterministic:
     # generate deterministic policy
-    policy = np.argmax((P * expected_value).sum(axis=2), axis=1)
+    if len(P.shape) == 3:
+      policy = np.argmax((P * expected_value).sum(axis=2), axis=1)
+    else:
+      policy = np.argmax(expected_value, axis=1)
 
     print(time.time() - t)
 
     return values, policy
   else:
     # generate stochastic policy
-    policy = (P * expected_value).sum(axis=2)
+    if len(P.shape) == 3:
+      policy = (P * expected_value).sum(axis=2)
+    else:
+      policy = expected_value
+
     policy = softmax(policy)
 
     print(time.time() - t)
 
@@ -195,29 +233,45 @@ def softmax(x):
   while True:
     values_tmp = values.copy()
 
-    for s in range(N_STATES):
-      v_s = []
-      q = [sum([P_a[s, s1, a]*(rewards[s] + gamma*values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]
-      values[s] = softmax(q)
+    if len(P_a.shape) == 3:
+      for s in range(N_STATES):
+        q = [sum([P_a[s, s1, a]*(rewards[s] + gamma*values_tmp[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)]
+        values[s] = softmax(q)
 
-    if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error:
-      break
+      if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error:
+        break
+    else:
+      for s in range(N_STATES):
+        q = [rewards[s] + gamma * values_tmp[P_a[s, a]] for a in
+             range(N_ACTIONS)]
+        values[s] = softmax(q)
+      if max([abs(values[s] - values_tmp[s]) for s in range(N_STATES)]) < error:
+        break
 
   if deterministic:
     # generate deterministic policy
     policy = np.zeros([N_STATES])
     for s in range(N_STATES):
-      policy[s] = np.argmax([sum([P_a[s, s1, a]*(rewards[s]+gamma*values[s1])
+      if len(P_a.shape) == 3:
+        policy[s] = np.argmax([sum([P_a[s, s1, a]*(rewards[s]+gamma*values[s1])
                                   for s1 in range(N_STATES)])
                              for a in range(N_ACTIONS)])
+      else:
+        policy[s] = np.argmax([rewards[s] +
+                               gamma * values[P_a[s, a]]
+                               for a in range(N_ACTIONS)])
 
     return values, policy
   else:
     # generate stochastic policy
     policy = np.zeros([N_STATES, N_ACTIONS])
     for s in range(N_STATES):
-      v_s = np.asarray([sum([P_a[s, s1, a]*(rewards[s] + gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)])
+      if len(P_a.shape) == 3:
+        v_s = np.asarray([sum([P_a[s, s1, a]*(rewards[s] + gamma*values[s1]) for s1 in range(N_STATES)]) for a in range(N_ACTIONS)])
+      else:
+        v_s = np.asarray([rewards[s] + gamma * values[P_a[s, a]] for a in
+                          range(N_ACTIONS)])
       policy[s, :] = np.exp(v_s.squeeze() - values[s])
 
   return values, policy
@@ -394,8 +448,13 @@ def optimal_value(n_states, n_actions, transition_probabilities, reward,
     return v
 
 def value_parallel(policy, P_a, rewards, gamma, threshold=1e-2):
+  if len(P_a.shape) == 3:
+    N_STATES, _, N_ACTIONS = np.shape(P_a)
+  else:
+    N_STATES, N_ACTIONS = np.shape(P_a)
+
   deterministic = len(policy.shape) == 1
-  N_STATES, _, N_ACTIONS = np.shape(P_a)
+  deterministic_env = len(P_a.shape) == 2
 
   values = np.zeros([N_STATES])
 
@@ -404,25 +463,40 @@ def value_parallel(policy, P_a, rewards, gamma, threshold=1e-2):
   if chunk_size == 0:
     chunk_size = N_STATES
 
-  rewards_expanded = rewards[:, np.newaxis].repeat(N_STATES, axis=1)
-
-  if deterministic:
-    P_az = P_a[np.arange(0, N_STATES), :, policy]
+  rewards_expanded = rewards
+
+  if not deterministic_env:
+    if deterministic:
+      P_az = P_a[np.arange(0, N_STATES), :, policy]
+    else:
+      P_a = P_a.transpose(0, 2, 1)
   else:
-    P_a = P_a.transpose(0, 2, 1)
+    if deterministic:
+      P_az = P_a[np.arange(0, N_STATES), policy]
+
   # estimate values
   while True:
     values_tmp = values.copy()
 
     def step(start, end):
-      expected_value = rewards_expanded[start:end, :] + gamma * values_tmp
+      if deterministic_env:
+        expected_value = rewards_expanded[P_az[start:end]] + gamma * values_tmp[P_az[start:end]]
+      else:
+        expected_value = rewards_expanded[start:end, :] + gamma * values_tmp
+
       if deterministic:
-        values[start:end] = (P_az[start:end, :] * expected_value).sum(axis=1)
+
+        if deterministic_env:
+          values[start:end] = expected_value
+        else:
+          values[start:end] = (P_az[start:end, :] * expected_value).sum(axis=1)
       else:
-        expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
-        #expected_value = np.transpose(expected_value, (0, 2, 1))
-        values[start:end] = (P_a[start:end, :, :] * expected_value).sum(axis=2).sum(axis=1)
+        if deterministic_env:
+          values[start:end] = (policy * expected_value).sum(axis=1)
+        else:
+          expected_value = expected_value[:, :, np.newaxis].repeat(N_ACTIONS, axis=2)
+          values[start:end] = (P_a[start:end, :, :] * expected_value * policy).sum(axis=2).sum(axis=1)
 
     with ThreadPoolExecutor(max_workers=num_cpus) as e:
       futures = list()
@@ -463,16 +537,14 @@ def value(policy, n_states, transition_probabilities, reward, discount,
         for s in range(n_states):
             vs = values_tmp[s]
             a = policy[s]
-            v[s] = sum(transition_probabilities[s, a, k] *
-                       (reward[s] + discount * values_tmp[k])
-                       for k in range(n_states))
+            v[s] = reward[transition_probabilities[s, a]] + discount * values_tmp[transition_probabilities[s, a]]
             diff = max(diff, abs(vs - v[s]))
 
     return v
 
 
 def expected_value_diff(P_a, true_rewards, gamma, p_start, optimal_value, policy):
   v = value_parallel(policy, P_a, true_rewards, gamma)
-  #v_old = value(policy, P_a.shape[0], P_a.transpose(0, 2, 1), true_rewards, gamma)
+  #v_old = value(policy, P_a.shape[0], P_a, true_rewards, gamma)
 
   #if len(policy.shape) == 1:
   #  assert (np.abs(v - v_old) < 0.001).all()
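A final note on the expected-SVF rewrite: for a deterministic policy, the scatter-based propagation used in the deterministic-environment branches matches the dense reduce-sum form. A self-contained NumPy check (toy sizes, illustrative names, not part of the patch):

import numpy as np

N_STATES, N_ACTIONS = 4, 2
P_det = np.array([[1, 2], [1, 3], [3, 0], [3, 3]])      # (s, a) -> successor state
policy = np.array([0, 1, 0, 1])                         # deterministic policy
mu_t = np.array([0.4, 0.3, 0.2, 0.1])                   # visitation at time t

# dense form: P_full[s0, s1, a] = 1 iff taking a in s0 lands in s1
P_full = np.zeros((N_STATES, N_STATES, N_ACTIONS))
P_full[np.arange(N_STATES)[:, None], P_det, np.arange(N_ACTIONS)[None, :]] = 1
P_az = P_full[np.arange(N_STATES), :, policy]           # (s0, s1) rows under the policy
mu_dense = (mu_t[:, np.newaxis] * P_az).sum(axis=0)

# scatter form: push mu[s] onto the successor chosen by the policy
mu_scatter = np.zeros(N_STATES)
np.add.at(mu_scatter, P_det[np.arange(N_STATES), policy], mu_t)

assert np.allclose(mu_dense, mu_scatter)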