
Commit

debug
SheidaAbedpour committed Dec 9, 2023
1 parent c8dc0f5 commit 42e1f84
Showing 1 changed file with 41 additions and 40 deletions.
81 changes: 41 additions & 40 deletions MDP/main.py
@@ -193,72 +193,73 @@ def _render_gui(self, mode):
 observation, info = env.reset(seed=30)


-def value_iteration(env, gamma, theta, max_itr):
+def policy_iteration(env, gamma, theta, max_itr):
     V = np.zeros(env.nS)
     goal_i, goal_j = env.terminal_state
     goal_index = 12 * goal_i + goal_j
     V[goal_index] = 1000
-    P = np.zeros(env.nS)
-    delta = theta + 1
+    P = np.zeros(env.nS, dtype=int)
 
-    iter = 0
-    while delta >= theta and iter < max_itr:
-        iter += 1
-        delta = 0
+    for _ in range(max_itr):
+        # Policy Evaluation
+        while True:
+            delta = 0
 
-        for s in range(env.nS):
-            if s != goal_index:
-                max_value = -np.inf
-                best_action = None
-                expected_value = 0
-                previous_value = V[s]
+            for s in range(env.nS):
+                if s == goal_index:
+                    continue
 
-                for a in range(env.nA):
-                    for prob, next_state, reward, _ in env.P[s][a]:
-                        expected_value += prob * (reward + gamma * V[next_state])
-                    if expected_value > max_value:
-                        max_value = expected_value
-                        best_action = a
+                v = V[s]
+                a = P[s]
+                expected_value = 0
 
-                V[s] = max_value
+                for prob, next_state, reward, _ in env.P[s][a]:
+                    expected_value += prob * (reward + gamma * V[next_state])
 
-                # check for convergence
-                delta = max(abs(V[s] - previous_value), delta)
+                V[s] = expected_value
+                delta = max(delta, abs(v - V[s]))
 
+            if delta < theta:
+                break
 
-        for s in range(env.nS):
-            if s == goal_index:
-                continue
+        # Policy Improvement
+        policy_stable = True
 
-            max_value = -np.inf
-            best_action = None
+        for s in range(env.nS):
+            if s == goal_index:
+                continue
 
-            for a in range(env.nA):
-                value = 0
+            old_action = P[s]
+            max_value = -np.inf
+            best_action = None
 
-                for prob, next_state, reward, _ in env.P[s][a]:
-                    value += prob * (reward + gamma * V[next_state])
+            for a in range(env.nA):
+                value = 0
 
-                if value > max_value:
-                    max_value = value
-                    best_action = a
+                for prob, next_state, reward, _ in env.P[s][a]:
+                    value += prob * (reward + gamma * V[next_state])
 
-            P[s] = best_action
+                if value > max_value:
+                    max_value = value
+                    best_action = a
 
+            P[s] = best_action
 
-            if np.random.rand() < theta:
-                random_action = np.random.choice(env.nA)
-                P[s] = np.eye(env.nA)[random_action]
+            if old_action != P[s]:
+                policy_stable = False
 
-    return iter, V, P
+        if policy_stable:
+            break
+
+    return V, P

 # Define the maximum number of iterations
 max_iter_number = 1000
 
 gamma = 0.8  # Discount factor
 theta = 1e-6  # Convergence threshold
-iter, Values, Policy = value_iteration(env, gamma, theta, max_iter_number)
+Values, Policy = policy_iteration(env, gamma, theta, max_iter_number)
print("#iterations: ", iter, "\nPolicy:\n", Policy, "\nValues:\n", Values)


