# main.py (forked from EgOrlukha/MuJoCo-PyTorch)
import gym
import torch
import argparse
import numpy as np
import torch.optim as optim
from model import Actor, Critic
from utils import get_action
from collections import deque
from running_state import ZFilter
from hparams import HyperParams as hp
import matplotlib.pyplot as plt
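
# Local modules from this repo (assumed layout): model.py defines the Actor and
# Critic networks, utils.get_action samples actions from the policy,
# running_state.ZFilter keeps a running mean/std to normalize observations, and
# hparams.HyperParams holds the learning rates and the critic's L2 weight decay.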

parser = argparse.ArgumentParser()
parser.add_argument('--algorithm', type=str, default='PPO',
                    help='select one of the algorithms: PG, NPG, TRPO, PPO')
parser.add_argument('--env', type=str, default="Humanoid-v2",
                    help='name of the MuJoCo environment')
parser.add_argument('--render', action='store_true', default=False,
                    help='render every 50th episode during training')
args = parser.parse_args()

# import the training routine that matches the selected algorithm
# (module names follow the repo layout; note the repo spells it 'vanila_pg')
if args.algorithm == "PG":
    from vanila_pg import train_model
elif args.algorithm == "NPG":
    from npg import train_model
elif args.algorithm == "TRPO":
    from trpo import train_model
elif args.algorithm == "PPO":
    from ppo import train_model
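
# Usage sketch (assumptions: an older gym release with the 4-tuple step API,
# mujoco-py with the *-v2 environments, and the repo's model.py, utils.py,
# running_state.py and hparams.py importable from this directory), e.g.:
#
#   python main.py --algorithm PPO --env Hopper-v2 --render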

if __name__ == "__main__":
    # you can choose other environments.
    # possible environments: Ant-v2, HalfCheetah-v2, Hopper-v2, Humanoid-v2,
    # HumanoidStandup-v2, InvertedPendulum-v2, Reacher-v2, Swimmer-v2, Walker2d-v2
    env = gym.make(args.env)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions)
    critic = Critic(num_inputs)

    actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr,
                              weight_decay=hp.l2_rate)

    # online normalization of observations
    running_state = ZFilter((num_inputs,), clip=5)

    episodes = 0
    xar = []  # total episodes after each iteration (x-axis of the final plot)
    yar = []  # average score of each iteration (y-axis of the final plot)

    for iteration in range(50):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        # collect at least 2048 environment steps before each policy update
        while steps < 2048:
            episodes += 1
            state = env.reset()
            state = running_state(state)
            score = 0
            for _ in range(10000):
                if args.render and episodes % 50 == 0:
                    env.render()

                steps += 1
                mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                next_state = running_state(next_state)

                mask = 0 if done else 1
                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
            scores.append(score)

        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))

        # append one line per iteration ('w' would overwrite the file each time);
        # an explicit close() is unnecessary inside a with-block
        with open('reward per iter.txt', 'a') as file:
            file.write(str(episodes) + "," + str(score_avg) + "\n")

        xar.append(int(episodes))
        yar.append(score_avg)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim)

    def plotting():
        # plot average score per training iteration against total episodes seen
        plt.plot(xar, yar, linewidth=3)
        plt.title("Avg score/Episodes", fontsize=19)
        plt.xlabel("Episodes", fontsize=10)
        plt.ylabel("Avg score", fontsize=10)
        plt.tick_params(axis='both', labelsize=9)
        plt.show()

    plotting()
    print(xar, '\n', yar)
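
# For reference, a minimal sketch of what utils.get_action is assumed to do:
# sample an action from the diagonal Gaussian policy defined by (mu, std).
# The actual helper lives in utils.py and may differ in detail.
#
#   def get_action(mu, std):
#       action = torch.normal(mu, std)
#       return action.data.numpy()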