# coding=utf-8
import gym
import tensorflow as tf

from blocAndTools.buildingbloc import (ExperimentSpec, GymPlayground, build_MLP_computation_graph,
                                       policy_theta_discrete_space, discrete_pseudo_loss, learning_rate_scheduler,
                                       policy_optimizer, )
from blocAndTools.rl_vocabulary import rl_name

tf_cv1 = tf.compat.v1  # shortcut
vocab = rl_name()

def build_actor_critic_shared_graph(obs_ph: tf.Tensor, exp_spec: ExperimentSpec,
                                    playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The ACTOR-CRITIC shared network variant architecture

        1. Actor network theta
            input: the collected observations
            output: the logits of each action in the action space

        2. Policy
            input: the actor network
            output: a sampled action & the log probability of each action in the action space

        3. Critic network phi
            input: the collected observations
            output: the estimated state value V(s)

    :return: sampled_action, log_pi_all, theta_shared_MLP, critic
    """
""" ---- Assess the input shape compatibility ---- """
are_compatible = obs_ph.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
assert are_compatible, ("the observation_placeholder is incompatible with environment, "
"{} != {}").format(obs_ph.shape.as_list()[-1],
playground.OBSERVATION_SPACE.shape[0])
# ::Discrete case
if isinstance(playground.env.action_space, gym.spaces.Discrete):
""" ---- Build parameter THETA as a multilayer perceptron ---- """
theta_shared_MLP = build_MLP_computation_graph(obs_ph, playground.ACTION_CHOICES,
exp_spec.theta_nn_h_layer_topo,
hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
output_layers_activation=exp_spec.theta_output_layers_activation,
reuse=None, # <-- (!)
name=vocab.shared_network)
""" ---- Build the policy for discrete space ---- """
sampled_action, log_pi_all = policy_theta_discrete_space(theta_shared_MLP, playground)
# ::Continuous case
elif isinstance(playground.env.action_space, gym.spaces.Box):
raise NotImplementedError # (Ice-Boxed) todo:implement --> for policy for continuous space:
# ::Other gym environment
else:
print("\n>>> The agent implementation does not support that environment space "
"{} yet.\n\n".format(playground.env.action_space))
raise NotImplementedError
""" ---- Build the Critic ---- """
phi_shared_MLP = build_MLP_computation_graph(obs_ph, playground.ACTION_CHOICES,
exp_spec.theta_nn_h_layer_topo,
hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
output_layers_activation=exp_spec.theta_output_layers_activation,
reuse=True, # <-- (!)
name=vocab.shared_network)
critic = build_MLP_computation_graph(phi_shared_MLP, 1, (),
hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
output_layers_activation=exp_spec.theta_output_layers_activation,
name=vocab.V_estimate)
return sampled_action, log_pi_all, theta_shared_MLP, critic
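
# --- Illustrative sketch (not part of the original module) -----------------------------------------------------------
# A minimal, self-contained example of the "shared network" idea documented above: a single MLP trunk feeds both a
# policy-logits head and a state-value head. It uses plain tf.compat.v1 layers instead of the blocAndTools helpers,
# and the layer sizes/activations are illustrative assumptions, not project hyperparameters. Note that the module
# above reuses the full theta MLP (including its logits layer) and stacks the V head on top of it; this sketch shows
# the more common trunk-plus-two-heads layout of the same idea.
def _shared_trunk_actor_critic_sketch(obs_ph: tf.Tensor, action_choices: int,
                                      hidden_topo=(64, 64)) -> (tf.Tensor, tf.Tensor):
    x = obs_ph
    with tf_cv1.variable_scope('sketch_shared_trunk'):
        # The trunk's output feeds both heads, which is the sharing that reuse=None / reuse=True achieves above
        for idx, units in enumerate(hidden_topo):
            x = tf_cv1.layers.dense(x, units, activation=tf.nn.tanh, name='hidden_{}'.format(idx))
    # Two heads built on the same trunk output
    policy_logits = tf_cv1.layers.dense(x, action_choices, activation=None, name='sketch_policy_logits')
    value_estimate = tf_cv1.layers.dense(x, 1, activation=None, name='sketch_V_estimate')
    return policy_logits, value_estimate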

def actor_shared_train(action_placeholder: tf_cv1.Tensor, log_pi, advantage: tf_cv1.Tensor,
                       experiment_spec: ExperimentSpec,
                       playground: GymPlayground) -> (tf.Tensor, tf.Operation):
    """
    Actor loss

        input: the log probability of each action in the action space, the collected actions, the computed advantages
        output: the pseudo loss whose gradient is grad_theta log pi_theta * A^pi

    :return: actor_loss, actor_policy_optimizer_op
    """
    with tf_cv1.name_scope(vocab.policy_training):
        """ ---- Build the pseudo loss function ---- """
        actor_loss = discrete_pseudo_loss(log_pi, action_placeholder, advantage,
                                          playground, name=vocab.actor_loss)

        """ ---- Actor optimizer & learning rate scheduler ---- """
        # (Ice-Boxed) todo:implement --> finish lr scheduler for online shared algo
        # (Ice-Boxed) todo:implement --> add 'global_timestep_max' to hparam
        # actor_lr_schedule, actor_global_grad_step = learning_rate_scheduler(
        #     max_gradient_step_expected=experiment_spec['global_timestep_max'] / experiment_spec['batch_size_in_ts'],
        #     learning_rate=experiment_spec.learning_rate,
        #     lr_decay_rate=experiment_spec['actor_lr_decay_rate'],
        #     name_sufix='actor')
        #
        # actor_policy_optimizer_op = tf_cv1.train.AdamOptimizer(learning_rate=actor_lr_schedule
        #                                                        ).minimize(loss=actor_loss,
        #                                                                   global_step=actor_global_grad_step,
        #                                                                   name=vocab.policy_optimizer)

        actor_policy_optimizer_op = policy_optimizer(actor_loss, experiment_spec.learning_rate)

    return actor_loss, actor_policy_optimizer_op
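
# --- Illustrative sketch (not part of the original module) -----------------------------------------------------------
# What a discrete policy-gradient pseudo loss typically computes:  L(theta) = -mean( log pi_theta(a_t|s_t) * A_t ),
# so that minimizing it follows the direction grad_theta log pi_theta * A^pi mentioned in the docstring above.
# This is a generic formulation written with plain TF ops; it is an assumption about the semantics of
# discrete_pseudo_loss, not necessarily its exact implementation.
def _discrete_pseudo_loss_sketch(log_pi_all: tf.Tensor, action_ph: tf.Tensor, advantage_ph: tf.Tensor,
                                 action_choices: int) -> tf.Tensor:
    # Pick log pi(a_t|s_t) of the action actually taken at each timestep via a one-hot mask over the action dimension
    action_mask = tf_cv1.one_hot(action_ph, depth=action_choices)
    log_pi_taken = tf_cv1.reduce_sum(log_pi_all * action_mask, axis=1)
    # Negative sign: the optimizer minimizes, while policy gradient ascends the expected return
    return -tf_cv1.reduce_mean(log_pi_taken * advantage_ph)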

def critic_shared_train(advantage, experiment_spec: ExperimentSpec) -> (tf.Tensor, tf.Operation):
    """
    Critic loss

        input: the advantage, i.e. the difference between the target y (either a Monte Carlo target or a
               bootstrapped estimate target) and the critic estimate
        output: the Mean Squared Error (MSE)

    :return: critic_loss, critic_optimizer
    """
    with tf_cv1.name_scope(vocab.critic_training):
        """ ---- Build the Mean Square Error loss function ---- """
        with tf.name_scope(vocab.critic_loss):
            critic_loss = tf.reduce_mean(advantage ** 2)

        """ ---- Critic optimizer & learning rate scheduler ---- """
        # (Ice-Boxed) todo:implement --> finish lr scheduler for online shared algo
        # (Ice-Boxed) todo:implement --> add 'global_timestep_max' to hparam
        # critic_lr_schedule, critic_global_grad_step = learning_rate_scheduler(
        #     max_gradient_step_expected=experiment_spec['critique_loop_len'] * experiment_spec.max_epoch,
        #     learning_rate=experiment_spec['critic_learning_rate'],
        #     lr_decay_rate=experiment_spec['critic_lr_decay_rate'],
        #     name_sufix='critic')
        #
        # critic_optimizer = tf_cv1.train.AdamOptimizer(learning_rate=critic_lr_schedule
        #                                               ).minimize(critic_loss,
        #                                                          global_step=critic_global_grad_step,
        #                                                          name=vocab.critic_optimizer)

        critic_optimizer = tf_cv1.train.AdamOptimizer(
            learning_rate=experiment_spec['critic_learning_rate']).minimize(critic_loss,
                                                                            name=vocab.critic_optimizer)

    return critic_loss, critic_optimizer
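
# --- Illustrative sketch (not part of the original module) -----------------------------------------------------------
# How the `advantage` fed to critic_shared_train is typically formed: a target return y_t (a Monte Carlo return, or a
# bootstrapped estimate r_t + gamma * V(s_{t+1})) minus the critic's own estimate V(s_t), so that
# reduce_mean(advantage ** 2) above is the usual MSE regression loss on the value function.
# The placeholder and tensor names here are illustrative assumptions, not names used elsewhere in the project.
def _advantage_sketch(target_return_ph: tf.Tensor, v_estimate: tf.Tensor) -> tf.Tensor:
    # The critic head outputs shape (batch, 1); squeeze it so the subtraction broadcasts against the (batch,) target
    return target_return_ph - tf_cv1.squeeze(v_estimate, axis=[1])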