diff --git a/README.md b/README.md index b7b3cd76fd..73f4dfdf79 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ It provides **python-first** and **asynchronous-native** task and middleware abs - Offline RL algorithms: BCQ, CQL, TD3BC, Decision Transformer, EDAC, Diffuser, Decision Diffuser, SO2 - Model-based RL algorithms: SVG, STEVE, MBPO, DDPPO, DreamerV3 - Exploration algorithms: HER, RND, ICM, NGU -- LLM + RL Algorithms: PPO-max, DPO, PromptPG +- LLM + RL Algorithms: PPO-max, DPO, PromptPG, PromptAWR - Other algorithms: such as PER, PLR, PCGrad - MCTS + RL algorithms: AlphaZero, MuZero, please refer to [LightZero](https://github.com/opendilab/LightZero) - Generative Model + RL algorithms: Diffusion-QL, QGPO, SRPO, please refer to [GenerativeRL](https://github.com/opendilab/GenerativeRL) @@ -283,6 +283,7 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo` | 54 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 | | 55 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)
[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 |
| 56 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 |
+| 57 | [AWR](https://arxiv.org/pdf/1910.00177) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [policy/prompt_awr](https://github.com/opendilab/DI-engine/blob/main/ding/policy/prompt_awr.py) | python3 -u tabmwp_awr_config.py |
@@ -491,7 +492,7 @@ We appreciate all the feedbacks and contributions to improve DI-engine, both alg
```latex
@misc{ding,
    title={DI-engine: A Universal AI System/Engine for Decision Intelligence},
-    author={Yazhe Niu, Jingxin Xu, Yuan Pu, Yunpeng Nie, Jinouwen Zhang, Shuai Hu, Liangxuan Zhao, Ming Zhang, Yu Liu},
+    author={Niu, Yazhe and Xu, Jingxin and Pu, Yuan and Nie, Yunpeng and Zhang, Jinouwen and Hu, Shuai and Zhao, Liangxuan and Zhang, Ming and Liu, Yu},
    publisher={GitHub},
    howpublished={\url{https://github.com/opendilab/DI-engine}},
    year={2021},
diff --git a/ding/model/template/language_transformer.py b/ding/model/template/language_transformer.py
index cac2d69adf..796b2ba0a7 100644
--- a/ding/model/template/language_transformer.py
+++ b/ding/model/template/language_transformer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Dict, Optional
import torch
from torch import nn
@@ -15,16 +15,21 @@ class LanguageTransformer(nn.Module):
    """
    Overview:
        The LanguageTransformer network. Download a pre-trained language model and add head on it.
+        In the default case, we use a BERT model as the text encoder, whose bidirectional attention is
+        well-suited to obtaining an embedding of the whole sentence.
    Interfaces:
        ``__init__``, ``forward``
    """
+    mode = ['compute_actor', 'compute_critic', 'compute_actor_critic']

    def __init__(
            self,
            model_name: str = "bert-base-uncased",
            add_linear: bool = False,
            embedding_size: int = 128,
-            freeze_encoder: bool = True
+            freeze_encoder: bool = True,
+            hidden_dim: int = 768,
+            norm_embedding: bool = False
    ) -> None:
        """
        Overview:
@@ -32,14 +37,22 @@ def __init__(
        Arguments:
            - model_name (:obj:`str`): The base language model name in huggingface, such as "bert-base-uncased".
            - add_linear (:obj:`bool`): Whether to add a linear layer on the top of language model, defaults to be \
-            ``False``.
+                ``False``.
            - embedding_size (:obj:`int`): The embedding size of the added linear layer, such as 128.
            - freeze_encoder (:obj:`bool`): Whether to freeze the encoder language model while training, \
-            defaults to be ``True``.
+                defaults to be ``True``.
+            - hidden_dim (:obj:`int`): The embedding dimension of the encoding model (e.g. BERT). This value should \
+                correspond to the model you use. For bert-base-uncased, this value is 768.
+            - norm_embedding (:obj:`bool`): Whether to normalize the embedding vectors. Defaults to ``False``.
""" super().__init__() self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.model = AutoModelForTokenClassification.from_pretrained(model_name) + in_channel = hidden_dim if not add_linear else embedding_size + self.value_head = nn.Linear(in_channel, 1) + self.norm = nn.Identity() if not norm_embedding else nn.LayerNorm( + normalized_shape=in_channel, elementwise_affine=False + ) # Freeze transformer encoder and only train the linear layer if freeze_encoder: @@ -49,9 +62,7 @@ def __init__( if add_linear: # Add a small, adjustable linear layer on top of language model tuned through RL self.embedding_size = embedding_size - self.linear = nn.Linear( - self.model.config.hidden_size, embedding_size - ) # 768 for bert-base-uncased, distilbert-base-uncased + self.linear = nn.Linear(self.model.config.hidden_size, embedding_size) else: self.linear = None @@ -66,19 +77,27 @@ def _calc_embedding(self, x: list) -> torch.Tensor: last_hidden_states = output.hidden_states[-1] # Get [CLS] hidden states sentence_embedding = last_hidden_states[:, 0, :] # len(input_list) x hidden_size + sentence_embedding = self.norm(sentence_embedding) if self.linear: sentence_embedding = self.linear(sentence_embedding) # len(input_list) x embedding_size return sentence_embedding - def forward(self, train_samples: List[str], candidate_samples: List[str]) -> Dict: + def forward( + self, + train_samples: List[str], + candidate_samples: Optional[List[str]] = None, + mode: str = 'compute_actor' + ) -> Dict: """ Overview: LanguageTransformer forward computation graph, input two lists of strings and predict their matching scores. + Different ``mode`` will forward with different network modules to get different outputs. Arguments: - train_samples (:obj:`List[str]`): One list of strings. - - candidate_samples (:obj:`List[str]`): The other list of strings to calculate the matching scores. + - candidate_samples (:obj:`Optional[List[str]]`): The other list of strings to calculate matching scores. + - - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - output (:obj:`Dict`): Output dict data, including the logit of matching scores and the \ corresponding ``torch.distributions.Categorical`` object. 
@@ -96,7 +115,15 @@ def forward(self, train_samples: List[str], candidate_samples: List[str]) -> Dic >>> scores = model(ctxt_list, cands_list) >>> assert scores.shape == (1, 3) """ + assert mode in self.mode prompt_embedding = self._calc_embedding(train_samples) - cands_embedding = self._calc_embedding(candidate_samples) - scores = torch.mm(prompt_embedding, cands_embedding.t()) - return {'dist': torch.distributions.Categorical(logits=scores), 'logit': scores} + + res_dict = {} + if mode in ['compute_actor', 'compute_actor_critic']: + cands_embedding = self._calc_embedding(candidate_samples) + scores = torch.mm(prompt_embedding, cands_embedding.t()) + res_dict.update({'dist': torch.distributions.Categorical(logits=scores), 'logit': scores}) + if mode in ['compute_critic', 'compute_actor_critic']: + value = self.value_head(prompt_embedding) + res_dict.update({'value': value}) + return res_dict diff --git a/ding/model/template/mavac.py b/ding/model/template/mavac.py index 78071e6783..9018aa9f04 100644 --- a/ding/model/template/mavac.py +++ b/ding/model/template/mavac.py @@ -1,4 +1,4 @@ -from typing import Union, Dict, Optional +from typing import Union, Dict, Tuple, Optional import torch import torch.nn as nn @@ -35,6 +35,7 @@ def __init__( norm_type: Optional[str] = None, sigma_type: Optional[str] = 'independent', bound_type: Optional[str] = None, + encoder: Optional[Tuple[torch.nn.Module, torch.nn.Module]] = None, ) -> None: """ Overview: @@ -66,6 +67,9 @@ def __init__( to ``independent``, which means state-independent sigma parameters. - bound_type (:obj:`Optional[str]`): The type of action bound methods in continuous action space, defaults \ to ``None``, which means no bound. + - encoder (:obj:`Optional[Tuple[torch.nn.Module, torch.nn.Module]]`): The encoder module list, defaults \ + to ``None``, you can define your own actor and critic encoder module and pass it into MAVAC to \ + deal with different observation space. """ super(MAVAC, self).__init__() agent_obs_shape: int = squeeze(agent_obs_shape) @@ -74,42 +78,38 @@ def __init__( self.global_obs_shape, self.agent_obs_shape, self.action_shape = global_obs_shape, agent_obs_shape, action_shape self.action_space = action_space # Encoder Type - # We directly connect the Head after a Liner layer instead of using the 3-layer FCEncoder. - # In SMAC task it can obviously improve the performance. - # Users can change the model according to their own needs. - self.actor_encoder = nn.Identity() - self.critic_encoder = nn.Identity() - # Head Type - self.critic_head = nn.Sequential( - nn.Linear(global_obs_shape, critic_head_hidden_size), activation, - RegressionHead( - critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type + if encoder: + self.actor_encoder, self.critic_encoder = encoder + else: + # We directly connect the Head after a Liner layer instead of using the 3-layer FCEncoder. + # In SMAC task it can obviously improve the performance. + # Users can change the model according to their own needs. 
+ self.actor_encoder = nn.Sequential( + nn.Linear(agent_obs_shape, actor_head_hidden_size), + activation, + ) + self.critic_encoder = nn.Sequential( + nn.Linear(global_obs_shape, critic_head_hidden_size), + activation, ) + # Head Type + self.critic_head = RegressionHead( + critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type ) assert self.action_space in ['discrete', 'continuous'], self.action_space if self.action_space == 'discrete': - self.actor_head = nn.Sequential( - nn.Linear(agent_obs_shape, actor_head_hidden_size), activation, - DiscreteHead( - actor_head_hidden_size, - action_shape, - actor_head_layer_num, - activation=activation, - norm_type=norm_type - ) + self.actor_head = DiscreteHead( + actor_head_hidden_size, action_shape, actor_head_layer_num, activation=activation, norm_type=norm_type ) elif self.action_space == 'continuous': - self.actor_head = nn.Sequential( - nn.Linear(agent_obs_shape, actor_head_hidden_size), activation, - ReparameterizationHead( - actor_head_hidden_size, - action_shape, - actor_head_layer_num, - sigma_type=sigma_type, - activation=activation, - norm_type=norm_type, - bound_type=bound_type - ) + self.actor_head = ReparameterizationHead( + actor_head_hidden_size, + action_shape, + actor_head_layer_num, + sigma_type=sigma_type, + activation=activation, + norm_type=norm_type, + bound_type=bound_type ) # must use list, not nn.ModuleList self.actor = [self.actor_encoder, self.actor_head] @@ -261,7 +261,7 @@ def compute_actor_critic(self, x: Dict) -> Dict: - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. Shapes: - logit (:obj:`torch.FloatTensor`): :math:`(B, M, N)`, where B is batch size and N is ``action_shape`` \ - and M is ``agent_num``. + and M is ``agent_num``. - value (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch sizeand M is ``agent_num``. 
        Examples:
@@ -275,6 +275,16 @@ def compute_actor_critic(self, x: Dict) -> Dict:
            >>> assert outputs['value'].shape == torch.Size([10, 8])
            >>> assert outputs['logit'].shape == torch.Size([10, 8, 14])
        """
-        logit = self.compute_actor(x)['logit']
-        value = self.compute_critic(x)['value']
+        x_actor = self.actor_encoder(x['agent_state'])
+        x_critic = self.critic_encoder(x['global_state'])
+
+        if self.action_space == 'discrete':
+            action_mask = x['action_mask']
+            x = self.actor_head(x_actor)
+            logit = x['logit']
+            logit[action_mask == 0.0] = -99999999
+        elif self.action_space == 'continuous':
+            x = self.actor_head(x_actor)
+            logit = x
+        value = self.critic_head(x_critic)['pred']
        return {'logit': logit, 'value': value}
diff --git a/ding/model/template/tests/test_language_transformer.py b/ding/model/template/tests/test_language_transformer.py
index 40095c2ab2..eaaaae5a84 100644
--- a/ding/model/template/tests/test_language_transformer.py
+++ b/ding/model/template/tests/test_language_transformer.py
@@ -17,9 +17,33 @@ def check_model(self):
        cands_list = [problems[pid] for pid in cand_pids]

        model = LanguageTransformer(model_name="bert-base-uncased", add_linear=True, embedding_size=256)
-        scores = model(ctxt_list, cands_list)
-        assert scores.shape == (1, 3)
+        output = model(ctxt_list, cands_list, mode='compute_actor')
+        assert 'dist' in output.keys() and 'logit' in output.keys() and len(output.keys()) == 2
+        assert output['logit'].shape == (1, 3)

-        model = LanguageTransformer(model_name="bert-base-uncased", add_linear=False, embedding_size=256)
-        scores = model(ctxt_list, cands_list)
-        assert scores.shape == (1, 3)
+        output = model(ctxt_list, cands_list, mode='compute_critic')
+        assert 'value' in output.keys() and len(output.keys()) == 1
+        assert output['value'].shape == (1, 1)
+
+        output = model(ctxt_list, cands_list, mode='compute_actor_critic')
+        assert 'value' in output.keys() and 'dist' in output.keys() and 'logit' in output.keys() and len(
+            output.keys()
+        ) == 3
+        assert output['value'].shape == (1, 1)
+        assert output['logit'].shape == (1, 3)
+
+        model = LanguageTransformer(model_name="bert-base-uncased", add_linear=False, norm_embedding=True)
+        output = model(ctxt_list, cands_list, mode='compute_actor')
+        assert 'dist' in output.keys() and 'logit' in output.keys() and len(output.keys()) == 2
+        assert output['logit'].shape == (1, 3)
+
+        output = model(ctxt_list, cands_list, mode='compute_critic')
+        assert 'value' in output.keys() and len(output.keys()) == 1
+        assert output['value'].shape == (1, 1)
+
+        output = model(ctxt_list, cands_list, mode='compute_actor_critic')
+        assert 'value' in output.keys() and 'dist' in output.keys() and 'logit' in output.keys() and len(
+            output.keys()
+        ) == 3
+        assert output['value'].shape == (1, 1)
+        assert output['logit'].shape == (1, 3)
diff --git a/ding/model/template/tests/test_mavac.py b/ding/model/template/tests/test_mavac.py
index f6c6927373..9e8a3ec27b 100644
--- a/ding/model/template/tests/test_mavac.py
+++ b/ding/model/template/tests/test_mavac.py
@@ -1,6 +1,7 @@
import pytest
import numpy as np
import torch
+import torch.nn as nn
from itertools import product

from ding.model import mavac
@@ -50,3 +51,39 @@ def test_vac(self, agent_obs_shape, global_obs_shape):
        value = model(data, mode='compute_critic')['value']
        assert value.shape == (B, agent_num)
        self.output_check(model.critic, value, action_shape)
+
+    def test_vac_with_encoder(self, agent_obs_shape, global_obs_shape):
+        data = {
+            'agent_state': torch.randn(B, agent_num, agent_obs_shape),
+            'global_state':
torch.randn(B, agent_num, global_obs_shape), + 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) + } + + actor_size, critic_size = 128, 128 + encoder = [nn.Linear(agent_obs_shape, actor_size), nn.Linear(global_obs_shape, critic_size)] + model = MAVAC( + agent_obs_shape, + global_obs_shape, + action_shape, + agent_num, + encoder=encoder, + actor_head_hidden_size=actor_size, + critic_head_hidden_size=critic_size + ) + + logit = model(data, mode='compute_actor_critic')['logit'] + value = model(data, mode='compute_actor_critic')['value'] + + outputs = value.sum() + logit.sum() + self.output_check(model, outputs, action_shape) + + for p in model.parameters(): + p.grad = None + logit = model(data, mode='compute_actor')['logit'] + self.output_check(model.actor, logit, model.action_shape) + + for p in model.parameters(): + p.grad = None + value = model(data, mode='compute_critic')['value'] + assert value.shape == (B, agent_num) + self.output_check(model.critic, value, action_shape) diff --git a/ding/model/wrapper/model_wrappers.py b/ding/model/wrapper/model_wrappers.py index e427587327..94f5b86ac4 100644 --- a/ding/model/wrapper/model_wrappers.py +++ b/ding/model/wrapper/model_wrappers.py @@ -866,10 +866,14 @@ def forward(self, *args, **kwargs): assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) if 'action' in output or 'action_args' in output: key = 'action' if 'action' in output else 'action_args' - action = output[key] + # handle hybrid action space by adding noise to continuous part of model output + action = output[key]['action_args'] if isinstance(output[key], dict) else output[key] assert isinstance(action, torch.Tensor) action = self.add_noise(action) - output[key] = action + if isinstance(output[key], dict): + output[key]['action_args'] = action + else: + output[key] = action return output def add_noise(self, action: torch.Tensor) -> torch.Tensor: diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 1f202da3bb..5015823d71 100755 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -56,4 +56,5 @@ # new-type policy from .ppof import PPOFPolicy from .prompt_pg import PromptPGPolicy +from .prompt_awr import PromptAWRPolicy from .happo import HAPPOPolicy diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 2e817ead4b..1289d8e6ca 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -52,6 +52,7 @@ from .prompt_pg import PromptPGPolicy from .plan_diffuser import PDPolicy from .happo import HAPPOPolicy +from .prompt_awr import PromptAWRPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -455,3 +456,8 @@ def _get_setting_eval(self, command_info: dict) -> dict: @POLICY_REGISTRY.register('prompt_pg_command') class PromptPGCommandModePolicy(PromptPGPolicy, DummyCommandModePolicy): pass + + +@POLICY_REGISTRY.register('prompt_awr_command') +class PromptAWRCommandModePolicy(PromptAWRPolicy, DummyCommandModePolicy): + pass diff --git a/ding/policy/prompt_awr.py b/ding/policy/prompt_awr.py new file mode 100644 index 0000000000..4b39057d22 --- /dev/null +++ b/ding/policy/prompt_awr.py @@ -0,0 +1,274 @@ +from collections import namedtuple +from typing import List, Dict, Any, Tuple, Union + +import torch + +from ding.model import model_wrap +from ding.rl_utils import get_train_sample +from ding.torch_utils import Adam, to_device +from ding.utils import POLICY_REGISTRY +from ding.utils.data import 
default_collate, default_decollate +from .base_policy import Policy + + +@POLICY_REGISTRY.register('prompt_awr') +class PromptAWRPolicy(Policy): + """ + Overview: + Policy class of AWR (Advantage Weighted Regression) algorithm, proposed in https://arxiv.org/abs/1910.00177. + Especially, this policy is designed for training a language model policy. + In this policy, the environment's observation includes the current context, a list of optional actions + (strings). The final output of the policy is a set of optional actions with a size of ``shot_number``. + """ + config = dict( + # (str) Name of the registered RL policy (refer to the "register_policy" function). + type='prompt_awr', + # (bool) Flag to enable CUDA for model computation. + cuda=False, + # (bool) Flag for using on-policy training (training policy is the same as the behavior policy). + on_policy=False, + # (bool) Flag for enabling priority experience replay. Must be False when priority_IS_weight is False. + priority=False, + # (bool) Flag for using Importance Sampling weights to correct updates. Requires `priority` to be True. + priority_IS_weight=False, + # (str) Type of action space used in the policy, with valid options ['discrete', 'continuous']. + action_space='discrete', + # (int) The number of actions that can be done simultaneously in one timestep. + shot_number=1, + # learn_mode configuration + learn=dict( + # (int) Number of updates per data collection. A2C requires this to be set to 1. + update_per_collect=1, + # (int) Batch size for learning. + batch_size=64, + # (float) Learning rate for optimizer. + learning_rate=0.001, + # (Tuple[float, float]) Coefficients used for computing running averages of gradient and its square. + betas=(0.9, 0.999), + # (float) Term added to the denominator to improve numerical stability in optimizer. + eps=1e-8, + # (float) Maximum norm for gradients. + grad_norm=0.5, + # (float) Scaling factor for value network loss relative to policy network loss. + value_weight=0.5, + # (float) Coefficient that controls the exp scale in awr algorithm. + beta=1.0, + # (float) Weight of entropy regularization in the loss function. + entropy_weight=0.001, + # (Tuple[float, float]) The range of adv. Value that exceeds this range will be clipped. + adv_range=(-0.5, 0.5), + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. + ignore_done=False, + ), + # collect_mode configuration + collect=dict( + # (int) The length of rollout for data collection. + unroll_len=1, + # (float) Discount factor for calculating future rewards, typically in the range [0, 1]. + discount_factor=0.9, + # (float) Trade-off parameter for balancing TD-error and Monte Carlo error in GAE. 
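+            # (Note: returns are computed as plain discounted sums in ``_get_train_sample`` below, so GAE is
+            # not actually applied and this value is currently unused by the policy.)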
+            gae_lambda=0.95,
+        ),
+        # eval_mode configuration (kept empty for compatibility purposes)
+        eval=dict(),
+    )
+
+    def default_model(self) -> Tuple[str, List[str]]:
+        """
+        Overview:
+            Returns the default model configuration used by the AWR algorithm. ``__init__`` method will \
+            automatically call this method to get the default model setting and create model.
+
+        Returns:
+            - model_info (:obj:`Tuple[str, List[str]]`): \
+                Tuple containing the registered model name and model's import_names.
+        """
+        return 'language_transformer', ['ding.model.template.language_transformer']
+
+    def _init_learn(self) -> None:
+        """
+        Overview:
+            Initialize the learn mode of policy, including related attributes and modules. For AWR, it mainly \
+            contains the optimizer, algorithm-specific arguments such as value_weight, entropy_weight and grad_norm, \
+            and the main model. \
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
+            and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
+            with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
+        """
+        assert self._cfg.action_space == "discrete"
+        # Optimizer
+        self._optimizer = Adam(
+            self._model.parameters(),
+            lr=self._cfg.learn.learning_rate,
+            betas=self._cfg.learn.betas,
+            eps=self._cfg.learn.eps
+        )
+
+        # Algorithm config
+        self._priority = self._cfg.priority
+        self._priority_IS_weight = self._cfg.priority_IS_weight
+        self._value_weight = self._cfg.learn.value_weight
+        self._entropy_weight = self._cfg.learn.entropy_weight
+        self._grad_norm = self._cfg.learn.grad_norm
+
+        # Main and target models
+        self._learn_model = self._model
+
+    def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        # Data preprocessing operations, such as stack data, cpu to cuda device
+        self._learn_model.train()
+
+        for i in range(0, len(data), self._cfg.learn.batch_size):
+            batch = default_collate(data[i:i + self._cfg.learn.batch_size])
+            if self._cuda:
+                batch = to_device(batch, self._device)
+
+            # Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected)
+            train_samples, cand_samples = batch["obs"]["train_sample"], batch["obs"]["candidate_samples"]
+            for cand_n in range(len(cand_samples)):
+                cand_samples[cand_n] = cand_samples[cand_n][0]
+            output = self._learn_model.forward(train_samples, cand_samples, mode='compute_actor_critic')
+            return_ = batch['return']
+
+            # Calculate AWR loss
+            real_act = batch['action']
+
+            # Ensure the shape of real_act is: (B, shot_number)
+            if len(real_act.shape) == 1:
+                real_act = real_act.unsqueeze(-1)
+
+            # Calculate different parts of loss.
+            total_policy_loss, total_entropy_loss, total_value_loss = 0, 0, 0
+            for shot_n in range(self._cfg.shot_number):
+                log_prob = output['dist'].log_prob(real_act[:, shot_n])
+                # Clamp the adv for better stability.
+                adv = torch.clamp(
+                    return_ - batch['value'], min=self._cfg.learn.adv_range[0], max=self._cfg.learn.adv_range[1]
+                )
+                # The policy loss for AWR algorithm.
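+                # AWR regresses the policy toward the taken action, weighting its log-likelihood by
+                # exp(clipped_advantage / beta): higher-advantage actions receive exponentially larger weight.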
+ policy_loss = -(log_prob * torch.exp(adv / self._cfg.learn.beta)).mean() + total_policy_loss += policy_loss + # The value loss for AWR algorithm. + value_loss = ((return_ - output['value']) ** 2).mean() + total_value_loss += value_loss + # The entropy loss for AWR algorithm. + total_entropy_loss += -self._cfg.learn.entropy_weight * output['dist'].entropy().mean() + total_loss = total_entropy_loss + total_policy_loss + total_value_loss + + self._optimizer.zero_grad() + total_loss.backward() + + grad_norm = torch.nn.utils.clip_grad_norm_( + list(self._learn_model.parameters()), + max_norm=self._grad_norm, + ) + self._optimizer.step() + + return { + 'cur_lr': self._optimizer.param_groups[0]['lr'], + 'total_loss': total_loss.item(), + 'policy_loss': total_policy_loss.item(), + 'entropy_loss': total_entropy_loss.item(), + 'value_loss': total_value_loss.item(), + 'return_abs_max': return_.abs().max().item(), + 'grad_norm': grad_norm, + } + + def _init_collect(self) -> None: + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.collect.discount_factor + self._collect_model = model_wrap(self._model, wrapper_name='combination_multinomial_sample') + + def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \ + that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ + data, such as the action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ + other necessary data for learn mode defined in ``self._process_transition`` method. The key of the \ + dict is the same as the input data, i.e. environment id. + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + self._model.eval() + with torch.no_grad(): + # Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected) + for ii in range(len(data['candidate_samples'])): + data['candidate_samples'][ii] = data['candidate_samples'][ii][0] + output = self._collect_model.forward( + self._cfg.shot_number, data['train_sample'], data['candidate_samples'], mode="compute_actor_critic" + ) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _process_transition(self, obs: Any, policy_output: Dict[str, torch.Tensor], + timestep: namedtuple) -> Dict[str, torch.Tensor]: + return { + 'obs': obs, + 'action': policy_output['action'], + 'value': policy_output['value'], + 'reward': timestep.reward, + 'done': timestep.done, + } + + def _get_train_sample(self, data: list) -> Union[None, List[Any]]: + r""" + Overview: + Get the trajectory and the n step return data, then sample from the n_step return data + Arguments: + - data (:obj:`list`): The trajectory's buffer list + Returns: + - samples (:obj:`dict`): The training samples generated + """ + if self._cfg.learn.ignore_done: + raise NotImplementedError + + R = 0. 
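+        # Accumulate the discounted Monte-Carlo return backwards along the trajectory:
+        # R_t = r_t + gamma * R_{t+1}, stored in each transition under the key 'return'.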
+ for i in reversed(range(len(data))): + R = self._gamma * R + data[i]['reward'] + data[i]['return'] = R + return get_train_sample(data, self._unroll_len) + + def _init_eval(self) -> None: + self._eval_model = model_wrap(self._model, wrapper_name='combination_argmax_sample') + + def _forward_eval(self, data: dict) -> dict: + data_id = list(data.keys()) + data = default_collate(list(data.values())) + self._model.eval() + with torch.no_grad(): + # Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected) + for ii in range(len(data['candidate_samples'])): + data['candidate_samples'][ii] = data['candidate_samples'][ii][0] + output = self._eval_model.forward(self._cfg.shot_number, data['train_sample'], data['candidate_samples']) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _monitor_vars_learn(self) -> List[str]: + return super()._monitor_vars_learn() + \ + ['policy_loss', 'entropy_loss', 'return_abs_max', 'grad_norm', 'value_loss'] diff --git a/ding/policy/prompt_pg.py b/ding/policy/prompt_pg.py index ebccadb8a3..a76e0e5faf 100644 --- a/ding/policy/prompt_pg.py +++ b/ding/policy/prompt_pg.py @@ -26,6 +26,8 @@ class PromptPGPolicy(Policy): on_policy=True, # for pg strictly on policy algorithm, this line should not be modified by users # (bool) whether to use deterministic action for evaluation. deterministic_eval=True, + # (int) The number of actions that can be done simultaneously in one timestep. + shot_number=1, learn=dict( # (int) the number of samples for one update. batch_size=64, @@ -98,6 +100,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # calculate PG loss real_act = batch['action'] # shape: (B, shot_number) + if len(real_act.shape) == 1: + real_act = real_act.unsqueeze(-1) # Calculate loss. 
total_policy_loss, total_entropy_loss = 0, 0 for ii in range(self._cfg.shot_number): diff --git a/dizoo/gym_anytrading/envs/trading_env.py b/dizoo/gym_anytrading/envs/trading_env.py index d4ff57a057..a29c5b2215 100644 --- a/dizoo/gym_anytrading/envs/trading_env.py +++ b/dizoo/gym_anytrading/envs/trading_env.py @@ -252,7 +252,7 @@ def create_collector_env_cfg(cfg: dict) -> List[dict]: collector_env_num = cfg.pop('collector_env_num') collector_env_cfg = [copy.deepcopy(cfg) for _ in range(collector_env_num)] for i in range(collector_env_num): - collector_env_cfg[i]['env_id'] += ('-' + str(i) + 'e') + collector_env_cfg[i]['env_id'] += ('-' + str(i) + 'c') return collector_env_cfg # override diff --git a/dizoo/mujoco/config/ant_onppo_config.py b/dizoo/mujoco/config/ant_onppo_config.py index 32793ffecc..036d651391 100644 --- a/dizoo/mujoco/config/ant_onppo_config.py +++ b/dizoo/mujoco/config/ant_onppo_config.py @@ -5,13 +5,10 @@ exp_name="ant_onppo_seed0", env=dict( env_id='Ant-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=10, evaluator_env_num=10, n_evaluator_episode=10, stop_value=6000, - manager=dict(shared_memory=False, ) ), policy=dict( cuda=True, diff --git a/dizoo/mujoco/config/halfcheetah_onppo_config.py b/dizoo/mujoco/config/halfcheetah_onppo_config.py index f63f296167..b7646bbede 100644 --- a/dizoo/mujoco/config/halfcheetah_onppo_config.py +++ b/dizoo/mujoco/config/halfcheetah_onppo_config.py @@ -7,8 +7,6 @@ exp_name='halfcheetah_onppo_seed0', env=dict( env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, n_evaluator_episode=8, @@ -78,4 +76,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onppo_config.py -s 0` from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy((main_config, create_config), seed=0) \ No newline at end of file + serial_pipeline_onpolicy((main_config, create_config), seed=0) diff --git a/dizoo/mujoco/config/hopper_onppo_config.py b/dizoo/mujoco/config/hopper_onppo_config.py index 0853aa4abb..6d1c57e70e 100644 --- a/dizoo/mujoco/config/hopper_onppo_config.py +++ b/dizoo/mujoco/config/hopper_onppo_config.py @@ -5,15 +5,13 @@ exp_name='hopper_onppo_seed0', env=dict( env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=8, evaluator_env_num=10, n_evaluator_episode=10, stop_value=4000, ), policy=dict( - cuda=False, + cuda=True, recompute_adv=True, action_space='continuous', model=dict( diff --git a/dizoo/mujoco/config/walker2d_onppo_config.py b/dizoo/mujoco/config/walker2d_onppo_config.py index 2437d62e43..58f5e9054f 100644 --- a/dizoo/mujoco/config/walker2d_onppo_config.py +++ b/dizoo/mujoco/config/walker2d_onppo_config.py @@ -7,8 +7,6 @@ exp_name='walker2d_onppo_seed0', env=dict( env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, n_evaluator_episode=8, diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index c150581a5b..700142583b 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -151,7 +151,6 @@ def create_collector_env_cfg(cfg: dict) -> List[dict]: def create_evaluator_env_cfg(cfg: dict) -> List[dict]: evaluator_cfg = copy.deepcopy(cfg) evaluator_env_num = evaluator_cfg.pop('evaluator_env_num', 1) - 
evaluator_cfg.norm_reward.use_norm = False
         return [evaluator_cfg for _ in range(evaluator_env_num)]

     @property
diff --git a/dizoo/tabmwp/config/tabmwp_awr_config.py b/dizoo/tabmwp/config/tabmwp_awr_config.py
new file mode 100644
index 0000000000..7e3f22865f
--- /dev/null
+++ b/dizoo/tabmwp/config/tabmwp_awr_config.py
@@ -0,0 +1,66 @@
+from easydict import EasyDict
+
+tabmwp_prompt_awr_config = dict(
+    exp_name='tabmwp_prompt_awr_seed0',
+    env=dict(
+        collector_env_num=1,
+        evaluator_env_num=1,
+        n_evaluator_episode=1,
+        stop_value=1,
+        cand_number=16,
+        train_number=80,
+        engine='text-davinci-002',
+        temperature=0.,
+        max_tokens=512,
+        top_p=1.,
+        frequency_penalty=0.,
+        presence_penalty=0.,
+        option_inds=["A", "B", "C", "D", "E", "F"],
+        # The API-key of openai. You can get your key in this website: https://platform.openai.com/
+        api_key='',
+        enable_replay=True,
+        prompt_format='TQ-A',
+        seed=0,
+    ),
+    policy=dict(
+        cuda=True,
+        shot_number=2,
+        model=dict(
+            model_name="bert-base-uncased",
+            add_linear=True,
+            freeze_encoder=True,
+            embedding_size=128,
+        ),
+        learn=dict(
+            batch_size=10,
+            # (float) The learning rate of the optimizer.
+            learning_rate=0.001,
+            # (float) The weight of entropy regularization in the loss function.
+            entropy_weight=0.001,
+            weight_decay=5e-3,
+            grad_norm=0.5,
+        ),
+        collect=dict(
+            # (int) Collect n_sample data, then train the model once.
+            n_sample=20,
+            discount_factor=0.,
+        ),
+        eval=dict(evaluator=dict(eval_freq=500, )),
+    ),
+)
+main_config = EasyDict(tabmwp_prompt_awr_config)
+
+tabmwp_prompt_awr_create_config = dict(
+    env=dict(
+        type='tabmwp',
+        import_names=['dizoo.tabmwp.envs.tabmwp_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='prompt_awr'),
+    replay_buffer=dict(type='naive'),
+)
+create_config = EasyDict(tabmwp_prompt_awr_create_config)
+
+if __name__ == '__main__':
+    from ding.entry import serial_pipeline_onpolicy
+    serial_pipeline_onpolicy((main_config, create_config), seed=0)
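For readers skimming the new policy, the following is a self-contained sketch of the loss computed in ``PromptAWRPolicy._forward_learn`` on dummy tensors. It is illustrative only: ``awr_losses`` and the dummy shapes are not part of DI-engine, and the default ``beta``, ``adv_range`` and ``entropy_weight`` values simply mirror the policy config above (detaching ``value`` in the advantage stands in for using the value stored at collect time).

```python
import torch


def awr_losses(logit, value, action, ret, beta=1.0, adv_range=(-0.5, 0.5), entropy_weight=0.001):
    # Candidate-prompt distribution predicted by the actor head.
    dist = torch.distributions.Categorical(logits=logit)
    # Clamp the advantage for stability, then turn it into an exponential weight.
    adv = torch.clamp(ret - value.detach(), min=adv_range[0], max=adv_range[1])
    # Advantage-weighted regression: push up the log-likelihood of high-advantage actions.
    policy_loss = -(dist.log_prob(action) * torch.exp(adv / beta)).mean()
    # Fit the value prediction to the Monte-Carlo return.
    value_loss = ((ret - value) ** 2).mean()
    # Entropy bonus, entered as a negative loss term.
    entropy_loss = -entropy_weight * dist.entropy().mean()
    return policy_loss, value_loss, entropy_loss


if __name__ == "__main__":
    B, N = 4, 3  # batch size, number of candidate prompts
    logit = torch.randn(B, N, requires_grad=True)
    value = torch.randn(B, requires_grad=True)
    action = torch.randint(0, N, (B, ))
    ret = torch.randn(B)
    p, v, e = awr_losses(logit, value, action, ret)
    (p + v + e).backward()
    print(p.item(), v.item(), e.item())
```

The corresponding experiment can then be launched with `python3 -u tabmwp_awr_config.py`, as listed in the README table above.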