diff --git a/README.md b/README.md index b7b3cd76fd..73f4dfdf79 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ It provides **python-first** and **asynchronous-native** task and middleware abs - Offline RL algorithms: BCQ, CQL, TD3BC, Decision Transformer, EDAC, Diffuser, Decision Diffuser, SO2 - Model-based RL algorithms: SVG, STEVE, MBPO, DDPPO, DreamerV3 - Exploration algorithms: HER, RND, ICM, NGU -- LLM + RL Algorithms: PPO-max, DPO, PromptPG +- LLM + RL Algorithms: PPO-max, DPO, PromptPG, PromptAWR - Other algorithms: such as PER, PLR, PCGrad - MCTS + RL algorithms: AlphaZero, MuZero, please refer to [LightZero](https://github.com/opendilab/LightZero) - Generative Model + RL algorithms: Diffusion-QL, QGPO, SRPO, please refer to [GenerativeRL](https://github.com/opendilab/GenerativeRL) @@ -283,6 +283,7 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo` | 54 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 | | 55 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)
[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 |
| 56 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 |
+| 57 | [AWR](https://arxiv.org/pdf/1910.00177) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [policy/prompt_awr](https://github.com/opendilab/DI-engine/blob/main/ding/policy/prompt_awr.py) | python3 -u tabmwp_awr_config.py |
@@ -491,7 +492,7 @@ We appreciate all the feedbacks and contributions to improve DI-engine, both alg
```latex
@misc{ding,
    title={DI-engine: A Universal AI System/Engine for Decision Intelligence},
-    author={Yazhe Niu, Jingxin Xu, Yuan Pu, Yunpeng Nie, Jinouwen Zhang, Shuai Hu, Liangxuan Zhao, Ming Zhang, Yu Liu},
+    author={Niu, Yazhe and Xu, Jingxin and Pu, Yuan and Nie, Yunpeng and Zhang, Jinouwen and Hu, Shuai and Zhao, Liangxuan and Zhang, Ming and Liu, Yu},
    publisher={GitHub},
    howpublished={\url{https://github.com/opendilab/DI-engine}},
    year={2021},
diff --git a/ding/model/template/language_transformer.py b/ding/model/template/language_transformer.py
index cac2d69adf..796b2ba0a7 100644
--- a/ding/model/template/language_transformer.py
+++ b/ding/model/template/language_transformer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Dict, Optional
import torch
from torch import nn
@@ -15,16 +15,21 @@ class LanguageTransformer(nn.Module):
    """
    Overview:
        The LanguageTransformer network. Download a pre-trained language model and add head on it.
+        In the default case, we use a BERT model as the text encoder, whose bidirectional attention is
+        well-suited to obtaining an embedding of the whole sentence.
    Interfaces:
        ``__init__``, ``forward``
    """
+    mode = ['compute_actor', 'compute_critic', 'compute_actor_critic']

    def __init__(
            self,
            model_name: str = "bert-base-uncased",
            add_linear: bool = False,
            embedding_size: int = 128,
-            freeze_encoder: bool = True
+            freeze_encoder: bool = True,
+            hidden_dim: int = 768,
+            norm_embedding: bool = False
    ) -> None:
        """
        Overview:
@@ -32,14 +37,22 @@ def __init__(
        Arguments:
            - model_name (:obj:`str`): The base language model name in huggingface, such as "bert-base-uncased".
            - add_linear (:obj:`bool`): Whether to add a linear layer on the top of language model, defaults to be \
-            ``False``.
+                ``False``.
            - embedding_size (:obj:`int`): The embedding size of the added linear layer, such as 128.
            - freeze_encoder (:obj:`bool`): Whether to freeze the encoder language model while training, \
-            defaults to be ``True``.
+                defaults to be ``True``.
+            - hidden_dim (:obj:`int`): The embedding dimension of the encoding model (e.g. BERT). This value should \
+                correspond to the model you use. For bert-base-uncased, this value is 768.
+            - norm_embedding (:obj:`bool`): Whether to normalize the embedding vectors. Defaults to ``False``.
""" super().__init__() self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.model = AutoModelForTokenClassification.from_pretrained(model_name) + in_channel = hidden_dim if not add_linear else embedding_size + self.value_head = nn.Linear(in_channel, 1) + self.norm = nn.Identity() if not norm_embedding else nn.LayerNorm( + normalized_shape=in_channel, elementwise_affine=False + ) # Freeze transformer encoder and only train the linear layer if freeze_encoder: @@ -49,9 +62,7 @@ def __init__( if add_linear: # Add a small, adjustable linear layer on top of language model tuned through RL self.embedding_size = embedding_size - self.linear = nn.Linear( - self.model.config.hidden_size, embedding_size - ) # 768 for bert-base-uncased, distilbert-base-uncased + self.linear = nn.Linear(self.model.config.hidden_size, embedding_size) else: self.linear = None @@ -66,19 +77,27 @@ def _calc_embedding(self, x: list) -> torch.Tensor: last_hidden_states = output.hidden_states[-1] # Get [CLS] hidden states sentence_embedding = last_hidden_states[:, 0, :] # len(input_list) x hidden_size + sentence_embedding = self.norm(sentence_embedding) if self.linear: sentence_embedding = self.linear(sentence_embedding) # len(input_list) x embedding_size return sentence_embedding - def forward(self, train_samples: List[str], candidate_samples: List[str]) -> Dict: + def forward( + self, + train_samples: List[str], + candidate_samples: Optional[List[str]] = None, + mode: str = 'compute_actor' + ) -> Dict: """ Overview: LanguageTransformer forward computation graph, input two lists of strings and predict their matching scores. + Different ``mode`` will forward with different network modules to get different outputs. Arguments: - train_samples (:obj:`List[str]`): One list of strings. - - candidate_samples (:obj:`List[str]`): The other list of strings to calculate the matching scores. + - candidate_samples (:obj:`Optional[List[str]]`): The other list of strings to calculate matching scores. + - - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - output (:obj:`Dict`): Output dict data, including the logit of matching scores and the \ corresponding ``torch.distributions.Categorical`` object. 
@@ -96,7 +115,15 @@ def forward(self, train_samples: List[str], candidate_samples: List[str]) -> Dic >>> scores = model(ctxt_list, cands_list) >>> assert scores.shape == (1, 3) """ + assert mode in self.mode prompt_embedding = self._calc_embedding(train_samples) - cands_embedding = self._calc_embedding(candidate_samples) - scores = torch.mm(prompt_embedding, cands_embedding.t()) - return {'dist': torch.distributions.Categorical(logits=scores), 'logit': scores} + + res_dict = {} + if mode in ['compute_actor', 'compute_actor_critic']: + cands_embedding = self._calc_embedding(candidate_samples) + scores = torch.mm(prompt_embedding, cands_embedding.t()) + res_dict.update({'dist': torch.distributions.Categorical(logits=scores), 'logit': scores}) + if mode in ['compute_critic', 'compute_actor_critic']: + value = self.value_head(prompt_embedding) + res_dict.update({'value': value}) + return res_dict diff --git a/ding/model/template/mavac.py b/ding/model/template/mavac.py index 78071e6783..9018aa9f04 100644 --- a/ding/model/template/mavac.py +++ b/ding/model/template/mavac.py @@ -1,4 +1,4 @@ -from typing import Union, Dict, Optional +from typing import Union, Dict, Tuple, Optional import torch import torch.nn as nn @@ -35,6 +35,7 @@ def __init__( norm_type: Optional[str] = None, sigma_type: Optional[str] = 'independent', bound_type: Optional[str] = None, + encoder: Optional[Tuple[torch.nn.Module, torch.nn.Module]] = None, ) -> None: """ Overview: @@ -66,6 +67,9 @@ def __init__( to ``independent``, which means state-independent sigma parameters. - bound_type (:obj:`Optional[str]`): The type of action bound methods in continuous action space, defaults \ to ``None``, which means no bound. + - encoder (:obj:`Optional[Tuple[torch.nn.Module, torch.nn.Module]]`): The encoder module list, defaults \ + to ``None``, you can define your own actor and critic encoder module and pass it into MAVAC to \ + deal with different observation space. """ super(MAVAC, self).__init__() agent_obs_shape: int = squeeze(agent_obs_shape) @@ -74,42 +78,38 @@ def __init__( self.global_obs_shape, self.agent_obs_shape, self.action_shape = global_obs_shape, agent_obs_shape, action_shape self.action_space = action_space # Encoder Type - # We directly connect the Head after a Liner layer instead of using the 3-layer FCEncoder. - # In SMAC task it can obviously improve the performance. - # Users can change the model according to their own needs. - self.actor_encoder = nn.Identity() - self.critic_encoder = nn.Identity() - # Head Type - self.critic_head = nn.Sequential( - nn.Linear(global_obs_shape, critic_head_hidden_size), activation, - RegressionHead( - critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type + if encoder: + self.actor_encoder, self.critic_encoder = encoder + else: + # We directly connect the Head after a Liner layer instead of using the 3-layer FCEncoder. + # In SMAC task it can obviously improve the performance. + # Users can change the model according to their own needs. 
+ self.actor_encoder = nn.Sequential( + nn.Linear(agent_obs_shape, actor_head_hidden_size), + activation, + ) + self.critic_encoder = nn.Sequential( + nn.Linear(global_obs_shape, critic_head_hidden_size), + activation, ) + # Head Type + self.critic_head = RegressionHead( + critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type ) assert self.action_space in ['discrete', 'continuous'], self.action_space if self.action_space == 'discrete': - self.actor_head = nn.Sequential( - nn.Linear(agent_obs_shape, actor_head_hidden_size), activation, - DiscreteHead( - actor_head_hidden_size, - action_shape, - actor_head_layer_num, - activation=activation, - norm_type=norm_type - ) + self.actor_head = DiscreteHead( + actor_head_hidden_size, action_shape, actor_head_layer_num, activation=activation, norm_type=norm_type ) elif self.action_space == 'continuous': - self.actor_head = nn.Sequential( - nn.Linear(agent_obs_shape, actor_head_hidden_size), activation, - ReparameterizationHead( - actor_head_hidden_size, - action_shape, - actor_head_layer_num, - sigma_type=sigma_type, - activation=activation, - norm_type=norm_type, - bound_type=bound_type - ) + self.actor_head = ReparameterizationHead( + actor_head_hidden_size, + action_shape, + actor_head_layer_num, + sigma_type=sigma_type, + activation=activation, + norm_type=norm_type, + bound_type=bound_type ) # must use list, not nn.ModuleList self.actor = [self.actor_encoder, self.actor_head] @@ -261,7 +261,7 @@ def compute_actor_critic(self, x: Dict) -> Dict: - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. Shapes: - logit (:obj:`torch.FloatTensor`): :math:`(B, M, N)`, where B is batch size and N is ``action_shape`` \ - and M is ``agent_num``. + and M is ``agent_num``. - value (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch sizeand M is ``agent_num``. 
        Examples:
@@ -275,6 +275,16 @@ def compute_actor_critic(self, x: Dict) -> Dict:
            >>> assert outputs['value'].shape == torch.Size([10, 8])
            >>> assert outputs['logit'].shape == torch.Size([10, 8, 14])
        """
-        logit = self.compute_actor(x)['logit']
-        value = self.compute_critic(x)['value']
+        x_actor = self.actor_encoder(x['agent_state'])
+        x_critic = self.critic_encoder(x['global_state'])
+
+        if self.action_space == 'discrete':
+            action_mask = x['action_mask']
+            x = self.actor_head(x_actor)
+            logit = x['logit']
+            logit[action_mask == 0.0] = -99999999
+        elif self.action_space == 'continuous':
+            x = self.actor_head(x_actor)
+            logit = x
+        value = self.critic_head(x_critic)['pred']
        return {'logit': logit, 'value': value}
diff --git a/ding/model/template/tests/test_language_transformer.py b/ding/model/template/tests/test_language_transformer.py
index 40095c2ab2..eaaaae5a84 100644
--- a/ding/model/template/tests/test_language_transformer.py
+++ b/ding/model/template/tests/test_language_transformer.py
@@ -17,9 +17,33 @@ def check_model(self):
        cands_list = [problems[pid] for pid in cand_pids]

        model = LanguageTransformer(model_name="bert-base-uncased", add_linear=True, embedding_size=256)
-        scores = model(ctxt_list, cands_list)
-        assert scores.shape == (1, 3)
+        output = model(ctxt_list, cands_list, mode='compute_actor')
+        assert 'dist' in output.keys() and 'logit' in output.keys() and len(output.keys()) == 2
+        assert output['logit'].shape == (1, 3)

-        model = LanguageTransformer(model_name="bert-base-uncased", add_linear=False, embedding_size=256)
-        scores = model(ctxt_list, cands_list)
-        assert scores.shape == (1, 3)
+        output = model(ctxt_list, cands_list, mode='compute_critic')
+        assert 'value' in output.keys() and len(output.keys()) == 1
+        assert output['value'].shape == (1, 1)
+
+        output = model(ctxt_list, cands_list, mode='compute_actor_critic')
+        assert 'value' in output.keys() and 'dist' in output.keys() and 'logit' in output.keys() and len(
+            output.keys()
+        ) == 3
+        assert output['value'].shape == (1, 1)
+        assert output['logit'].shape == (1, 3)
+
+        model = LanguageTransformer(model_name="bert-base-uncased", add_linear=False, norm_embedding=True)
+        output = model(ctxt_list, cands_list, mode='compute_actor')
+        assert 'dist' in output.keys() and 'logit' in output.keys() and len(output.keys()) == 2
+        assert output['logit'].shape == (1, 3)
+
+        output = model(ctxt_list, cands_list, mode='compute_critic')
+        assert 'value' in output.keys() and len(output.keys()) == 1
+        assert output['value'].shape == (1, 1)
+
+        output = model(ctxt_list, cands_list, mode='compute_actor_critic')
+        assert 'value' in output.keys() and 'dist' in output.keys() and 'logit' in output.keys() and len(
+            output.keys()
+        ) == 3
+        assert output['value'].shape == (1, 1)
+        assert output['logit'].shape == (1, 3)
diff --git a/ding/model/template/tests/test_mavac.py b/ding/model/template/tests/test_mavac.py
index f6c6927373..9e8a3ec27b 100644
--- a/ding/model/template/tests/test_mavac.py
+++ b/ding/model/template/tests/test_mavac.py
@@ -1,6 +1,7 @@
import pytest
import numpy as np
import torch
+import torch.nn as nn
from itertools import product

from ding.model import mavac
@@ -50,3 +51,39 @@ def test_vac(self, agent_obs_shape, global_obs_shape):
        value = model(data, mode='compute_critic')['value']
        assert value.shape == (B, agent_num)
        self.output_check(model.critic, value, action_shape)
+
+    def test_vac_with_encoder(self, agent_obs_shape, global_obs_shape):
+        data = {
+            'agent_state': torch.randn(B, agent_num, agent_obs_shape),
+            'global_state':
torch.randn(B, agent_num, global_obs_shape), + 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) + } + + actor_size, critic_size = 128, 128 + encoder = [nn.Linear(agent_obs_shape, actor_size), nn.Linear(global_obs_shape, critic_size)] + model = MAVAC( + agent_obs_shape, + global_obs_shape, + action_shape, + agent_num, + encoder=encoder, + actor_head_hidden_size=actor_size, + critic_head_hidden_size=critic_size + ) + + logit = model(data, mode='compute_actor_critic')['logit'] + value = model(data, mode='compute_actor_critic')['value'] + + outputs = value.sum() + logit.sum() + self.output_check(model, outputs, action_shape) + + for p in model.parameters(): + p.grad = None + logit = model(data, mode='compute_actor')['logit'] + self.output_check(model.actor, logit, model.action_shape) + + for p in model.parameters(): + p.grad = None + value = model(data, mode='compute_critic')['value'] + assert value.shape == (B, agent_num) + self.output_check(model.critic, value, action_shape) diff --git a/ding/model/wrapper/model_wrappers.py b/ding/model/wrapper/model_wrappers.py index e427587327..94f5b86ac4 100644 --- a/ding/model/wrapper/model_wrappers.py +++ b/ding/model/wrapper/model_wrappers.py @@ -866,10 +866,14 @@ def forward(self, *args, **kwargs): assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) if 'action' in output or 'action_args' in output: key = 'action' if 'action' in output else 'action_args' - action = output[key] + # handle hybrid action space by adding noise to continuous part of model output + action = output[key]['action_args'] if isinstance(output[key], dict) else output[key] assert isinstance(action, torch.Tensor) action = self.add_noise(action) - output[key] = action + if isinstance(output[key], dict): + output[key]['action_args'] = action + else: + output[key] = action return output def add_noise(self, action: torch.Tensor) -> torch.Tensor: diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 1f202da3bb..5015823d71 100755 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -56,4 +56,5 @@ # new-type policy from .ppof import PPOFPolicy from .prompt_pg import PromptPGPolicy +from .prompt_awr import PromptAWRPolicy from .happo import HAPPOPolicy diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 2e817ead4b..1289d8e6ca 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -52,6 +52,7 @@ from .prompt_pg import PromptPGPolicy from .plan_diffuser import PDPolicy from .happo import HAPPOPolicy +from .prompt_awr import PromptAWRPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -455,3 +456,8 @@ def _get_setting_eval(self, command_info: dict) -> dict: @POLICY_REGISTRY.register('prompt_pg_command') class PromptPGCommandModePolicy(PromptPGPolicy, DummyCommandModePolicy): pass + + +@POLICY_REGISTRY.register('prompt_awr_command') +class PromptAWRCommandModePolicy(PromptAWRPolicy, DummyCommandModePolicy): + pass diff --git a/ding/policy/prompt_awr.py b/ding/policy/prompt_awr.py new file mode 100644 index 0000000000..4b39057d22 --- /dev/null +++ b/ding/policy/prompt_awr.py @@ -0,0 +1,274 @@ +from collections import namedtuple +from typing import List, Dict, Any, Tuple, Union + +import torch + +from ding.model import model_wrap +from ding.rl_utils import get_train_sample +from ding.torch_utils import Adam, to_device +from ding.utils import POLICY_REGISTRY +from ding.utils.data import 
default_collate, default_decollate +from .base_policy import Policy + + +@POLICY_REGISTRY.register('prompt_awr') +class PromptAWRPolicy(Policy): + """ + Overview: + Policy class of AWR (Advantage Weighted Regression) algorithm, proposed in https://arxiv.org/abs/1910.00177. + Especially, this policy is designed for training a language model policy. + In this policy, the environment's observation includes the current context, a list of optional actions + (strings). The final output of the policy is a set of optional actions with a size of ``shot_number``. + """ + config = dict( + # (str) Name of the registered RL policy (refer to the "register_policy" function). + type='prompt_awr', + # (bool) Flag to enable CUDA for model computation. + cuda=False, + # (bool) Flag for using on-policy training (training policy is the same as the behavior policy). + on_policy=False, + # (bool) Flag for enabling priority experience replay. Must be False when priority_IS_weight is False. + priority=False, + # (bool) Flag for using Importance Sampling weights to correct updates. Requires `priority` to be True. + priority_IS_weight=False, + # (str) Type of action space used in the policy, with valid options ['discrete', 'continuous']. + action_space='discrete', + # (int) The number of actions that can be done simultaneously in one timestep. + shot_number=1, + # learn_mode configuration + learn=dict( + # (int) Number of updates per data collection. A2C requires this to be set to 1. + update_per_collect=1, + # (int) Batch size for learning. + batch_size=64, + # (float) Learning rate for optimizer. + learning_rate=0.001, + # (Tuple[float, float]) Coefficients used for computing running averages of gradient and its square. + betas=(0.9, 0.999), + # (float) Term added to the denominator to improve numerical stability in optimizer. + eps=1e-8, + # (float) Maximum norm for gradients. + grad_norm=0.5, + # (float) Scaling factor for value network loss relative to policy network loss. + value_weight=0.5, + # (float) Coefficient that controls the exp scale in awr algorithm. + beta=1.0, + # (float) Weight of entropy regularization in the loss function. + entropy_weight=0.001, + # (Tuple[float, float]) The range of adv. Value that exceeds this range will be clipped. + adv_range=(-0.5, 0.5), + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. + ignore_done=False, + ), + # collect_mode configuration + collect=dict( + # (int) The length of rollout for data collection. + unroll_len=1, + # (float) Discount factor for calculating future rewards, typically in the range [0, 1]. + discount_factor=0.9, + # (float) Trade-off parameter for balancing TD-error and Monte Carlo error in GAE. 
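+            # (Note: returns are computed as plain discounted sums in ``_get_train_sample`` below, so GAE is
+            # not actually applied and this value is currently unused by the policy.)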
+            gae_lambda=0.95,
+        ),
+        # eval_mode configuration (kept empty for compatibility purposes)
+        eval=dict(),
+    )
+
+    def default_model(self) -> Tuple[str, List[str]]:
+        """
+        Overview:
+            Returns the default model configuration used by the AWR algorithm. ``__init__`` method will \
+            automatically call this method to get the default model setting and create model.
+
+        Returns:
+            - model_info (:obj:`Tuple[str, List[str]]`): \
+                Tuple containing the registered model name and model's import_names.
+        """
+        return 'language_transformer', ['ding.model.template.language_transformer']
+
+    def _init_learn(self) -> None:
+        """
+        Overview:
+            Initialize the learn mode of policy, including related attributes and modules. For AWR, it mainly \
+            contains the optimizer, algorithm-specific arguments such as value_weight, entropy_weight and grad_norm, \
+            and the main model. \
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
+            and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
+            with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
+        """
+        assert self._cfg.action_space == "discrete"
+        # Optimizer
+        self._optimizer = Adam(
+            self._model.parameters(),
+            lr=self._cfg.learn.learning_rate,
+            betas=self._cfg.learn.betas,
+            eps=self._cfg.learn.eps
+        )
+
+        # Algorithm config
+        self._priority = self._cfg.priority
+        self._priority_IS_weight = self._cfg.priority_IS_weight
+        self._value_weight = self._cfg.learn.value_weight
+        self._entropy_weight = self._cfg.learn.entropy_weight
+        self._grad_norm = self._cfg.learn.grad_norm
+
+        # Main and target models
+        self._learn_model = self._model
+
+    def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        # Data preprocessing operations, such as stack data, cpu to cuda device
+        self._learn_model.train()
+
+        for i in range(0, len(data), self._cfg.learn.batch_size):
+            batch = default_collate(data[i:i + self._cfg.learn.batch_size])
+            if self._cuda:
+                batch = to_device(batch, self._device)
+
+            # Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected)
+            train_samples, cand_samples = batch["obs"]["train_sample"], batch["obs"]["candidate_samples"]
+            for cand_n in range(len(cand_samples)):
+                cand_samples[cand_n] = cand_samples[cand_n][0]
+            output = self._learn_model.forward(train_samples, cand_samples, mode='compute_actor_critic')
+            return_ = batch['return']
+
+            # Calculate AWR loss
+            real_act = batch['action']
+
+            # Ensure the shape of real_act is: (B, shot_number)
+            if len(real_act.shape) == 1:
+                real_act = real_act.unsqueeze(-1)
+
+            # Calculate different parts of loss.
+            total_policy_loss, total_entropy_loss, total_value_loss = 0, 0, 0
+            for shot_n in range(self._cfg.shot_number):
+                log_prob = output['dist'].log_prob(real_act[:, shot_n])
+                # Clamp the adv for better stability.
+                adv = torch.clamp(
+                    return_ - batch['value'], min=self._cfg.learn.adv_range[0], max=self._cfg.learn.adv_range[1]
+                )
+                # The policy loss for AWR algorithm.
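+                # AWR regresses the policy toward the taken action, weighting its log-likelihood by
+                # exp(clipped_advantage / beta): higher-advantage actions receive exponentially larger weight.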
+ policy_loss = -(log_prob * torch.exp(adv / self._cfg.learn.beta)).mean() + total_policy_loss += policy_loss + # The value loss for AWR algorithm. + value_loss = ((return_ - output['value']) ** 2).mean() + total_value_loss += value_loss + # The entropy loss for AWR algorithm. + total_entropy_loss += -self._cfg.learn.entropy_weight * output['dist'].entropy().mean() + total_loss = total_entropy_loss + total_policy_loss + total_value_loss + + self._optimizer.zero_grad() + total_loss.backward() + + grad_norm = torch.nn.utils.clip_grad_norm_( + list(self._learn_model.parameters()), + max_norm=self._grad_norm, + ) + self._optimizer.step() + + return { + 'cur_lr': self._optimizer.param_groups[0]['lr'], + 'total_loss': total_loss.item(), + 'policy_loss': total_policy_loss.item(), + 'entropy_loss': total_entropy_loss.item(), + 'value_loss': total_value_loss.item(), + 'return_abs_max': return_.abs().max().item(), + 'grad_norm': grad_norm, + } + + def _init_collect(self) -> None: + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.collect.discount_factor + self._collect_model = model_wrap(self._model, wrapper_name='combination_multinomial_sample') + + def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \ + that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ + data, such as the action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ + other necessary data for learn mode defined in ``self._process_transition`` method. The key of the \ + dict is the same as the input data, i.e. environment id. + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + self._model.eval() + with torch.no_grad(): + # Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected) + for ii in range(len(data['candidate_samples'])): + data['candidate_samples'][ii] = data['candidate_samples'][ii][0] + output = self._collect_model.forward( + self._cfg.shot_number, data['train_sample'], data['candidate_samples'], mode="compute_actor_critic" + ) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _process_transition(self, obs: Any, policy_output: Dict[str, torch.Tensor], + timestep: namedtuple) -> Dict[str, torch.Tensor]: + return { + 'obs': obs, + 'action': policy_output['action'], + 'value': policy_output['value'], + 'reward': timestep.reward, + 'done': timestep.done, + } + + def _get_train_sample(self, data: list) -> Union[None, List[Any]]: + r""" + Overview: + Get the trajectory and the n step return data, then sample from the n_step return data + Arguments: + - data (:obj:`list`): The trajectory's buffer list + Returns: + - samples (:obj:`dict`): The training samples generated + """ + if self._cfg.learn.ignore_done: + raise NotImplementedError + + R = 0. 
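+        # Accumulate the discounted Monte-Carlo return backwards along the trajectory:
+        # R_t = r_t + gamma * R_{t+1}, stored in each transition under the key 'return'.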
+ for i in reversed(range(len(data))): + R = self._gamma * R + data[i]['reward'] + data[i]['return'] = R + return get_train_sample(data, self._unroll_len) + + def _init_eval(self) -> None: + self._eval_model = model_wrap(self._model, wrapper_name='combination_argmax_sample') + + def _forward_eval(self, data: dict) -> dict: + data_id = list(data.keys()) + data = default_collate(list(data.values())) + self._model.eval() + with torch.no_grad(): + # Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected) + for ii in range(len(data['candidate_samples'])): + data['candidate_samples'][ii] = data['candidate_samples'][ii][0] + output = self._eval_model.forward(self._cfg.shot_number, data['train_sample'], data['candidate_samples']) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _monitor_vars_learn(self) -> List[str]: + return super()._monitor_vars_learn() + \ + ['policy_loss', 'entropy_loss', 'return_abs_max', 'grad_norm', 'value_loss'] diff --git a/ding/policy/prompt_pg.py b/ding/policy/prompt_pg.py index ebccadb8a3..a76e0e5faf 100644 --- a/ding/policy/prompt_pg.py +++ b/ding/policy/prompt_pg.py @@ -26,6 +26,8 @@ class PromptPGPolicy(Policy): on_policy=True, # for pg strictly on policy algorithm, this line should not be modified by users # (bool) whether to use deterministic action for evaluation. deterministic_eval=True, + # (int) The number of actions that can be done simultaneously in one timestep. + shot_number=1, learn=dict( # (int) the number of samples for one update. batch_size=64, @@ -98,6 +100,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # calculate PG loss real_act = batch['action'] # shape: (B, shot_number) + if len(real_act.shape) == 1: + real_act = real_act.unsqueeze(-1) # Calculate loss. 
total_policy_loss, total_entropy_loss = 0, 0 for ii in range(self._cfg.shot_number): diff --git a/dizoo/gym_anytrading/envs/trading_env.py b/dizoo/gym_anytrading/envs/trading_env.py index d4ff57a057..a29c5b2215 100644 --- a/dizoo/gym_anytrading/envs/trading_env.py +++ b/dizoo/gym_anytrading/envs/trading_env.py @@ -252,7 +252,7 @@ def create_collector_env_cfg(cfg: dict) -> List[dict]: collector_env_num = cfg.pop('collector_env_num') collector_env_cfg = [copy.deepcopy(cfg) for _ in range(collector_env_num)] for i in range(collector_env_num): - collector_env_cfg[i]['env_id'] += ('-' + str(i) + 'e') + collector_env_cfg[i]['env_id'] += ('-' + str(i) + 'c') return collector_env_cfg # override diff --git a/dizoo/mujoco/config/ant_onppo_config.py b/dizoo/mujoco/config/ant_onppo_config.py index 32793ffecc..036d651391 100644 --- a/dizoo/mujoco/config/ant_onppo_config.py +++ b/dizoo/mujoco/config/ant_onppo_config.py @@ -5,13 +5,10 @@ exp_name="ant_onppo_seed0", env=dict( env_id='Ant-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=10, evaluator_env_num=10, n_evaluator_episode=10, stop_value=6000, - manager=dict(shared_memory=False, ) ), policy=dict( cuda=True, diff --git a/dizoo/mujoco/config/halfcheetah_onppo_config.py b/dizoo/mujoco/config/halfcheetah_onppo_config.py index f63f296167..b7646bbede 100644 --- a/dizoo/mujoco/config/halfcheetah_onppo_config.py +++ b/dizoo/mujoco/config/halfcheetah_onppo_config.py @@ -7,8 +7,6 @@ exp_name='halfcheetah_onppo_seed0', env=dict( env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, n_evaluator_episode=8, @@ -78,4 +76,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onppo_config.py -s 0` from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy((main_config, create_config), seed=0) \ No newline at end of file + serial_pipeline_onpolicy((main_config, create_config), seed=0) diff --git a/dizoo/mujoco/config/hopper_onppo_config.py b/dizoo/mujoco/config/hopper_onppo_config.py index 0853aa4abb..6d1c57e70e 100644 --- a/dizoo/mujoco/config/hopper_onppo_config.py +++ b/dizoo/mujoco/config/hopper_onppo_config.py @@ -5,15 +5,13 @@ exp_name='hopper_onppo_seed0', env=dict( env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=8, evaluator_env_num=10, n_evaluator_episode=10, stop_value=4000, ), policy=dict( - cuda=False, + cuda=True, recompute_adv=True, action_space='continuous', model=dict( diff --git a/dizoo/mujoco/config/walker2d_onppo_config.py b/dizoo/mujoco/config/walker2d_onppo_config.py index 2437d62e43..58f5e9054f 100644 --- a/dizoo/mujoco/config/walker2d_onppo_config.py +++ b/dizoo/mujoco/config/walker2d_onppo_config.py @@ -7,8 +7,6 @@ exp_name='walker2d_onppo_seed0', env=dict( env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, n_evaluator_episode=8, diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index c150581a5b..700142583b 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -151,7 +151,6 @@ def create_collector_env_cfg(cfg: dict) -> List[dict]: def create_evaluator_env_cfg(cfg: dict) -> List[dict]: evaluator_cfg = copy.deepcopy(cfg) evaluator_env_num = evaluator_cfg.pop('evaluator_env_num', 1) - 
evaluator_cfg.norm_reward.use_norm = False
         return [evaluator_cfg for _ in range(evaluator_env_num)]

     @property
diff --git a/dizoo/tabmwp/config/tabmwp_awr_config.py b/dizoo/tabmwp/config/tabmwp_awr_config.py
new file mode 100644
index 0000000000..7e3f22865f
--- /dev/null
+++ b/dizoo/tabmwp/config/tabmwp_awr_config.py
@@ -0,0 +1,66 @@
+from easydict import EasyDict
+
+tabmwp_prompt_awr_config = dict(
+    exp_name='tabmwp_prompt_awr_seed0',
+    env=dict(
+        collector_env_num=1,
+        evaluator_env_num=1,
+        n_evaluator_episode=1,
+        stop_value=1,
+        cand_number=16,
+        train_number=80,
+        engine='text-davinci-002',
+        temperature=0.,
+        max_tokens=512,
+        top_p=1.,
+        frequency_penalty=0.,
+        presence_penalty=0.,
+        option_inds=["A", "B", "C", "D", "E", "F"],
+        # The API-key of openai. You can get your key in this website: https://platform.openai.com/
+        api_key='',
+        enable_replay=True,
+        prompt_format='TQ-A',
+        seed=0,
+    ),
+    policy=dict(
+        cuda=True,
+        shot_number=2,
+        model=dict(
+            model_name="bert-base-uncased",
+            add_linear=True,
+            freeze_encoder=True,
+            embedding_size=128,
+        ),
+        learn=dict(
+            batch_size=10,
+            # (float) The learning rate of the optimizer.
+            learning_rate=0.001,
+            # (float) The weight of entropy regularization in the loss function.
+            entropy_weight=0.001,
+            weight_decay=5e-3,
+            grad_norm=0.5,
+        ),
+        collect=dict(
+            # (int) Collect n_sample data, then train the model once.
+            n_sample=20,
+            discount_factor=0.,
+        ),
+        eval=dict(evaluator=dict(eval_freq=500, )),
+    ),
+)
+main_config = EasyDict(tabmwp_prompt_awr_config)
+
+tabmwp_prompt_awr_create_config = dict(
+    env=dict(
+        type='tabmwp',
+        import_names=['dizoo.tabmwp.envs.tabmwp_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='prompt_awr'),
+    replay_buffer=dict(type='naive'),
+)
+create_config = EasyDict(tabmwp_prompt_awr_create_config)
+
+if __name__ == '__main__':
+    from ding.entry import serial_pipeline_onpolicy
+    serial_pipeline_onpolicy((main_config, create_config), seed=0)
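For readers skimming the new policy, the following is a self-contained sketch of the loss computed in ``PromptAWRPolicy._forward_learn`` on dummy tensors. It is illustrative only: ``awr_losses`` and the dummy shapes are not part of DI-engine, and the default ``beta``, ``adv_range`` and ``entropy_weight`` values simply mirror the policy config above (detaching ``value`` in the advantage stands in for using the value stored at collect time).

```python
import torch


def awr_losses(logit, value, action, ret, beta=1.0, adv_range=(-0.5, 0.5), entropy_weight=0.001):
    # Candidate-prompt distribution predicted by the actor head.
    dist = torch.distributions.Categorical(logits=logit)
    # Clamp the advantage for stability, then turn it into an exponential weight.
    adv = torch.clamp(ret - value.detach(), min=adv_range[0], max=adv_range[1])
    # Advantage-weighted regression: push up the log-likelihood of high-advantage actions.
    policy_loss = -(dist.log_prob(action) * torch.exp(adv / beta)).mean()
    # Fit the value prediction to the Monte-Carlo return.
    value_loss = ((ret - value) ** 2).mean()
    # Entropy bonus, entered as a negative loss term.
    entropy_loss = -entropy_weight * dist.entropy().mean()
    return policy_loss, value_loss, entropy_loss


if __name__ == "__main__":
    B, N = 4, 3  # batch size, number of candidate prompts
    logit = torch.randn(B, N, requires_grad=True)
    value = torch.randn(B, requires_grad=True)
    action = torch.randint(0, N, (B, ))
    ret = torch.randn(B)
    p, v, e = awr_losses(logit, value, action, ret)
    (p + v + e).backward()
    print(p.item(), v.item(), e.item())
```

The corresponding experiment can then be launched with `python3 -u tabmwp_awr_config.py`, as listed in the README table above.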