diff --git a/libai/layers/layer_norm.py b/libai/layers/layer_norm.py index b154645cc..98f8027b8 100644 --- a/libai/layers/layer_norm.py +++ b/libai/layers/layer_norm.py @@ -126,4 +126,4 @@ def __init__(self, normalized_shape, eps=1e-6, layer_idx=0): self.l2norm_epsilon = eps def forward(self, hidden_states): - return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon) + return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon) diff --git a/projects/T5/configs/mt5_pretrain.py b/projects/T5/configs/mt5_pretrain.py index bd9fd3bd8..304e6c5d3 100644 --- a/projects/T5/configs/mt5_pretrain.py +++ b/projects/T5/configs/mt5_pretrain.py @@ -18,7 +18,7 @@ train_data_path = "projects/T5/data/training_data/part_0" pretrained_model_path = None -micro_batch_size = 64 +micro_batch_size = 4 optim["lr"] = 1e-4 # dataloader @@ -30,7 +30,7 @@ ) ], collate_fn=collate_fn( - vocab_size=12902, + vocab_size=12900, max_seq_length=512, noise_density=0.15, mean_noise_span_length=3, @@ -43,7 +43,7 @@ model = LazyCall(T5ForPreTraining)(cfg=cfg) # model config -model.cfg.vocab_size = 12902 +model.cfg.vocab_size = 12900 model.cfg.hidden_size = 512 model.cfg.hidden_layers = 8 model.cfg.num_attention_heads = 6 @@ -53,7 +53,7 @@ model.cfg.attention_probs_dropout_prob = 0.0 model.cfg.embedding_dropout_prob = 0.0 model.cfg.layernorm_eps = 1e-6 -model.cfg.model_type = "mt5" + model.cfg.pretrained_model_path = pretrained_model_path train.update( @@ -63,7 +63,7 @@ train_epoch=1, train_iter=24000, log_period=10, - amp=dict(enabled=False), + amp=dict(enabled=True), warmup_ratio=1 / 24, # checkpointer=dict(period=10, max_to_keep=20), dist=dict( @@ -89,3 +89,5 @@ train.zero_optimization.enabled = True train.zero_optimization.stage = 2 +train.activation_checkpoint.enabled = False +train.num_accumulation_steps = 8 diff --git a/projects/T5/configs/t5_model_config.py b/projects/T5/configs/t5_model_config.py index 50523f756..53e124239 100644 --- a/projects/T5/configs/t5_model_config.py +++ b/projects/T5/configs/t5_model_config.py @@ -15,7 +15,7 @@ initializer_range=0.02, layernorm_eps=1e-5, amp_enabled=False, - model_type="t5", + model_type="mt5", ) cfg = DictConfig(cfg) diff --git a/projects/T5/models/attention.py b/projects/T5/models/attention.py index a825f681a..20b2b0a08 100644 --- a/projects/T5/models/attention.py +++ b/projects/T5/models/attention.py @@ -147,17 +147,13 @@ def forward( use_cache (bool, optional): it will be set to True, when the model is in the inference phase and used for incremental decoding. Defaults to False. """ - - # hidden_states, encoder_states: [S(0), B] - # attention_mask: [S(0), B] - if encoder_states is not None: encoder_states = encoder_states.to_global(placement=hidden_states.placement) if attention_mask is not None: attention_mask = attention_mask.to_global(placement=hidden_states.placement) - bsz, real_seq_length = hidden_states.size()[:2] + real_seq_length, bsz = hidden_states.size()[:2] if past_key_value is not None: assert ( @@ -166,47 +162,39 @@ def forward( f"Got {len(past_key_value)} past states.\n" real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if encoder_states is None else encoder_states.shape[1] + key_length = real_seq_length if encoder_states is None else encoder_states.shape[0] if self.is_cross_attention: - # if it is cross attention, key and value should be calculated only once, and the - # result can be reused. 
query = self.query(hidden_states) - query = query.view(bsz, -1, self.num_heads, self.head_size) - query = query.permute(0, 2, 1, 3) + query = query.view(-1, bsz, self.num_heads, self.head_size) + query = query.permute(1, 2, 0, 3) # bsz, num_head, seq_len, head_size + if past_key_value is not None: key, value = past_key_value elif encoder_states is not None: key_value = self.key_value(encoder_states) - key_value = key_value.view(bsz, -1, self.num_heads, 2 * self.head_size) - key_value = key_value.permute(0, 2, 1, 3) + key_value = key_value.view(-1, bsz, self.num_heads, 2 * self.head_size) + key_value = key_value.permute(1, 2, 0, 3) key, value = flow.chunk(key_value, chunks=2, dim=-1) else: raise ValueError( "past_key_value and encoder_states cannot be None at the same time." ) else: - # if it is self attention, query, key, and value are all obtained from hidden_states. - # when in the inference phase of an incremental decoder, - # hidden_states is the last-added state, - # the full key and value could be obtained by concatenating with past_key_value. query_key_value = self.query_key_value(hidden_states) - query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size) - query_key_value = query_key_value.permute( - 0, 2, 1, 3 - ) # [bsz, num_heads, src_len, 3 * head_size] - query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1) + attention_scores, value = flow._C.fused_self_attention( + query_key_value, head_size=self.head_size, alpha=1 + ) if past_key_value is not None: past_key, past_value = past_key_value key = flow.cat((past_key.type_as(key), key), dim=2) value = flow.cat((past_value.type_as(value), value), dim=2) - # query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size] if use_cache: past_key_value = (key, value) - # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)] - attention_scores = flow.matmul(query, key, transpose_b=True) + if self.is_cross_attention: + attention_scores = flow.matmul(query, key, transpose_b=True, alpha=1) if position_bias is None: if not self.has_relative_attention_bias: @@ -223,35 +211,27 @@ def forward( if past_key_value is not None: position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - position_bias = position_bias + (1 - attention_mask) * -1000 - position_bias = position_bias.to_global(placement=attention_scores.placement) - - attention_scores = attention_scores + position_bias - - # [S(0), S(1)] x [S(0), B] = [S(0), S(1)] if attention_mask is not None: - attention_scores = flow.mul(attention_scores, attention_mask) - attention_scores = attention_scores - 10000.0 * (1 - attention_mask) - # TODO(xingyu.liao): graph will occur `where_scalar` errors - # when using `masked_fill` - # attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0) - attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] - attention_weights = self.dropout(attention_weights) + if use_cache: + attention_mask = attention_mask.expand_as(attention_scores) + + attention_weights = flow._C.fused_bias_add_scale_mask_softmax_dropout( + attention_scores, + position_bias, + attention_mask, + fill_value=-10000.0, + scale=1, + p=self.attention_dropout_prob, + )[0] else: + attention_scores = attention_scores + position_bias attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] attention_weights = self.dropout(attention_weights) - # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)] context = 
flow.matmul(attention_weights, value) - # Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size] - context = context.transpose(1, 2) - # Concat multi-head results from - # [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size] - # SBP sign: [S(0), S(2)] - # [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B] + context = flow._C.transpose(context, perm=(2, 0, 1, 3)) + output = self.dense(context.flatten(2)) output = self.output_dropout(output) @@ -272,7 +252,6 @@ def extra_repr(self) -> str: def _relative_position_bucket( self, relative_position, bidirectional=True, num_buckets=32, max_distance=128 ): - # relative_position: (seq_len, seq_len) relative_buckets = 0 if bidirectional: num_buckets //= 2 diff --git a/projects/T5/models/embedding.py b/projects/T5/models/embedding.py index 3d51a16b0..5c981ab78 100644 --- a/projects/T5/models/embedding.py +++ b/projects/T5/models/embedding.py @@ -97,15 +97,9 @@ def __init__( ) ) self.init_method(self.weight) - # FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now. - # self._fill_padding_idx_with_zero() def forward(self, input_ids): weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight - # embeddings with sbp sign: [B, B] - # [B, B] x [S(0), B] --> [S(0), B] - # ↑ ↑ ↑ - # embed pos_ids pos_embed input_embeds = flow._C.gather(weight, input_ids, axis=0) return input_embeds diff --git a/projects/T5/models/layer_norm.py b/projects/T5/models/layer_norm.py index b7e9864a5..35c3262a1 100644 --- a/projects/T5/models/layer_norm.py +++ b/projects/T5/models/layer_norm.py @@ -33,4 +33,4 @@ def __init__(self, normalized_shape, eps=1e-6, layer_idx=0): self.l2norm_epsilon = eps def forward(self, hidden_states): - return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon) + return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon) diff --git a/projects/T5/models/logits.py b/projects/T5/models/logits.py index 27fd2ae01..0eb8e213a 100644 --- a/projects/T5/models/logits.py +++ b/projects/T5/models/logits.py @@ -13,40 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import oneflow as flow from oneflow import nn from libai.layers import Linear -from libai.utils import distributed as dist class LMLogits(nn.Module): - def __init__(self, vocab_size, hidden_size=None, bias=False, model_type="t5", layer_idx=-1): + def __init__(self, vocab_size, hidden_size=None, bias=False, layer_idx=-1): super().__init__() - self.model_type = model_type - if model_type == "t5": - self.bias = ( - nn.Parameter( - flow.zeros( - (vocab_size,), - dtype=flow.float32, - placement=dist.get_layer_placement(layer_idx), - sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]), - ) - ) - if bias - else None - ) - elif model_type == "mt5": - self.linear = Linear(hidden_size, vocab_size, bias=False, layer_idx=layer_idx) + self.linear = Linear(hidden_size, vocab_size, bias=bias, layer_idx=layer_idx) - def forward(self, input, word_embeddings=None): - if self.model_type == "t5": - w = word_embeddings.to_global(placement=input.placement) - input = input.to_global(grad_sbp=input.sbp) - logits = flow._C.matmul(input, w, transpose_b=True) - if self.bias is not None: - logits = logits + self.bias - else: - logits = self.linear(input) + def forward(self, input): + logits = self.linear(input) return logits diff --git a/projects/T5/models/mlp.py b/projects/T5/models/mlp.py index 3a69d5816..7160599b0 100644 --- a/projects/T5/models/mlp.py +++ b/projects/T5/models/mlp.py @@ -13,58 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import oneflow as flow from oneflow import nn -from libai.layers import Linear, build_activation - - -class T5MLP(nn.Module): - def __init__( - self, - hidden_size, - ffn_hidden_size, - output_dropout_prob=0.0, - init_method=nn.init.xavier_normal_, - output_layer_init_method=None, - *, - layer_idx=0, - ): - super().__init__() - self.output_dropout_prob = output_dropout_prob - - if output_layer_init_method is None: - output_layer_init_method = init_method - - self.dense_h_to_4h = Linear( - hidden_size, - ffn_hidden_size, - bias=False, - parallel="col", - skip_bias_add=False, - init_method=init_method, - layer_idx=layer_idx, - ) - - self.activation_func = build_activation("relu") - - self.dense_4h_to_h = Linear( - ffn_hidden_size, - hidden_size, - bias=False, - parallel="row", - skip_bias_add=False, - init_method=output_layer_init_method, - layer_idx=layer_idx, - ) - - self.dropout = nn.Dropout(self.output_dropout_prob) - - def forward(self, hidden_states): - intermediate = self.dense_h_to_4h(hidden_states) - intermediate = self.activation_func(intermediate) - output = self.dense_4h_to_h(intermediate) - output = self.dropout(output) - return output +from libai.layers import Linear class MT5MLP(nn.Module): @@ -104,8 +56,6 @@ def __init__( layer_idx=layer_idx, ) - self.activation_func = build_activation("gelu_tanh") - self.wo = Linear( ffn_hidden_size, hidden_size, @@ -120,9 +70,8 @@ def __init__( def forward(self, hidden_states): wi_0_out = self.wi_0(hidden_states) - hidden_gelu = self.activation_func(wi_0_out) hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear + hidden_states = flow._C.fused_fast_gelu_mul(wi_0_out, hidden_linear) output = self.wo(hidden_states) output = self.dropout(output) return output diff --git a/projects/T5/models/t5_model.py b/projects/T5/models/t5_model.py index 2551d6c54..251961b27 100644 --- a/projects/T5/models/t5_model.py +++ b/projects/T5/models/t5_model.py @@ -17,7 +17,7 @@ import oneflow.nn as nn from libai.config import configurable -from 
libai.layers import Linear, LMLogits +from libai.layers import Linear from libai.models.t5_model import T5Loss from libai.models.utils import init_method_normal, scaled_init_method_normal from libai.utils import distributed as dist @@ -45,10 +45,9 @@ def __init__( initializer_range=0.02, layernorm_eps=1e-12, amp_enabled=False, - model_type="t5", ) -> None: super().__init__() - self.model_type = model_type + self.model_type = "mt5" init_method = init_method_normal(initializer_range) scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers) self.embedding = T5Embedding( @@ -75,7 +74,6 @@ def __init__( init_method=init_method, output_layer_init_method=scaled_init_method, layer_idx=i, - model_type=model_type, has_relative_attention_bias=bool(i == 0), ) for i in range(hidden_layers) @@ -107,7 +105,6 @@ def __init__( init_method=init_method, output_layer_init_method=scaled_init_method, layer_idx=i, - model_type=model_type, has_relative_attention_bias=bool(i - hidden_layers == 0), ) for i in range(hidden_layers, 2 * hidden_layers) @@ -127,12 +124,7 @@ def __init__( self.encoder_states = None self.past_length = 0 - if model_type == "mt5": - self.lm_head = Linear( - hidden_size, vocab_size, bias=False, layer_idx=2 * hidden_layers - 1 - ) - else: - self.lm_head = LMLogits(vocab_size, bias=False) + self.lm_head = Linear(hidden_size, vocab_size, bias=False, layer_idx=2 * hidden_layers - 1) @classmethod def from_config(cls, cfg): @@ -150,7 +142,7 @@ def from_config(cls, cfg): "initializer_range": cfg.initializer_range, "layernorm_eps": cfg.layernorm_eps, "amp_enabled": cfg.amp_enabled, - "model_type": cfg.model_type, + # "model_type": cfg.model_type, } def forward( @@ -177,8 +169,10 @@ def forward( encoder_decoder_position_bias = None self.set_cache(encoder_states=None, past_key_values=None) encoder_attn_mask = self.extended_attn_mask(encoder_attn_mask) - enc_embedding_output = self.embedding(encoder_input_ids) - enc_hidden_states = enc_embedding_output + + enc_hidden_states = self.embedding(encoder_input_ids) + + enc_hidden_states = enc_hidden_states.transpose(0, 1) for layer in self.encoder.layers: enc_hidden_states, position_bias = layer( @@ -193,8 +187,10 @@ def forward( ) encoder_decoder_attn_mask = self.extended_attn_mask(encoder_decoder_attn_mask) - dec_embedding_output = self.embedding(decoder_input_ids) - dec_hidden_states = dec_embedding_output + dec_hidden_states = self.embedding(decoder_input_ids) + + dec_hidden_states = dec_hidden_states.transpose(0, 1) + if use_cache: presents = [] @@ -219,10 +215,7 @@ def forward( decoder_states = self.decoder.final_layernorm(dec_hidden_states) - if self.model_type == "mt5": - logits = self.lm_head(decoder_states) - else: - logits = self.lm_head(decoder_states, self.embedding.word_embeddings.weight) + logits = self.lm_head(decoder_states) return logits @@ -271,7 +264,7 @@ def forward( encoder_decoder_attn_mask, use_cache=use_cache, ) - + logits = logits.transpose(0, 1) if lm_labels is not None: lm_loss = self.loss_func(logits, lm_labels, loss_mask) return lm_loss @@ -342,3 +335,14 @@ def set_pipeline_stage_id(model): dist_utils.get_layer_stage_id(model.t5_model.decoder.final_layernorm.layer_idx), dist.get_layer_placement(model.t5_model.decoder.final_layernorm.layer_idx), ) + + @staticmethod + def set_activation_checkpoint(model): + for module_block in model.modules(): + # Old API in OneFlow 0.8 + if hasattr(module_block, "origin"): + if isinstance(module_block.origin, TransformerLayer): + 
module_block.config.activation_checkpointing = True + else: + if isinstance(module_block.to(nn.Module), TransformerLayer): + module_block.to(nn.graph.GraphModule).activation_checkpointing = True diff --git a/projects/T5/models/transformer_layer.py b/projects/T5/models/transformer_layer.py index c23cb903d..9b3f8f671 100644 --- a/projects/T5/models/transformer_layer.py +++ b/projects/T5/models/transformer_layer.py @@ -19,7 +19,7 @@ from libai.utils import distributed as dist from projects.T5.models.attention import MultiheadAttention from projects.T5.models.layer_norm import LayerNorm -from projects.T5.models.mlp import MT5MLP, T5MLP +from projects.T5.models.mlp import MT5MLP class TransformerLayer(nn.Module): @@ -60,7 +60,6 @@ def __init__( output_layer_init_method=None, *, layer_idx=0, - model_type="t5", has_relative_attention_bias=False ): super().__init__() @@ -104,24 +103,14 @@ def __init__( self.post_cross_attention_layernorm = LayerNorm( self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx ) - if model_type == "mt5": - self.mlp = MT5MLP( - self.hidden_size, - self.ffn_hidden_size, - self.output_dropout_prob, - self.init_method, - output_layer_init_method=self.output_layer_init_method, - layer_idx=self.layer_idx, - ) - elif model_type == "t5": - self.mlp = T5MLP( - self.hidden_size, - self.ffn_hidden_size, - self.output_dropout_prob, - self.init_method, - output_layer_init_method=self.output_layer_init_method, - layer_idx=self.layer_idx, - ) + self.mlp = MT5MLP( + self.hidden_size, + self.ffn_hidden_size, + self.output_dropout_prob, + self.init_method, + output_layer_init_method=self.output_layer_init_method, + layer_idx=self.layer_idx, + ) def forward( self, @@ -152,10 +141,8 @@ def forward( use_cache: it will be set to `True` when the model is in the inference phase and used for incremental decoding. 
""" - # Change placement for pipeline parallelsim hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx)) - # hidden_states shape: (batch_size, seq_length, hidden_size) if attention_mask is not None: attention_mask = attention_mask.to_global( placement=dist.get_layer_placement(self.layer_idx) diff --git a/tests/model_utils/test_t5_loader.py b/tests/model_utils/test_t5_loader.py index 62587b3cb..2bd1d59e9 100644 --- a/tests/model_utils/test_t5_loader.py +++ b/tests/model_utils/test_t5_loader.py @@ -26,15 +26,15 @@ from libai.utils import distributed as dist from libai.utils.file_utils import get_data_from_cache from libai.utils.logger import setup_logger -from projects.MT5.configs.mt5_base import cfg as libai_cfg -from projects.MT5.mt5_model import MT5Model from projects.MT5.utils.mt5_loader import T5LoaderHuggerFace +from projects.T5.configs.t5_model_config import cfg as libai_cfg +from projects.T5.models.t5_model import T5Model -PRETRAINED_MODEL_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/t5_utils/pytorch_model.bin" # noqa -PRETRAINED_MODEL_CONFIG_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/t5_utils/config.json" # noqa +PRETRAINED_MODEL_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/mt5_utils/pytorch_model.bin" # noqa +PRETRAINED_MODEL_CONFIG_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/mt5_utils/config.json" # noqa -PRETRAINED_MODEL_MD5 = "952862a8ba425a25739a69e5f33b0df8" -PRETRAINED_MODEL_CONFIG_MD5 = "7ebc91dc4377c01190f4116c3c1ac6cd" +PRETRAINED_MODEL_MD5 = "4c9c0be541b89de9b01c597ec4cc371a" +PRETRAINED_MODEL_CONFIG_MD5 = "b159e41603b7eeaf9a9c489165bbcaca" TEST_OUTPUT = os.path.join(os.getenv("TEST_OUTPUT", "output_unittest"), "test_t5_utils") @@ -64,18 +64,21 @@ def setUp(self) -> None: [101, 2009, 1005, 1055, 2986, 2651, 1012, 102], [101, 2028, 12314, 3377, 102, 0, 0, 0], [101, 2064, 2017, 3305, 2009, 102, 0, 0], + [101, 2064, 2017, 3305, 2009, 102, 0, 0], ] self.encoder_att_mask = [ [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1], ] self.decoder_input_ids = [ [101, 2009, 1005, 1055, 2986], [101, 2028, 12314, 3377, 102], [101, 2064, 2017, 3305, 2009], + [101, 2064, 2017, 3305, 2009], ] - self.decoder_att_mask = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]] + self.decoder_att_mask = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]] @classmethod def tearDownClass(cls) -> None: @@ -96,13 +99,13 @@ def test_t5_loader_with_data_tensor_parallel(self): # load model load_func = T5LoaderHuggerFace( - model=MT5Model, + model=T5Model, libai_cfg=libai_cfg, pretrained_model_path=self.pretrained_model_path, hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, embedding_dropout_prob=0.0, - model_type="t5", + model_type="mt5", ) model = load_func.load() model.eval() @@ -121,13 +124,13 @@ def test_t5_loader_with_data_tensor_parallel(self): ) encode_att_mask = flow.tensor( self.encoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) decoder_att_mask = flow.tensor( self.decoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) @@ -137,7 +140,7 @@ def 
test_t5_loader_with_data_tensor_parallel(self): ) self.assertTrue( np.allclose( - np.array(-9836561.0), + np.array(-1.1011268e08), logits.sum().data.numpy(), ) ) @@ -150,20 +153,20 @@ def test_t5_loader_with_data_tensor_pipeline_parallel(self): data_parallel_size=2, tensor_parallel_size=1, pipeline_parallel_size=2, - pipeline_num_layers=24, + pipeline_num_layers=16, ) ) dist.setup_dist_util(dist_cfg) # load model load_func = T5LoaderHuggerFace( - model=MT5Model, + model=T5Model, libai_cfg=libai_cfg, pretrained_model_path=self.pretrained_model_path, hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, embedding_dropout_prob=0.0, - model_type="t5", + model_type="mt5", ) model = load_func.load() model.eval() @@ -182,13 +185,13 @@ def test_t5_loader_with_data_tensor_pipeline_parallel(self): ) encode_att_mask = flow.tensor( self.encoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) decoder_att_mask = flow.tensor( self.decoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) @@ -198,7 +201,7 @@ def test_t5_loader_with_data_tensor_pipeline_parallel(self): ) self.assertTrue( np.allclose( - np.array(-9836561.0), + np.array(-1.1011268e08), logits.sum().data.numpy(), ) )
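
For reference, below is a minimal standalone sketch (not part of the patch) of two conventions the changes above rely on: the (seq_len, batch, hidden) activation layout introduced by the `.transpose(0, 1)` calls around the encoder/decoder, and the `flow._C.rms_norm` call that replaces `flow._C.rms_layer_norm`. It assumes an OneFlow build that exposes these functional ops with the signatures used in the patch itself; tensor names and sizes are illustrative only.

```python
# Illustrative sketch only, mirroring the call sites introduced in the patch.
# Assumes an OneFlow build that exposes flow._C.rms_norm; shapes are arbitrary.
import oneflow as flow

batch_size, seq_length, hidden_size = 4, 512, 512
eps = 1e-6

# The patch feeds embeddings to the encoder/decoder in (seq_len, batch, hidden)
# layout, hence the .transpose(0, 1) after the embedding lookup and the
# logits.transpose(0, 1) before the loss is computed.
hidden_states = flow.randn(batch_size, seq_length, hidden_size)
hidden_states = hidden_states.transpose(0, 1)  # -> (seq_len, batch, hidden)

# RMSNorm as called in the patched LayerNorm.forward:
# flow._C.rms_norm(x, weight, normalized_shape, eps) normalizes over the
# trailing dims given by weight.shape and scales by the learned weight.
weight = flow.ones(hidden_size)
normed = flow._C.rms_norm(hidden_states, weight, weight.shape, eps)
print(normed.shape)  # same shape as the input: (512, 4, 512)
```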