diff --git a/configs/common/models/t5.py b/configs/common/models/t5.py
index 2d5625c41..a23c637fd 100644
--- a/configs/common/models/t5.py
+++ b/configs/common/models/t5.py
@@ -13,7 +13,6 @@
     attention_probs_dropout_prob=0.1,
     max_position_embeddings=512,
     embedding_dropout_prob=0.1,
-    num_tokentypes=0,
     initializer_range=0.02,
     layernorm_eps=1e-5,
     bias_gelu_fusion=True,
diff --git a/libai/engine/default.py b/libai/engine/default.py
index 1e670b651..c5c8cf21f 100644
--- a/libai/engine/default.py
+++ b/libai/engine/default.py
@@ -303,7 +303,17 @@ def __init__(self, cfg):
         self.auto_scale_hyperparams(cfg, self.train_loader)
 
         # Assume these objects must be constructed in this order.
+        dist.synchronize()
+        start_time = time.time()
+        logger.info("> Start building model...")
         self.model = self.build_model(cfg)
+
+        dist.synchronize()
+        logger.info(
+            ">>> done with building model. "
+            "Building time: {:.3f} seconds".format(time.time() - start_time)
+        )
+
         self.optimizer = self.build_optimizer(cfg, self.model)
         self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer)
 
diff --git a/libai/inference/generator/generation_logits_processor.py b/libai/inference/generator/generation_logits_processor.py
index f650177d6..7ca04684b 100644
--- a/libai/inference/generator/generation_logits_processor.py
+++ b/libai/inference/generator/generation_logits_processor.py
@@ -19,11 +19,8 @@
 import math
 from typing import Callable, List, Tuple
 
-import numpy as np
 import oneflow as flow
 
-from libai.utils import distributed as dist
-
 
 class LogitsProcessorList(list):
     def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> flow.Tensor:
@@ -125,20 +122,7 @@ def __call__(self, input_ids, scores, current_tokens, beam_group_idx) -> flow.Te
             previous_group_tokens = current_tokens[
                 batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx
             ]
-            # TODO: bincount
-            previous_group_tokens = (
-                previous_group_tokens.to_global(
-                    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                    placement=flow.placement("cuda", list(range(dist.get_world_size()))),
-                )
-                .to_local()
-                .numpy()
-            )
-            token_frequency = np.bincount(previous_group_tokens, minlength=vocab_size)
-            token_frequency = token_frequency.to_global(
-                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=flow.placement("cuda", list(range(dist.get_world_size()))),
-            )
+            token_frequency = flow.bincount(previous_group_tokens, minlength=vocab_size)
             scores[batch_idx * group_size : (batch_idx + 1) * group_size] = (
                 scores[batch_idx * group_size : (batch_idx + 1) * group_size]
                 - self._diversity_penalty * token_frequency
diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py
index ed14c2e19..5e2513bba 100644
--- a/libai/inference/generator/generation_utils.py
+++ b/libai/inference/generator/generation_utils.py
@@ -468,7 +468,7 @@ def greedy_search(
         stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
         # keep track of which sequences are already finished
-        unfinished_sequences = flow.zeros(input_ids.shape[0]).fill_(1)
+        unfinished_sequences = flow.ones(input_ids.shape[0])
         cur_len = input_ids.shape[-1]
 
         while True:
             # prepare model inputs
@@ -517,6 +517,10 @@
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                 break
 
+        # Release records
+        self.past_key_values = [None] * len(self.decoder.layers)
+        self.encoder_states = None
+
         return input_ids
 
     def multinomial_sample(
@@ -553,7 +557,7 @@
         stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
         logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
 
-        unfinished_sequences = flow.zeros(input_ids.shape[0]).fill_(1)
+        unfinished_sequences = flow.ones(input_ids.shape[0])
         cur_len = input_ids.shape[-1]
 
         while True:
@@ -612,6 +616,10 @@ def multinomial_sample(
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                 break
 
+        # Release records
+        self.past_key_values = [None] * len(self.decoder.layers)
+        self.encoder_states = None
+
         return input_ids
 
     def beam_search(
@@ -747,6 +755,10 @@ def beam_search(
             beam_indices=beam_indices,
         )
 
+        # Release records
+        self.past_key_values = [None] * len(self.decoder.layers)
+        self.encoder_states = None
+
        return sequence_outputs["sequences"]
 
     @flow.no_grad()
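
Note on the generation changes above: flow.bincount replaces the to_global/to_local/numpy round trip, so token frequencies are now computed directly on the tensor without leaving the device, and each search loop (greedy, multinomial, beam) now clears self.past_key_values and self.encoder_states on exit so cached activations are released between calls. A minimal sketch of the bincount-based diversity penalty; the toy values for prev_tokens, scores, and diversity_penalty are hypothetical stand-ins for the processor's real inputs:

    import oneflow as flow

    vocab_size = 8
    # tokens already emitted by earlier beam groups at this step (toy values)
    prev_tokens = flow.tensor([1, 3, 3, 5])
    # logits of the current beam group over the vocabulary (toy values)
    scores = flow.zeros(vocab_size)
    diversity_penalty = 0.5

    # count how often each vocab id appears among the previous groups' tokens
    token_frequency = flow.bincount(prev_tokens, minlength=vocab_size)
    # subtract the penalty so already-chosen tokens become less attractive
    scores = scores - diversity_penalty * token_frequency
    print(scores)  # token 3 (picked twice) is penalized twice as hard as 1 and 5
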
diff --git a/libai/inference/text_generation.py b/libai/inference/text_generation.py
index d003d63b8..96647f6f1 100644
--- a/libai/inference/text_generation.py
+++ b/libai/inference/text_generation.py
@@ -14,18 +14,10 @@
 # limitations under the License.
 
 from libai.inference.basic import BasePipeline
-from libai.tokenizer import T5Tokenizer
 from libai.utils import distributed as dist
 
 
 class TextGenerationPipeline(BasePipeline):
-    def build_tokenizer(self, cfg):
-        tokenizer = T5Tokenizer(
-            "data_test/t5_inference_model/spiece.model",
-            add_bos_token=True,
-        )
-        return tokenizer
-
     def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
         """load pretrained model.
 
@@ -48,12 +40,21 @@ def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
                 model_type="t5",
             )
             return model_loader.load()
-        else:
-            return super().load_pretrain_weight(
+        elif mode == "libai":
+            from projects.MT5.utils.mt5_loader import T5LoaderLibai
+
+            model_loader = T5LoaderLibai(
                 libai_cfg_model,
+                libai_cfg_model.cfg,
                 model_path,
-                mode=mode,
             )
+            return model_loader.load()
+        elif mode == "random":
+            from libai.engine import DefaultTrainer
+
+            return DefaultTrainer.build_model(self.cfg)
+        else:
+            raise NotImplementedError
 
     def _parse_parameters(self, **pipeline_parameters):
         preprocess_params = {}
@@ -82,8 +83,11 @@ def forward(self, encoder_input_dict, **kwargs) -> dict:
         return {"return_ids": outputs}
 
     def postprocess(self, model_output_dict, **kwargs) -> dict:
-        text = self.tokenizer.decode(model_output_dict["return_ids"][0], skip_special_tokens=True)
-        records = {"generated_text": text}
+        return_ids = model_output_dict["return_ids"]
+        records = [
+            {"generated_text": self.tokenizer.decode(return_ids[i], skip_special_tokens=True)}
+            for i in range(return_ids.size(0))
+        ]
         return records
 
 
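With postprocess now returning one record per generated sequence and load_pretrain_weight switching on an explicit mode ("huggingface", "libai", "random"), calling code would look roughly like the sketch below; the config path, checkpoint path, and constructor keywords are illustrative, not part of this patch:

    from libai.inference.text_generation import TextGenerationPipeline

    # mode selects the loader branch added above; "random" skips weight
    # loading entirely and just builds the model from the config.
    pipeline = TextGenerationPipeline(
        "projects/MT5/configs/t5_inference.py",
        model_path="/path/to/t5_checkpoint",
        mode="libai",
    )

    # postprocess now yields a list with one dict per sequence in the batch
    records = pipeline("translate English to German: The house is wonderful.")
    for record in records:
        print(record["generated_text"])
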
diff --git a/projects/MAE/modeling/mae.py b/projects/MAE/modeling/mae.py
index 221e9634b..c0ac7ca99 100644
--- a/projects/MAE/modeling/mae.py
+++ b/projects/MAE/modeling/mae.py
@@ -362,8 +362,8 @@ def forward_loss(self, imgs, pred, mask):
             target = (target - mean) / (var + 1.0e-6) ** 0.5
 
         loss = (pred - target) ** 2
-        # We want the prev loss to be calculated with float32,
-        # and mean/sum below to be calculated with float16.
+        # We want the prev loss to be calculated with float16,
+        # and mean/sum below to be calculated with float32.
         # this amp_white_identity will affect preceding ops to be float16
         loss = flow._C.amp_white_identity(loss)
         # this amp_black_identity will affect succeeding ops to be float32
diff --git a/projects/MT5/configs/t5_inference.py b/projects/MT5/configs/t5_inference.py
index 5f6ab7a84..528aca972 100644
--- a/projects/MT5/configs/t5_inference.py
+++ b/projects/MT5/configs/t5_inference.py
@@ -1,7 +1,9 @@
 from .mt5_base import cfg
 from libai.config import LazyCall
+from libai.tokenizer import T5Tokenizer
 from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
 from configs.common.train import train
+from configs.common.data.t5_dataset import tokenization
 
 cfg.update(
     model_type="t5",
@@ -38,3 +40,7 @@
 )
 
 model = LazyCall(MT5Model)(cfg=cfg)
+tokenization.tokenizer = LazyCall(T5Tokenizer)(
+    vocab_file="/path/to/spiece.model",
+    add_bos_token=True,
+)
diff --git a/projects/T5/configs/t5_model_config.py b/projects/T5/configs/t5_model_config.py
index 3fd2a3b66..50523f756 100644
--- a/projects/T5/configs/t5_model_config.py
+++ b/projects/T5/configs/t5_model_config.py
@@ -12,7 +12,6 @@
     attention_probs_dropout_prob=0.1,
     relative_attention_num_buckets=32,
     embedding_dropout_prob=0.1,
-    num_tokentypes=0,
     initializer_range=0.02,
     layernorm_eps=1e-5,
     amp_enabled=False,
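
The tokenizer is now built from the lazy config (tokenization.tokenizer) instead of a hard-coded path inside TextGenerationPipeline; /path/to/spiece.model is deliberately left as a placeholder to be overridden per deployment. A sketch of how a consumer might materialize it, assuming LiBai's detectron2-style LazyConfig.load and instantiate helpers:

    from libai.config import LazyConfig, instantiate

    cfg = LazyConfig.load("projects/MT5/configs/t5_inference.py")
    # override the placeholder vocab path before instantiating
    cfg.tokenization.tokenizer.vocab_file = "/data/t5/spiece.model"
    tokenizer = instantiate(cfg.tokenization.tokenizer)

    print(tokenizer.tokenize("Hello world"))
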