diff --git a/.github/workflows/py.yml b/.github/workflows/py.yml
index a94de2947..f297ab306 100644
--- a/.github/workflows/py.yml
+++ b/.github/workflows/py.yml
@@ -4,7 +4,7 @@ env:
   ONEFLOW_SRC: oneflow-src
 on:
   pull_request:
-    types: [review_requested]
+    types: [opened, review_requested, ready_for_review, synchronize, unlocked]
     branches:
       - "*"
   workflow_dispatch:
diff --git a/docs/source/tutorials/basics/Distributed_Configuration.md b/docs/source/tutorials/basics/Distributed_Configuration.md
index 7a722766f..77c4e997f 100644
--- a/docs/source/tutorials/basics/Distributed_Configuration.md
+++ b/docs/source/tutorials/basics/Distributed_Configuration.md
@@ -65,7 +65,7 @@ train.dist.pipeline_num_layers = model.cfg.hidden_layers
 - `train.dist.pipeline_num_layers` must be set consistent with the model layers. If unset, it will use the default value `1000`,
 which might trigger unexpected behavior.
 
-- For models which have been configured with pipeline parallelism(e.g., BERT, GPT-2, T5 and ViT), you can simply update the distributed config to execute pipeline parallel training on them. If you need to train your own model with pipeline parallel strategy, please refer to [Write Models](https://libai.readthedocs.io/en/latest/tutorials/basics/Write_Models.html) for more details about configuring your own model with pipeline parallelism.
+- For models that have already been configured with pipeline parallelism (e.g., BERT, GPT-2, T5 and ViT), you can simply update the distributed config to run pipeline-parallel training on them. If you need to train your own model with a pipeline-parallel strategy, please refer to [Customize Parallelism](https://libai.readthedocs.io/en/latest/tutorials/advanced_tutorials/customize_parallel.html#write-your-own-pipeline-parallel-model) for more details about configuring your own model with pipeline parallelism.
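+
+  A minimal sketch of such an update (reusing the `train.dist` keys shown above; the sizes are hypothetical and should match your hardware):
+
+  ```python
+  # 4 (data) x 1 (tensor) x 2 (pipeline) = 8 GPUs
+  train.dist.data_parallel_size = 4
+  train.dist.tensor_parallel_size = 1
+  train.dist.pipeline_parallel_size = 2
+  train.dist.pipeline_num_layers = model.cfg.hidden_layers
+  ```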
 
 #### **Data Parallel + Tensor Parallel for 2D Parallel Training on 8 GPUs**
 
diff --git a/libai/engine/default.py b/libai/engine/default.py
index 14a107167..1f7e6b815 100644
--- a/libai/engine/default.py
+++ b/libai/engine/default.py
@@ -508,8 +508,8 @@ def get_batch(
 
         if mixup_func is not None:
             images, labels = mixup_func(
-                data.get("images").tensor.cuda(),
-                data.get("labels").tensor.cuda(),
+                data.get("images").tensor.to(input_placement_device),
+                data.get("labels").tensor.to(input_placement_device),
             )
             data.get("images").tensor = images
             data.get("labels").tensor = labels
diff --git a/libai/evaluation/utils.py b/libai/evaluation/utils.py
index a1c36ca90..3664e2723 100644
--- a/libai/evaluation/utils.py
+++ b/libai/evaluation/utils.py
@@ -21,7 +21,7 @@
 from libai.utils import distributed as dist
 
 
-def pad_batch(x_dict, batch_size, last_batch_lack, is_last_batch):
+def pad_batch(x_dict, batch_size, last_batch_lack, is_last_batch, device="cuda"):
     x = list(x_dict.values())[0]
     tensor_batch = x.shape[0]
     assert tensor_batch <= batch_size
@@ -37,9 +37,9 @@ def pad_batch(x_dict, batch_size, last_batch_lack, is_last_batch):
     for key, xi in x_dict.items():
         pad_shape = (batch_size, *xi.shape[1:])
         local_xi = xi.to_global(
-            sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement("cuda")
+            sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement(device)
         ).to_local()
-        padded_xi = flow.zeros(pad_shape, dtype=xi.dtype, device="cuda")
+        padded_xi = flow.zeros(pad_shape, dtype=xi.dtype, device=device)
         padded_xi[:tensor_batch, ...] = padded_xi[:tensor_batch, ...] + local_xi
         for i in range(last_batch_lack - 1):
             start_idx = tensor_micro_batch_size * (data_parallel_size - i - 1) - 1
diff --git a/libai/inference/basic.py b/libai/inference/basic.py
index 94d3f1781..b869e56cc 100644
--- a/libai/inference/basic.py
+++ b/libai/inference/basic.py
@@ -15,6 +15,7 @@
 
 import logging
 from abc import ABCMeta, abstractmethod
+from pathlib import Path
 from typing import Any, Dict
 
 import oneflow as flow
@@ -43,6 +44,7 @@ def __init__(
         pipeline_num_layers=None,
         model_path=None,
         mode="libai",
+        device="cuda",
         **kwargs,
     ):
         # init cfg
@@ -60,10 +62,21 @@ def __init__(
             pipeline_stage_id,
             pipeline_num_layers,
         )
+        self.device = device
+        self.cfg.train.dist.device_type = device
         dist.setup_dist_util(self.cfg.train.dist)
         logger.info(self.cfg.train.dist)
 
         # initial and load model
+        self.model_path = model_path
+        if self.model_path is not None:
+            # If a model_path is provided in BasePipeline,
+            # we use it with priority, overwrite the pretrained_model_path in config
+            self.cfg.model.cfg.pretrained_model_path = self.model_path
+        else:
+            # If the model_path in BasePipeline is None, then use the one from the config
+            assert "pretrained_model_path" in self.cfg.model.cfg
+            self.model_path = self.cfg.model.cfg.pretrained_model_path
 
         self.model = self.load_pretrain_weight(self.cfg.model, model_path, mode=mode)
         self.model._apply(dist.convert_to_distributed_default_setting)
@@ -134,6 +147,13 @@ def load_pretrain_weight(
     def build_tokenizer(self, cfg):
         tokenizer = None
         if try_get_key(cfg, "tokenization") is not None:
+            tokenizer_cfg = cfg.tokenization.tokenizer
+            if "pretrained_model_path" not in tokenizer_cfg:
+                # If "pretrained_model_path" is not set in the tokenizer's config,
+                # default it to f"{model_path}/tokenizer.model"
+                tokenizer_cfg.pretrained_model_path = str(
+                    Path(self.model_path).joinpath("tokenizer.model")
+                )
             tokenizer = DefaultTrainer.build_tokenizer(cfg)
         return tokenizer
 
@@ -167,7 +187,9 @@ def to_local(self, model_outputs_dict):
         for key, value in model_outputs_dict.items():
             if isinstance(value, flow.Tensor) and value.is_global:
                 model_outputs_dict[key] = dist.ttol(
-                    value, ranks=[0] if value.placement.ranks.ndim == 1 else [[0]]
+                    value,
+                    device=self.device,
+                    ranks=[0] if value.placement.ranks.ndim == 1 else [[0]],
                 )
         if flow.cuda.is_available():
             dist.synchronize()
diff --git a/libai/inference/generator/generation_beam_search.py b/libai/inference/generator/generation_beam_search.py
index 5fb24f5b5..e43217f94 100644
--- a/libai/inference/generator/generation_beam_search.py
+++ b/libai/inference/generator/generation_beam_search.py
@@ -96,6 +96,7 @@ def __init__(
         do_early_stopping: Optional[bool] = False,
         num_beam_hyps_to_keep: Optional[int] = 1,
         num_beam_groups: Optional[int] = 1,
+        device: Optional[str] = "cuda",
         **kwargs,
     ):
         self.num_beams = num_beams
@@ -119,7 +120,7 @@ def __init__(
             [False for _ in range(batch_size)],
             dtype=flow.bool,
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(device, list(range(dist.get_world_size()))),
         )
 
         if not isinstance(num_beams, int) or num_beams <= 1:
@@ -159,6 +160,7 @@ def process(
         pad_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = None,
         beam_indices: Optional[flow.Tensor] = None,
+        device: Optional[str] = "cuda",
     ) -> Tuple[flow.Tensor]:
         cur_len = input_ids.shape[-1]
         batch_size = len(self._beam_hyps)
@@ -177,19 +179,19 @@ def process(
             (batch_size, self.group_size),
             dtype=next_scores.dtype,
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(device, list(range(dist.get_world_size()))),
         )
         next_beam_tokens = flow.zeros(
             (batch_size, self.group_size),
             dtype=next_tokens.dtype,
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(device, list(range(dist.get_world_size()))),
         )
         next_beam_indices = flow.zeros(
             (batch_size, self.group_size),
             dtype=next_indices.dtype,
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(device, list(range(dist.get_world_size()))),
         )
 
         for batch_idx, beam_hyp in enumerate(self._beam_hyps):
@@ -274,6 +276,7 @@ def finalize(
         pad_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = None,
         beam_indices: Optional[flow.Tensor] = None,
+        device: Optional[str] = "cuda",
     ):
         batch_size = len(self._beam_hyps)
         # finalize all open beam hypotheses and add to generated hypotheses
@@ -303,7 +306,7 @@ def finalize(
             batch_size * self.num_beam_hyps_to_keep,
             dtype=flow.float32,
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(device, list(range(dist.get_world_size()))),
         )
 
         # retrieve best hypotheses
diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py
index 3b3a94adc..31aa82c76 100644
--- a/libai/inference/generator/generation_utils.py
+++ b/libai/inference/generator/generation_utils.py
@@ -526,8 +526,8 @@ def greedy_search(
 
             # if eos_token was found in one sentence, set sentence to finished
             if eos_token_id is not None:
-                unfinished_sequences = flow.mul(
-                    unfinished_sequences, (next_tokens != eos_token_id).long()
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.ne(eos_token_id).prod(dim=0)
                 )
 
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
diff --git a/libai/models/gpt_model.py b/libai/models/gpt_model.py
index 27f6bc8e9..ac11cc416 100644
--- a/libai/models/gpt_model.py
+++ b/libai/models/gpt_model.py
@@ -244,7 +244,9 @@ def forward(self, input_ids, past_length=0):
         bsz, seq_length = input_ids.size()
 
         position_ids = self.position_ids[:, past_length : past_length + seq_length]
-        position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
+        position_ids = position_ids.expand_as(input_ids).to_global(
+            sbp=input_ids.sbp, placement=input_ids.placement
+        )
 
         token_embeds = self.token_embeddings(input_ids)
         position_embeds = self.position_embeddings(position_ids)
diff --git a/libai/models/utils/graph_base.py b/libai/models/utils/graph_base.py
index 651209ccd..316a70ed7 100644
--- a/libai/models/utils/graph_base.py
+++ b/libai/models/utils/graph_base.py
@@ -39,12 +39,14 @@ def __init__(
         is_train=True,
         auto_parallel_conf=None,
         global_mode=None,
+        device="cuda",
     ):
         super().__init__()
 
         self.model = model
         self.is_train = is_train
         self.global_mode = global_mode
+        self.device = device
 
         if is_train:
             self.add_optimizer(optimizer, lr_sch=lr_scheduler)
@@ -103,7 +105,7 @@ def build(self, **kwargs):
         if self.is_train:
             placement_sbp_dict = (
                 dict(
-                    placement=flow.env.all_device_placement("cuda"),
+                    placement=flow.env.all_device_placement(self.device),
                     sbp=flow.sbp.split(0),
                 )
                 if self.global_mode.enabled
diff --git a/libai/models/utils/model_loader/base_loader.py b/libai/models/utils/model_loader/base_loader.py
index 8cf275539..5222e9f06 100644
--- a/libai/models/utils/model_loader/base_loader.py
+++ b/libai/models/utils/model_loader/base_loader.py
@@ -20,6 +20,6 @@
 
 import omegaconf
 import oneflow as flow
+from safetensors import safe_open
 from termcolor import colored
-from safetensors import safe_open
 
@@ -384,6 +385,11 @@ def _convert_tensor(self, tensor):
         Returns:
             flow.Tensor: The target tensor.
         """
+        import torch
+
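+        # numpy has no bfloat16 dtype, so cast bf16 weights to fp16 before the numpy round-trip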
+        if tensor.dtype == torch.bfloat16:
+            data = tensor.detach().half().cpu().numpy()
+            return flow.Tensor(data)
         return flow.Tensor(tensor.detach().cpu().numpy())
 
     def _convert_tensors(self, torch_state_dict):
diff --git a/libai/tokenizer/tokenization_base.py b/libai/tokenizer/tokenization_base.py
index 026902fdf..e5e5f121d 100644
--- a/libai/tokenizer/tokenization_base.py
+++ b/libai/tokenizer/tokenization_base.py
@@ -774,7 +774,9 @@ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, Lis
             ids.append(self._convert_token_to_id_with_added_voc(token))
         return ids
 
-    def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **kwargs):
+    def convert_to_tensors(
+        self, token_ids, return_tensors=None, is_global=False, device="cuda", **kwargs
+    ):
         if return_tensors is None:
             return_token_ids = token_ids
         elif return_tensors == "of":
@@ -783,7 +785,7 @@ def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **
             elif is_global:
                 sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
                 placement = kwargs.get(
-                    "placement", flow.placement("cuda", list(range(dist.get_world_size())))
+                    "placement", flow.placement(device, list(range(dist.get_world_size())))
                 )
                 return_token_ids = flow.tensor(
                     token_ids, sbp=sbp, placement=placement, dtype=flow.long
@@ -803,14 +805,18 @@ def _convert_token_to_id_with_added_voc(self, token):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def encode(self, text, return_tensors=None, is_global=False, **kwargs):
+    def encode(self, text, return_tensors=None, is_global=False, device="cuda", **kwargs):
         if isinstance(text, str):
             tokens = self.tokenize(text)
             token_ids = self.convert_tokens_to_ids(tokens)
             if hasattr(self, "build_inputs_with_special_tokens"):
                 token_ids = self.build_inputs_with_special_tokens(token_ids)
             token_ids = self.convert_to_tensors(
-                token_ids, return_tensors=return_tensors, is_global=is_global, **kwargs
+                token_ids,
+                return_tensors=return_tensors,
+                is_global=is_global,
+                device=device,
+                **kwargs,
             )
             return token_ids
         elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py
index e7914a0ad..f64479210 100644
--- a/libai/utils/distributed.py
+++ b/libai/utils/distributed.py
@@ -72,6 +72,18 @@ def _init_distributed_env(self, cfg):
 
         # Add set device type
         self._device_type = try_get_key(cfg, "device_type", default="cuda")
+        if self._device_type == "npu":
+            try:
+                import oneflow_npu  # noqa: F401
+            except ImportError:
+                raise ImportError("'oneflow_npu' is missing. Install it to use NPU devices.")
+        elif self._device_type == "xpu":
+            try:
+                import oneflow_xpu  # noqa: F401
+            except ImportError:
+                raise ImportError("'oneflow_xpu' is missing. Install it to use XPU devices.")
+        elif self._device_type not in ("cuda", "npu", "xpu", "cpu"):
+            raise NotImplementedError(f"Unsupported device {self._device_type}")
 
     def _init_parallel_size(self, cfg):
 
@@ -228,7 +240,7 @@ def device_type(self):
         return self._device_type
 
     def set_device_type(self, device_type):
-        assert device_type in ["cpu", "cuda"], f"not supported for {device_type}"
+        assert device_type in ("cpu", "cuda", "npu", "xpu"), f"not supported for {device_type}"
         self._device_type = device_type
 
     def get_layer_ranks(self, layer_idx):
@@ -435,10 +447,10 @@ def convert_to_distributed_default_setting(t):
         return t.to_global(placement=flow.placement(device_type, ranks=t.placement.ranks))
 
 
-def ttol(tensor, pure_local=False, ranks=None):
+def ttol(tensor, pure_local=False, device="cuda", ranks=None):
     """Global tensor to local tensor."""
     if tensor.is_global:
-        placement = tensor.placement if not ranks else flow.placement("cuda", ranks)
+        placement = tensor.placement if not ranks else flow.placement(device, ranks)
         if pure_local:
             tensor = tensor.to_global(placement=placement).to_local()
         else:
@@ -459,7 +471,7 @@ def tton(tensor, local_only=False, ranks=None):
 
 def tensor_to_rank0(tensor, device="cuda", to_local=False):
     """Global tensor to rank0."""
-    assert device in ["cpu", "cuda"], f"not supported for device:{device}"
+    assert device in ("cpu", "cuda", "npu", "xpu"), f"not supported for device:{device}"
     if tensor.is_global:
         # Consider if it's 2d mesh, ranks should be [[0]] instead of [0]
         placement = flow.placement(device, ranks=[0] if tensor.placement.ranks.ndim == 1 else [[0]])
diff --git a/projects/Aquila/README.md b/projects/Aquila/README.md
new file mode 100644
index 000000000..07a9f0dd2
--- /dev/null
+++ b/projects/Aquila/README.md
@@ -0,0 +1,58 @@
+## Aquila
+
+### Inference
+- cuda
+
+```bash
+python projects/Aquila/pipeline.py --model_path=/root/models/Aquila-7B --mode=huggingface
+```
+
+- npu
+
+```bash
+python projects/Aquila/pipeline.py --model_path=/data0/hf_models/Aquila-7B --mode=huggingface --device=npu
+```
+
+- xpu
+
+```bash
+python projects/Aquila/pipeline.py --model_path=/root/models/Aquila-7B --mode=huggingface --device=xpu
+```
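+
+The CLI flags above map directly to the arguments of `TextGenerationPipeline` in `projects/Aquila/pipeline.py`. A minimal sketch of the equivalent Python call (the model path is a placeholder):
+
+```python
+from libai.utils import distributed as dist
+from projects.Aquila.pipeline import TextGenerationPipeline
+
+pipeline = TextGenerationPipeline(
+    "projects/Aquila/configs/aquila_config.py",
+    data_parallel=1,
+    tensor_parallel=1,
+    pipeline_parallel=1,
+    pipeline_num_layers=32,
+    model_path="/root/models/Aquila-7B",  # placeholder path
+    mode="huggingface",
+    device="cuda",
+)
+
+output = pipeline(inputs=["Give three tips for staying healthy."])
+if dist.is_main_process():
+    print(output)
+```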
+
+### Training
+- data preparation
+```bash
+python projects/Aquila/utils/data_prepare.py
+```
+- cuda
+```bash
+export NUM_GPUS=4
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node ${NUM_GPUS} \
+    --nnodes 1 \
+    --node_rank 0 \
+    --master_addr 127.0.0.1 \
+    --master_port 12345 \
+        tools/train_net.py --config-file=projects/Aquila/configs/aquila_sft.py \
+            graph.enabled=True \
+            train.input_placement_device="cuda" \
+            train.dist.device_type="cuda" \
+            train.dist.pipeline_parallel_size=${NUM_GPUS}
+```
+
+- xpu
+```bash
+export NUM_GPUS=1
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node ${NUM_GPUS} \
+    --nnodes 1 \
+    --node_rank 0 \
+    --master_addr 127.0.0.1 \
+    --master_port 12345 \
+        tools/train_net.py --config-file=projects/Aquila/configs/aquila_sft.py \
+            graph.enabled=False \
+            train.input_placement_device="xpu" \
+            train.dist.device_type="xpu" \
+            train.dist.pipeline_parallel_size=${NUM_GPUS}
+```
diff --git a/projects/Aquila/aquila.py b/projects/Aquila/aquila.py
new file mode 100644
index 000000000..1d73e7ff9
--- /dev/null
+++ b/projects/Aquila/aquila.py
@@ -0,0 +1,653 @@
+# coding=utf-8
+# Copyright 2021 The OneFlow Authors. All rights reserved.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Tuple
+
+import oneflow as flow
+from oneflow import nn
+
+from libai.config import configurable
+from libai.inference.generator.generation_utils import Generator
+from libai.layers import Linear, RMSLayerNorm, VocabEmbedding
+from libai.layers.attention import AttnMaskType
+from libai.models.utils import init_method_normal, scaled_init_method_normal
+from libai.utils import distributed as dist
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return flow.cat((-x2, x1), dim=-1)
+
+
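+# Rotary position embedding (RoPE) helpers: rotate the query/key feature pairs by
+# position-dependent angles taken from precomputed cos/sin caches before attention.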
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    cos = cos[position_ids].unsqueeze(1)
+    sin = sin[position_ids].unsqueeze(1)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+
+    def forward(self, x, seq_len=None, cos_cached=None, sin_cached=None):
+        if seq_len > self.max_position_embeddings:
+            raise ValueError(
+                f"The maximum supported length is {self.max_position_embeddings}, "
+                f"and the current length is {seq_len}."
+            )
+
+        return (
+            cos_cached[:seq_len].to_global(placement=x.placement),
+            sin_cached[:seq_len].to_global(placement=x.placement),
+        )
+
+
+class MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        intermediate_size,
+        init_method=nn.init.xavier_normal_,
+        output_layer_init_method=None,
+        *,
+        layer_idx=0,
+    ):
+        super().__init__()
+
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+
+        self.gate_proj = Linear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            parallel="col",
+            init_method=init_method,
+            layer_idx=layer_idx,
+        )
+
+        self.up_proj = Linear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            parallel="col",
+            init_method=init_method,
+            layer_idx=layer_idx,
+        )
+
+        self.down_proj = Linear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            parallel="row",
+            init_method=output_layer_init_method,
+            layer_idx=layer_idx,
+        )
+
+        self.activation_func = nn.SiLU()
+
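+    # SwiGLU-style feed-forward: down_proj(SiLU(gate_proj(x)) * up_proj(x))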
+    def forward(self, hidden_states):
+        gate_out = self.activation_func(self.gate_proj(hidden_states))
+        up_out = self.up_proj(hidden_states)
+        output = self.down_proj(gate_out * up_out)
+        return output
+
+
+class MultiheadAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        num_attention_heads,
+        max_position_embeddings,
+        init_method=nn.init.xavier_normal_,
+        output_layer_init_method=None,
+        scale_mask_softmax_fusion=False,
+        attn_mask_type=AttnMaskType.padding,
+        *,
+        layer_idx=0,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+
+        self.num_heads = num_attention_heads
+        self.head_size = hidden_size // num_attention_heads
+        self.attn_mask_type = attn_mask_type
+
+        self.norm_factor = 1.0 / math.sqrt(float(self.head_size))
+
+        self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
+
+        self.query_key_value = Linear(
+            self.hidden_size,
+            self.hidden_size * 3,
+            bias=False,
+            parallel="col",
+            init_method=init_method,
+            layer_idx=layer_idx,
+        )
+
+        self.o_proj = Linear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=False,
+            parallel="row",
+            init_method=output_layer_init_method,
+            layer_idx=layer_idx,
+        )
+
+        self.coeff = None
+
+        rotary_dim = self.head_size
+        self.rotary_embed = RotaryEmbedding(
+            dim=rotary_dim,
+            max_position_embeddings=max_position_embeddings,
+        )
+
+    def forward(
+        self,
+        hidden_states: flow.Tensor,
+        encoder_states: flow.Tensor = None,
+        attention_mask: flow.Tensor = None,
+        position_ids=None,
+        past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
+        cos_cached: flow.Tensor = None,
+        sin_cached: flow.Tensor = None,
+        use_cache: bool = False,
+    ):
+        if encoder_states is not None:
+            encoder_states = encoder_states.to_global(placement=hidden_states.placement)
+
+        if attention_mask is not None:
+            attention_mask = attention_mask.to_global(placement=hidden_states.placement)
+
+        bsz, tgt_len = hidden_states.size()[:2]
+
+        query_key_value = self.query_key_value(hidden_states)
+        query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
+        query_key_value = query_key_value.permute(
+            0, 2, 1, 3
+        )  # [bsz, num_heads, src_len, 3 * head_size]
+        query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
+
+        kv_seq_len = key.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_embed(
+            value, seq_len=kv_seq_len, cos_cached=cos_cached, sin_cached=sin_cached
+        )
+        query, key = apply_rotary_pos_emb(query, key, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            past_key, past_value = past_key_value
+            key = flow.cat((past_key.type_as(key), key), dim=2)
+            value = flow.cat((past_value.type_as(value), value), dim=2)
+
+        # query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size]
+        if use_cache:
+            past_key_value = (key, value)
+
+        # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)]
+        attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor)
+        attention_weights = attention_scores + attention_mask
+
+        attention_weights = flow.softmax(attention_weights, dim=-1)
+        # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)]
+        context = flow.matmul(attention_weights, value)
+
+        # Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size]
+        context = context.transpose(1, 2)
+        output = self.o_proj(context.flatten(2))
+
+        if use_cache:
+            output = (output, past_key_value)
+
+        return output
+
+
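+# Additive causal attention mask: positions above the diagonal are filled with the dtype's
+# minimum value, so they are suppressed when the mask is added to the attention scores.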
+class AquilaCasualMask(nn.Module):
+    def __init__(self, max_positions=1024, dtype=flow.float16, *, layer_idx=0):
+        super().__init__()
+        self.dtype = dtype
+        self.mask = flow.full(
+            (max_positions, max_positions),
+            flow.finfo(dtype).min,
+            placement=dist.get_layer_placement(layer_idx),
+            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
+        )
+        mask_cond = flow.arange(
+            self.mask.size(-1),
+            placement=dist.get_layer_placement(layer_idx),
+            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
+        )
+        self.mask.masked_fill_(mask_cond < (mask_cond + 1).view(self.mask.size(-1), 1), 0)
+        self.mask = self.mask.to(dtype)
+
+    def forward(self, input_ids, past_length=0, attention_mask=None, input_dtype=None):
+        bsz, tgt_len = input_ids.size()
+        casual_mask = self.mask[:tgt_len, :tgt_len]
+        if past_length > 0:
+            # when past_key_values are used, prepend zeros so the cached prefix stays visible
+            casual_mask = flow.cat(
+                [flow.zeros(tgt_len, past_length, dtype=self.dtype), casual_mask], dim=-1
+            )
+        casual_mask = (
+            casual_mask.unsqueeze(0).unsqueeze(1).expand(bsz, 1, tgt_len, tgt_len + past_length)
+        )
+        casual_mask = casual_mask.to_global(sbp=input_ids.sbp)
+        if attention_mask is not None:
+            bsz, src_len = attention_mask.size()
+            attention_mask = (
+                attention_mask[:, None, None, :]
+                .expand(bsz, 1, tgt_len, src_len)
+                .to(casual_mask.dtype)
+            )
+            inverted_attention_mask = 1.0 - attention_mask
+            inverted_attention_mask.masked_fill(
+                inverted_attention_mask.to(flow.bool), flow.finfo(casual_mask.dtype).min
+            )
+            inverted_attention_mask = inverted_attention_mask.to_global(
+                placement=casual_mask.placement
+            )
+            casual_mask = casual_mask + inverted_attention_mask
+        if input_dtype is not None:
+            casual_mask = casual_mask.to(input_dtype)
+        return casual_mask
+
+
+class AquilaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        intermediate_size,
+        num_attention_heads,
+        is_decoder=False,
+        rms_norm_eps=1e-5,
+        max_position_embeddings=None,
+        init_method=nn.init.xavier_normal_,
+        output_layer_init_method=None,
+        scale_mask_softmax_fusion=False,
+        attn_mask_type=AttnMaskType.padding,
+        *,
+        layer_idx=0,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.rms_norm_eps = rms_norm_eps
+        self.max_position_embeddings = max_position_embeddings
+        self.attn_mask_type = attn_mask_type
+
+        self.layer_idx = layer_idx
+        self.is_decoder = is_decoder
+
+        self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
+
+        self.init_method = init_method
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+        self.output_layer_init_method = output_layer_init_method
+
+        self.input_layernorm = RMSLayerNorm(
+            self.hidden_size, eps=self.rms_norm_eps, layer_idx=self.layer_idx
+        )
+
+        self.self_attn = self.build_attention()
+        self.post_attention_layernorm = RMSLayerNorm(
+            self.hidden_size, eps=self.rms_norm_eps, layer_idx=self.layer_idx
+        )
+
+        self.mlp = MLP(
+            self.hidden_size,
+            self.intermediate_size,
+            self.init_method,
+            output_layer_init_method=self.output_layer_init_method,
+            layer_idx=self.layer_idx,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        past_key_value=None,
+        cos_cached=None,
+        sin_cached=None,
+        use_cache=False,
+    ):
+        hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx))
+
+        # hidden_states shape: (batch_size, seq_length, hidden_size)
+        if attention_mask is not None:
+            attention_mask = attention_mask.to_global(
+                placement=dist.get_layer_placement(self.layer_idx)
+            )
+
+        if past_key_value is not None:
+            if self.is_decoder:
+                assert len(past_key_value) == 4
+                self_attn_past_key_value = past_key_value[:2]
+            else:
+                self_attn_past_key_value = past_key_value
+        else:
+            self_attn_past_key_value = None
+
+        layernorm_output = self.input_layernorm(hidden_states)
+        attention_output = self.self_attn(
+            layernorm_output,
+            attention_mask=attention_mask,
+            past_key_value=self_attn_past_key_value,
+            cos_cached=cos_cached,
+            sin_cached=sin_cached,
+            use_cache=use_cache,
+        )
+
+        if use_cache:
+            attention_output, presents = attention_output
+
+        hidden_states = hidden_states + attention_output
+
+        layernorm_output = self.post_attention_layernorm(hidden_states)
+
+        mlp_output = self.mlp(layernorm_output)
+
+        output = hidden_states + mlp_output
+
+        if use_cache:
+            output = (output, presents)
+        return output
+
+    def build_attention(self):
+        return MultiheadAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.max_position_embeddings,
+            init_method=self.init_method,
+            output_layer_init_method=self.output_layer_init_method,
+            scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
+            attn_mask_type=self.attn_mask_type,
+            layer_idx=self.layer_idx,
+        )
+
+
+class AquilaModel(nn.Module):
+    def __init__(
+        self,
+        hidden_layers,
+        vocab_size,
+        hidden_size,
+        intermediate_size,
+        num_attention_heads,
+        max_position_embeddings=1024,
+        rms_norm_eps=1e-5,
+        initializer_range=0.02,
+        use_scaled_init_for_output_weights=True,
+        scale_mask_softmax_fusion=False,
+        amp_enabled=False,
+    ):
+        super().__init__()
+        init_method = init_method_normal(sigma=initializer_range)
+        if use_scaled_init_for_output_weights:
+            output_layer_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
+        else:
+            output_layer_init_method = init_method
+
+        self.embed_tokens = VocabEmbedding(
+            vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
+        )
+        self.layers = nn.ModuleList(
+            [
+                AquilaDecoderLayer(
+                    hidden_size,
+                    intermediate_size,
+                    num_attention_heads,
+                    rms_norm_eps=rms_norm_eps,
+                    max_position_embeddings=max_position_embeddings,
+                    init_method=init_method,
+                    output_layer_init_method=output_layer_init_method,
+                    scale_mask_softmax_fusion=scale_mask_softmax_fusion,
+                    attn_mask_type=AttnMaskType.causal,
+                    layer_idx=i,
+                )
+                for i in range(hidden_layers)
+            ]
+        )
+        self.norm = RMSLayerNorm(hidden_size, eps=rms_norm_eps, layer_idx=-1)
+
+        self._set_cos_sin_cache(
+            rotary_dim=hidden_size // num_attention_heads,
+            seq_len=max_position_embeddings,
+            dtype=flow.float32,
+            layer_idx=0,
+        )
+
+    def _set_cos_sin_cache(self, rotary_dim, seq_len, base=10000, dtype=None, layer_idx=0):
+        position = flow.arange(
+            0,
+            rotary_dim,
+            2,
+            dtype=dtype,
+            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
+            placement=dist.get_layer_placement(layer_idx),
+        )
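+        # standard RoPE inverse frequencies: 1 / base^(2i / rotary_dim)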
+        inv_freq = 1.0 / (base ** (position / rotary_dim))
+
+        t = flow.arange(
+            seq_len,
+            dtype=inv_freq.dtype,
+            sbp=inv_freq.sbp,
+            placement=inv_freq.placement,
+        )
+
+        freqs = flow.einsum("i,j->ij", t, inv_freq)
+        emb = flow.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype))
+        self.register_buffer("sin_cached", emb.sin().to(dtype))
+
+    def forward(
+        self,
+        input_ids,
+        attention_mask=None,
+        past_key_values=None,
+        use_cache=False,
+        set_cache=None,
+    ):
+        if use_cache:
+            presents = []
+        input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
+        hidden_states = self.embed_tokens(input_ids)
+
+        for layer, past_key_value in zip(self.layers, past_key_values):
+            hidden_states = layer(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                past_key_value=past_key_value,
+                cos_cached=self.cos_cached,
+                sin_cached=self.sin_cached,
+                use_cache=use_cache,
+            )
+            if use_cache:
+                hidden_states, present = hidden_states
+                presents.append(present)
+
+        hidden_states = self.norm(hidden_states)
+
+        if use_cache:
+            set_cache(presents)
+
+        return hidden_states
+
+
+class CrossEntropyLoss(nn.Module):
+    def forward(self, logits: flow.Tensor, target: flow.Tensor):
+        assert logits.ndim == 3
+        assert target.ndim == 2
+        assert logits.shape[0:2] == target.shape
+
+        target = target.to_global(placement=logits.placement)
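+        # clamp negative (ignored) label ids to 0 so they are dropped by ignore_index=0 below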
+        target = target * (target >= 0)
+
+        lm_loss = flow._C.cross_entropy(
+            logits.view(-1, logits.shape[-1]), target.view(-1), ignore_index=0
+        )
+        return lm_loss
+
+
+class SFTLoss(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.lm_loss = CrossEntropyLoss()
+
+    def forward(self, logits, lm_labels):
+        lm_loss = self.lm_loss(logits, lm_labels)
+        lm_loss = lm_loss.mean()
+        return {"lm_loss": lm_loss}
+
+
+class AquilaForCausalLM(nn.Module, Generator):
+    @configurable
+    def __init__(
+        self,
+        hidden_layers,
+        vocab_size,
+        hidden_size,
+        intermediate_size,
+        num_attention_heads,
+        max_position_embeddings=1024,
+        rms_norm_eps=1e-5,
+        initializer_range=0.02,
+        use_scaled_init_for_output_weights=True,
+        scale_mask_softmax_fusion=False,
+        amp_enabled=False,
+        cfg=None,
+    ):
+        super().__init__()
+        self.cfg = cfg
+        self.model = AquilaModel(
+            hidden_layers=hidden_layers,
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            num_attention_heads=num_attention_heads,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=rms_norm_eps,
+            initializer_range=initializer_range,
+            use_scaled_init_for_output_weights=use_scaled_init_for_output_weights,
+            scale_mask_softmax_fusion=scale_mask_softmax_fusion,
+            amp_enabled=amp_enabled,
+        )
+        self.casual_mask = AquilaCasualMask(max_position_embeddings, layer_idx=0)
+        self.lm_head = Linear(hidden_size, vocab_size, bias=False, layer_idx=-1)
+        self.loss_func = SFTLoss()
+
+        self.past_key_values = [None] * hidden_layers
+        self.past_length = 0
+
+    def forward(self, input_ids, attention_mask=None, labels=None, use_cache=False):
+        input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
+        attention_mask = (
+            attention_mask.to_global(placement=dist.get_layer_placement(0))
+            if attention_mask is not None
+            else attention_mask
+        )
+        labels = (
+            labels.to_global(placement=dist.get_layer_placement(0))
+            if labels is not None
+            else labels
+        )
+
+        if use_cache and self.past_key_values[0] is not None:
+            self.past_length = self.past_key_values[0][0].size(-2)
+        else:
+            self.past_length = 0
+
+        mask = self.casual_mask(
+            input_ids,
+            past_length=self.past_length,
+            attention_mask=attention_mask,
+            input_dtype=self.lm_head.weight.dtype,
+        )
+
+        output = self.model(
+            input_ids,
+            attention_mask=mask,
+            past_key_values=self.past_key_values,
+            use_cache=use_cache,
+            set_cache=self.set_cache,
+        )
+
+        logits = self.lm_head(output)
+
+        if labels is not None:
+            lm_loss = self.loss_func(logits, labels)
+            return lm_loss
+        else:
+            return {"logits": logits}
+
+    def set_cache(self, past_key_values):
+        self.past_length = 0 if past_key_values is None else past_key_values[0][0].shape[2]
+
+        if past_key_values is None:
+            past_key_values = [None] * self.cfg.hidden_layers
+
+        assert len(past_key_values) == self.cfg.hidden_layers, (
+            f"past_key_values' length {len(past_key_values)} doesn't match "
+            f"num_layers: {self.cfg.hidden_layers}"
+        )
+
+    def prepare_inputs_for_generation(self, input_ids: flow.Tensor, **kwargs):
+        attention_mask = None
+        if "attention_mask" in kwargs:
+            # turn the 0/1 padding mask into an additive mask (0 keep, min-float for padding)
+            attention_mask = kwargs.pop("attention_mask").float()
+            attention_mask = attention_mask - 1
+            attention_mask.masked_fill_(attention_mask == -1, flow.finfo(flow.float32).min)
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+    @classmethod
+    def from_config(cls, cfg):
+        return {
+            "hidden_layers": cfg.hidden_layers,
+            "vocab_size": cfg.vocab_size,
+            "hidden_size": cfg.hidden_size,
+            "intermediate_size": cfg.intermediate_size,
+            "num_attention_heads": cfg.num_attention_heads,
+            "max_position_embeddings": cfg.max_position_embeddings,
+            "rms_norm_eps": cfg.rms_norm_eps,
+            "initializer_range": cfg.initializer_range,
+            "use_scaled_init_for_output_weights": cfg.use_scaled_init_for_output_weights,
+            "scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
+            "amp_enabled": cfg.amp_enabled,
+            "cfg": cfg,
+        }
+
+    @staticmethod
+    def set_activation_checkpoint(model):
+        for module_block in model.modules():
+            # Old API in OneFlow 0.8
+            if hasattr(module_block, "origin"):
+                if isinstance(module_block.origin, AquilaDecoderLayer):
+                    module_block.config.activation_checkpointing = True
+            else:
+                if isinstance(module_block.to(nn.Module), AquilaDecoderLayer):
+                    module_block.to(nn.graph.GraphModule).activation_checkpointing = True
diff --git a/projects/Aquila/aquila_dataset.py b/projects/Aquila/aquila_dataset.py
new file mode 100644
index 000000000..536a4325e
--- /dev/null
+++ b/projects/Aquila/aquila_dataset.py
@@ -0,0 +1,19 @@
+import oneflow as flow
+from oneflow.utils.data import Dataset
+
+from libai.data.structures import DistTensorData, Instance
+
+
+class AquilaDataset(Dataset):
+    def __init__(self, path, tokenizer):
+        self.data = flow.load(path)
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        return Instance(
+            input_ids=DistTensorData(self.data[index]["input_ids"]),
+            labels=DistTensorData(self.data[index]["labels"]),
+        )
diff --git a/projects/Aquila/configs/aquila_config.py b/projects/Aquila/configs/aquila_config.py
new file mode 100644
index 000000000..442d7c484
--- /dev/null
+++ b/projects/Aquila/configs/aquila_config.py
@@ -0,0 +1,63 @@
+from omegaconf import DictConfig, OmegaConf
+
+from libai.config import LazyCall
+from projects.Aquila.aquila import AquilaForCausalLM
+from projects.Aquila.tokenizer import AquilaTokenizer
+from configs.common.train import train
+
+
+cfg = dict(
+    # Model
+    model_type="aquila",
+    hidden_act="silu",
+    hidden_size=4096,
+    initializer_range=0.02,
+    intermediate_size=11008,
+    max_position_embeddings=2048,
+    num_attention_heads=32,
+    hidden_layers=32,
+    pretraining_tp=1,
+    rms_norm_eps=1e-06,
+    rope_scaling=None,
+    tie_word_embeddings=False,
+    vocab_size=100008,
+    use_scaled_init_for_output_weights=False,
+    scale_mask_softmax_fusion=False,
+    amp_enabled=True,
+    # Inference
+    is_encoder_decoder=False,
+    max_length=256,
+    min_length=0,
+    do_sample=False,
+    early_stopping=False,
+    num_beams=1,
+    num_beam_groups=1,
+    diversity_penalty=0.0,
+    temperature=0.9,
+    top_k=50,
+    top_p=0.6,
+    typical_p=1.0,
+    repetition_penalty=1.0,
+    length_penalty=1.0,
+    no_repeat_ngram_size=0,
+    encoder_no_repeat_ngram_size=0,
+    num_return_sequences=1,
+    chunk_size_feed_forward=0,
+    output_scores=False,
+    use_cache=True,
+    bos_token_id=1,
+    eos_token_id=2,
+    pad_token_id=0,
+    # train
+    pretrained_model_path="/root/models/Aquila-7B",
+)
+
+cfg = DictConfig(cfg)
+
+model = LazyCall(AquilaForCausalLM)(cfg=cfg)
+tokenization = OmegaConf.create()
+tokenization.make_vocab_size_divisible_by = 1
+tokenization.tokenizer = LazyCall(AquilaTokenizer)(
+    # vocab_file=cfg.pretrained_model_path+"/vocab.json",
+    # merges_file=cfg.pretrained_model_path+"/merges.txt",
+)
diff --git a/projects/Aquila/configs/aquila_sft.py b/projects/Aquila/configs/aquila_sft.py
new file mode 100644
index 000000000..3665d75b6
--- /dev/null
+++ b/projects/Aquila/configs/aquila_sft.py
@@ -0,0 +1,102 @@
+import os
+from omegaconf import OmegaConf
+
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from libai.scheduler import WarmupExponentialLR
+from libai.data.build import build_nlp_train_loader, build_nlp_test_loader
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+from configs.common.optim import optim
+
+from projects.Aquila.aquila import AquilaForCausalLM
+from projects.Aquila.tokenizer import AquilaTokenizer
+from projects.Aquila.configs.aquila_config import cfg
+from projects.Aquila.aquila_dataset import AquilaDataset
+
+
+# Hyperparameters
+weight_decay = 0.1
+learning_rate = 5e-5
+dataset_path = "./alpaca_data"
+pretrained_model_path = "/root/models/Aquila-7B"
+
+# graph & optim
+graph["enabled"] = False
+optim.update(
+    dict(
+        lr=learning_rate,
+        weight_decay=weight_decay,
+    )
+)
+
+# tokenize
+tokenization = OmegaConf.create()
+tokenization.make_vocab_size_divisible_by = 1
+tokenization.tokenizer = LazyCall(AquilaTokenizer)(
+    vocab_file=pretrained_model_path + "/vocab.json",
+    merges_file=pretrained_model_path + "/merges.txt",
+)
+
+
+# model
+cfg.pretrained_model_path = pretrained_model_path
+model = LazyCall(AquilaForCausalLM)(cfg=cfg)
+
+# datasets
+dataloader = OmegaConf.create()
+dataloader.train = LazyCall(build_nlp_train_loader)(
+    dataset=[
+        LazyCall(AquilaDataset)(
+            path=os.path.join(dataset_path, "train"), tokenizer=tokenization.tokenizer
+        )
+    ],
+)
+dataloader.test = [
+    LazyCall(build_nlp_test_loader)(
+        dataset=LazyCall(AquilaDataset)(
+            path=os.path.join(dataset_path, "test"), tokenizer=tokenization.tokenizer
+        ),
+    ),
+]
+
+train.update(
+    dict(
+        output_dir="./sft_result",
+        train_micro_batch_size=4,
+        test_micro_batch_size=1,
+        train_epoch=5,
+        train_iter=1,
+        log_period=1,
+        warmup_ratio=1 / 3,
+        num_accumulation_steps=8,
+        rdma_enabled=False,
+        train_with_fp16=True,
+        amp=dict(enabled=True),
+        activation_checkpoint=dict(enabled=True),
+        input_placement_device="cuda",
+        checkpointer=dict(
+            period=100,
+            max_to_keep=20,
+        ),
+        dist=dict(
+            data_parallel_size=1,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            pipeline_num_layers=cfg.hidden_layers,
+            device_type="cuda",
+        ),
+        evaluation=dict(
+            enabled=False,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_period=1000,
+            eval_iter=1e5,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+        ),
+    )
+)
diff --git a/projects/Aquila/pipeline.py b/projects/Aquila/pipeline.py
new file mode 100644
index 000000000..eed851a3e
--- /dev/null
+++ b/projects/Aquila/pipeline.py
@@ -0,0 +1,151 @@
+# coding=utf-8
+# Copyright 2021 The OneFlow Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pathlib import Path
+
+import click
+
+from libai.config import try_get_key
+from libai.engine import DefaultTrainer
+from libai.inference.basic import BasePipeline
+from libai.utils import distributed as dist
+
+
+class TextGenerationPipeline(BasePipeline):
+    def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
+        """Load a pretrained model.
+
+        Args:
+            libai_cfg_model (libai.models): Lazy config model in LiBai, which you can import
+                by `from libai.config.configs.common.models.bert
+                    import pretrain_model as libai_cfg_model`
+            model_path (str): The directory path of the pretrained model.
+        """
+        if mode == "huggingface":
+            from projects.Aquila.utils.aquila_loader import AquilaLoaderHuggerFace
+
+            model_loader = AquilaLoaderHuggerFace(
+                libai_cfg_model,
+                libai_cfg_model.cfg,
+                model_path,
+            )
+            model = model_loader.load()
+            model.eval()
+            return model
+
+        elif mode == "libai":
+            from projects.Aquila.utils.aquila_loader import AquilaLoaderLiBai
+
+            model_loader = AquilaLoaderLiBai(
+                libai_cfg_model,
+                libai_cfg_model.cfg,
+                model_path,
+            )
+            model = model_loader.load()
+            model.eval()
+            return model
+
+        elif mode == "random":
+            from libai.engine import DefaultTrainer
+
+            return DefaultTrainer.build_model(self.cfg)
+        else:
+            raise NotImplementedError
+
+    def _parse_parameters(self, **pipeline_parameters):
+        preprocess_params = {}
+        forward_params = {**pipeline_parameters}
+        postprocess_params = {}
+
+        return preprocess_params, forward_params, postprocess_params
+
+    def preprocess(self, inputs, **kwargs) -> dict:
+        # tokenizer encode
+        import oneflow as flow
+
+        inputs = flow.tensor(self.tokenizer.encode(inputs, add_bos=True, padding=True))
+
+        inputs = {
+            "input_ids": inputs,
+        }
+
+        return inputs
+
+    def forward(self, inputs, **kwargs) -> dict:
+        inputs = dist.convert_to_distributed_default_setting(inputs["input_ids"])
+        outputs = self.model.generate(inputs, max_length=50, **kwargs)
+        return {"return_ids": outputs}
+
+    def postprocess(self, model_output_dict, **kwargs) -> dict:
+        return_ids = model_output_dict["return_ids"]
+
+        records = [
+            {"generated_text": self.tokenizer.decode(return_ids[i])}
+            for i in range(return_ids.size(0))
+        ]
+        return records
+
+    def build_tokenizer(self, cfg):
+        tokenizer = None
+        if try_get_key(cfg, "tokenization") is not None:
+            tokenizer_cfg = cfg.tokenization.tokenizer
+            if "vocab_file" not in tokenizer_cfg:
+                # If "vocab_file" is not set in the tokenizer's config,
+                # default it to f"{model_path}/vocab.json"
+                tokenizer_cfg.vocab_file = str(Path(self.model_path).joinpath("vocab.json"))
+            if "merges_file" not in tokenizer_cfg:
+                # If "merges_file" is not set in the tokenizer's config,
+                # default it to f"{model_path}/merges.txt"
+                tokenizer_cfg.merges_file = str(Path(self.model_path).joinpath("merges.txt"))
+            tokenizer = DefaultTrainer.build_tokenizer(cfg)
+        return tokenizer
+
+
+@click.command()
+@click.option(
+    "--config_file",
+    default="projects/Aquila/configs/aquila_config.py",
+    help="Path to the configuration file.",
+)
+@click.option("--model_path", default=None, help="Path to the model checkpoint.")
+@click.option(
+    "--mode",
+    default="libai",
+    help="Mode for the dataloader pipeline, e.g., 'libai' or 'huggingface'.",
+)
+@click.option(
+    "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'."
+)
+def main(config_file, model_path, mode, device):
+    pipeline = TextGenerationPipeline(
+        config_file,
+        data_parallel=1,
+        tensor_parallel=1,
+        pipeline_parallel=1,
+        pipeline_num_layers=32,
+        model_path=model_path,
+        mode=mode,
+        device=device,
+    )
+
+    text = [
+        "Give three tips for staying healthy.",
+    ]
+    output = pipeline(inputs=text)
+    if dist.is_main_process():
+        print(output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/Aquila/tokenizer.py b/projects/Aquila/tokenizer.py
new file mode 100644
index 000000000..a1357e72a
--- /dev/null
+++ b/projects/Aquila/tokenizer.py
@@ -0,0 +1,247 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import logging
+import os
+import unicodedata
+from functools import lru_cache
+from io import open
+from typing import Optional
+
+import regex as re
+
+from libai.tokenizer.tokenization_base import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "aquila/gpt2-tokenizer": "https://huggingface.co/BAAI/Aquila-7B/resolve/main/vocab.json"
+    },
+    "merges_file": {
+        "aquila/gpt2-tokenizer": "https://huggingface.co/BAAI/Aquila-7B/resolve/main/merges.txt"
+    },
+}
+
+MAX_MODEL_INPUT_SIZES = {"aquila/gpt2-tokenizer": 2048}
+
+PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""  # noqa: E501
+
+
+@lru_cache()
+def bytes_to_unicode():
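+    """Return the GPT-2 byte-to-unicode map used by byte-level BPE: every possible byte is
+    mapped to a printable unicode character so BPE can operate on a reversible alphabet."""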
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2 ** 8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2 ** 8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+class AquilaTokenizer(PreTrainedTokenizer):
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = MAX_MODEL_INPUT_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token=None,
+        eos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        model_max_length=None,
+        **kwargs,
+    ):
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        self.model_max_length = model_max_length
+        bpe_merges = []
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            for line in merges_handle:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                bpe_merges.append(tuple(line.split()))
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+
+        self.pat = re.compile(PRETOKENIZE_REGEX)
+
+        super(AquilaTokenizer, self).__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text, **kwargs):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) to a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    def decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        return super().decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
+        )
+        merge_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"],
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return (vocab_file, merge_file)
+
+    def prepare_for_tokenization(self, text, **kwargs):
+        text = unicodedata.normalize("NFC", text)
+        return (text, kwargs)
diff --git a/projects/Aquila/utils/aquila_loader.py b/projects/Aquila/utils/aquila_loader.py
new file mode 100644
index 000000000..bae4c6cea
--- /dev/null
+++ b/projects/Aquila/utils/aquila_loader.py
@@ -0,0 +1,106 @@
+# coding=utf-8
+# Copyright 2021 The OneFlow Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+import oneflow as flow
+
+from libai.models.utils.model_loader.base_loader import ModelLoaderHuggerFace, ModelLoaderLiBai
+
+
+class AquilaLoaderHuggerFace(ModelLoaderHuggerFace):
+    def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
+        super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
+
+        self.base_model_prefix_1 = "model"
+        self.base_model_prefix_2 = "model"
+
+    def _convert_state_dict(self, flow_state_dict, cfg):
+        """Convert state_dict's keys to match model.
+
+        Args:
+            flow_state_dict (OrderedDict): model state dict.
+            cfg (dict): model's default config dict in LiBai.
+
+        Returns:
+            OrderedDict: flow state dict.
+        """
+        # The converted checkpoint.
+        oneflow_state_dict = flow_state_dict.copy()
+        old_keys = list(oneflow_state_dict.keys())
+
+        # Get configs
+        num_attention_heads = cfg.get("num_attention_heads")
+        hidden_size = cfg.get("hidden_size")
+        head_size = int(hidden_size // num_attention_heads)
+
+        new_key_qkv = "model.layers.{}.self_attn.query_key_value.weight"
+        old_key_qkv = "model.layers.{}.self_attn.{}.weight"
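+        # Fuse the separate q/k/v projection weights of each layer into LiBai's
+        # single query_key_value weight, reordering heads to match LiBai's layout.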
+        for layer_idx in range(cfg.get("hidden_layers")):
+            query = old_key_qkv.format(layer_idx, "q_proj")
+            key = old_key_qkv.format(layer_idx, "k_proj")
+            value = old_key_qkv.format(layer_idx, "v_proj")
+            q = oneflow_state_dict[query]
+            k = oneflow_state_dict[key]
+            v = oneflow_state_dict[value]
+            qkv = flow.cat([q, k, v], dim=0)
+            qkv = self._fix_qkv_ordering(qkv, head_size, num_attention_heads, hidden_size)
+            oneflow_state_dict[new_key_qkv.format(layer_idx)] = qkv
+            oneflow_state_dict.pop(query)
+            oneflow_state_dict.pop(key)
+            oneflow_state_dict.pop(value)
+
+        for k in old_keys:
+            if "inv_freq" in k:
+                oneflow_state_dict.pop(k)
+
+        return oneflow_state_dict
+
+    def _load_config_from_json(self, config_file):
+        """load config from `config.json`, and update default config.
+
+        Args:
+            config_file (str): Path of config file.
+        """
+        with open(config_file, mode="r", encoding="utf-8") as f:
+            cfg_dict = json.load(f)
+
+        # update libai_cfg by config.json
+        self._update_cfg("hidden_layers", cfg_dict["num_hidden_layers"])
+        self._update_cfg("hidden_size", cfg_dict["hidden_size"])
+        self._update_cfg("num_attention_heads", cfg_dict["num_attention_heads"])
+        self._update_cfg("max_position_embeddings", cfg_dict["max_position_embeddings"])
+        self._update_cfg("intermediate_size", cfg_dict["intermediate_size"])
+        self._update_cfg("rms_norm_eps", cfg_dict["rms_norm_eps"])
+        self._update_cfg("vocab_size", cfg_dict["vocab_size"])
+        self._update_cfg("initializer_range", cfg_dict["initializer_range"])
+        self._update_cfg(
+            "ffn_hidden_size",
+            cfg_dict.get("n_inner")
+            if cfg_dict.get("n_inner") is not None
+            else 4 * self.libai_cfg["hidden_size"],
+        )
+
+        # update libai_cfg by kwargs
+        for k, v in self.kwargs.items():
+            self._update_cfg(k, v)
+
+        self._update_cfg_log()
+
+
+class AquilaLoaderLiBai(ModelLoaderLiBai):
+    def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
+        super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
+        self.base_model_prefix_2 = "model"
diff --git a/projects/Aquila/utils/data_prepare.py b/projects/Aquila/utils/data_prepare.py
new file mode 100644
index 000000000..9e73025f2
--- /dev/null
+++ b/projects/Aquila/utils/data_prepare.py
@@ -0,0 +1,160 @@
+import copy
+import json
+import math
+import os
+from pathlib import Path
+from typing import Optional
+
+import oneflow as flow
+import requests
+from oneflow.utils.data import random_split
+from tqdm import tqdm
+
+from libai.config import instantiate
+from libai.utils.logger import setup_logger
+from projects.Aquila.configs.aquila_sft import tokenization
+
+logger = setup_logger()
+
+
+def prepare(
+    destination_path: Path = Path("./alpaca_data"),
+    checkpoint_dir: Path = Path("/root/models/Aquila-7B"),
+    test_split_fraction: float = 0.03865,  # to get exactly 2000 test samples,
+    seed: int = 42,
+    mask_inputs: bool = False,  # as in alpaca-lora
+    data_file_name: str = "alpaca_data_cleaned_archive.json",
+    data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json",  # noqa
+    ignore_index: int = -100,
+    max_seq_length: Optional[int] = 512,
+) -> None:
+    """Prepare the Alpaca dataset for instruction tuning.
+    The output is a training and test dataset saved as `train.pt` and `test.pt`,
+    which stores the preprocessed and tokenized prompts and labels.
+    """
+    if max_seq_length is None:
+        with open(os.path.join(checkpoint_dir, "config.json"), "r", encoding="utf-8") as file:
+            config = json.load(file)
+            max_seq_length = config["max_position_embeddings"]
+
+    destination_path.mkdir(parents=True, exist_ok=True)
+    data_file_path = destination_path / data_file_name
+    logger.info("Loading data file...")
+    download_if_missing(data_file_path, data_file_url)
+    with open(data_file_path, "r", encoding="utf-8") as file:
+        data = json.load(file)
+
+    logger.info("Loading tokenizer...")
+    tokenizer = instantiate(tokenization.tokenizer)
+
+    # Partition the dataset into train and test
+    num_of_test_samples = math.floor(test_split_fraction * len(data))
+    num_of_train_samples = len(data) - num_of_test_samples
+    train_set, test_set = random_split(
+        data,
+        [num_of_train_samples, num_of_test_samples],
+        generator=flow.Generator().manual_seed(seed),
+    )
+    train_set, test_set = list(train_set), list(test_set)
+
+    logger.info(f"train has {len(train_set):,} samples")
+    logger.info(f"test has {len(test_set):,} samples")
+
+    logger.info("Processing train split ...")
+    train_set = [
+        prepare_sample(
+            example=sample,
+            tokenizer=tokenizer,
+            max_length=max_seq_length,
+        )
+        for sample in tqdm(train_set)
+    ]
+    flow.save(train_set, destination_path / "train")
+
+    logger.info("Processing test split ...")
+    test_set = [
+        prepare_sample(
+            example=sample,
+            tokenizer=tokenizer,
+            max_length=max_seq_length,
+        )
+        for sample in tqdm(test_set)
+    ]
+    flow.save(test_set, destination_path / "test")
+
+    max_length = max([i["input_ids"].shape[0] for i in train_set])
+    logger.info("Max length of training dataset: {}".format(max_length))
+
+
+def download_if_missing(file_path: Path, file_url: str) -> None:
+    """Downloads the raw json data file and saves it in the given destination."""
+    if file_path.exists() and file_path.stat().st_size > 0:
+        return
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(requests.get(file_url).text)
+
+
+def prepare_sample(example: dict, tokenizer, max_length: int) -> dict:
+    """Processes a single sample.
+    Each sample in the dataset consists of:
+    - instruction: A string describing the task
+    - input: A string holding a special input value for the instruction.
+        This only applies to some samples, and in others this is empty.
+    - output: The response string
+    This function processes this data to produce a prompt text and a label for
+    supervised training. The prompt text is formed as a single message including both
+    the instruction and the input. The label/target is the same message but with the
+    response attached.
+    Finally, both the prompt and the label get tokenized. If desired, all tokens
+    in the label that correspond to the original input prompt get masked out (default).
+    """
+    full_prompt = generate_prompt(example)
+    full_prompt_and_response = full_prompt + example["output"]
+
+    prompt = tokenizer.encode(full_prompt, device="cpu")
+    prompt = flow.tensor(prompt, dtype=flow.int64, device="cpu")
+    example = tokenizer.encode(full_prompt_and_response, device="cpu")
+    example = flow.tensor(example, dtype=flow.int64, device="cpu")
+
+    padding = max_length - example.shape[0]
+    if padding > 0:
+        example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1))
+    elif padding < 0:
+        example = example[:max_length]
+    labels = copy.deepcopy(example)
+    labels[: len(prompt)] = -1
+    example_mask = example.ge(0)
+    label_mask = labels.ge(0)
+    example[~example_mask] = 0
+    labels[~label_mask] = -1
+    example = example[:-1]
+    labels = labels[1:]
+    example_mask = flow.where(
+        example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf"))
+    )
+    example_mask = example_mask[:-1]
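+    # At this point `example` and `labels` are (max_length - 1,) int64 tensors;
+    # positions covering the prompt and any padding carry -1 in `labels`, so they
+    # can be ignored when computing the loss.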
+    return {
+        "input_ids": example,
+        "labels": labels,
+    }
+
+
+def generate_prompt(example: dict) -> str:
+    """Generates a standardized message to prompt the model with an instruction, optional input and a
+    'response' field."""
+
+    if example["input"]:
+        return (
+            "Below is an instruction that describes a task, paired with an input that provides further context. "  # noqa
+            "Write a response that appropriately completes the request.\n\n"
+            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"  # noqa
+        )
+    return (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        f"### Instruction:\n{example['instruction']}\n\n### Response:"
+    )
+
+
+if __name__ == "__main__":
+    prepare()
diff --git a/projects/BLOOM/configs/bloom_inference.py b/projects/BLOOM/configs/bloom_inference.py
index 6ee9a1e37..998c00306 100644
--- a/projects/BLOOM/configs/bloom_inference.py
+++ b/projects/BLOOM/configs/bloom_inference.py
@@ -6,6 +6,7 @@
 cfg = dict(
     # model
     vocab_size=250880,
+    max_position_embeddings=512,
     hidden_size=64,
     hidden_layers=2,
     n_head=8,
diff --git a/projects/BLOOM/utils/model_loader.py b/projects/BLOOM/utils/model_loader.py
index b292a362f..0580aa4d3 100644
--- a/projects/BLOOM/utils/model_loader.py
+++ b/projects/BLOOM/utils/model_loader.py
@@ -43,7 +43,7 @@ def _convert_state_dict(self, flow_state_dict, cfg):
 
         # prefix
         has_prefix = any(s.startswith(self.base_model_prefix_1) for s in oneflow_state_dict)
-        prefix2 = "transformer." if has_prefix else ""
+        prefix2 = "transformer." if not has_prefix else ""
 
         # Convert layers.
         for key in old_keys:
@@ -61,8 +61,13 @@ def _load_config_from_json(self, config_file):
             cfg_dict = json.load(f)
 
         self._update_cfg("hidden_layers", cfg_dict["n_layer"])
-        self._update_cfg("hidden_size", cfg_dict["n_embed"])
-        self._update_cfg("n_head", cfg_dict["num_attention_heads"])
+
+        if "n_embed" in cfg_dict.keys():
+            self._update_cfg("hidden_size", cfg_dict["n_embed"])
+            self._update_cfg("n_head", cfg_dict["num_attention_heads"])
+        else:
+            self._update_cfg("hidden_size", cfg_dict["hidden_size"])
+            self._update_cfg("n_head", cfg_dict["n_head"])
 
         # update libai_cfg by config.json
         for k, v in cfg_dict.items():
diff --git a/projects/ChatGLM/README.md b/projects/ChatGLM/README.md
index a2d8446b0..50f558ca7 100644
--- a/projects/ChatGLM/README.md
+++ b/projects/ChatGLM/README.md
@@ -47,3 +47,12 @@ python projects/ChatGLM/pipeline.py
 
 ### ChatGLM Lora Inference
 - set `projects/ChatGLM/configs/chatglm_config.py`, lora_enable=True, same step with no lora.
+
+### npu/xpu/cuda example
+```bash
+python projects/ChatGLM/pipeline.py --model_path=/data0/hf_models/chatglm/chatglm2-6b --mode=huggingface --device=npu
+
+python projects/ChatGLM/pipeline.py --model_path=/root/models/chatglm2-6b/ --mode=huggingface --device=xpu
+
+python projects/ChatGLM/pipeline.py --model_path=/root/models/chatglm2-6b/ --mode=huggingface --device=cuda
+```
diff --git a/projects/ChatGLM/configs/chatglm_config.py b/projects/ChatGLM/configs/chatglm_config.py
index aa97363af..9fec6f3b9 100644
--- a/projects/ChatGLM/configs/chatglm_config.py
+++ b/projects/ChatGLM/configs/chatglm_config.py
@@ -23,6 +23,7 @@
     layernorm_epsilon=1e-05,
     multi_query_attention=True,
     multi_query_group_num=2,
+    max_position_embeddings=2048,
     num_attention_heads=32,
     num_layers=28,
     padded_vocab_size=65024,
@@ -60,7 +61,7 @@
     output_scores=False,
     output_hidden_states=False,
     # train
-    pretrained_model_path=os.environ["CHATGLM_HF_DIR"],
+    pretrained_model_path="chatglm/chatglm2-6b",
     # lora_cfg
     lora_enable=False,
     lora_cfg=dict(
@@ -85,6 +86,4 @@
 model = LazyCall(ChatGLMForConditionalGeneration)(cfg=cfg)
 tokenization = OmegaConf.create()
 tokenization.make_vocab_size_divisible_by = 1
-tokenization.tokenizer = LazyCall(ChatGLMTokenizer)(
-    vocab_file=f"{os.environ['CHATGLM_HF_DIR']}/tokenizer.model"
-)
+tokenization.tokenizer = LazyCall(ChatGLMTokenizer)()
diff --git a/projects/ChatGLM/lora/layers.py b/projects/ChatGLM/lora/layers.py
index 7fd54feb9..eaa9b85a3 100644
--- a/projects/ChatGLM/lora/layers.py
+++ b/projects/ChatGLM/lora/layers.py
@@ -18,7 +18,7 @@
 import math
 import warnings
 from abc import ABC
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import oneflow as flow
 import oneflow.nn as nn
@@ -41,18 +41,18 @@ class BaseTunerLayer(ABC):
     active_adapter = None
 
     # All names of layers that may contain adapter (trainable) weights
-    adapter_layer_names: tuple[str] = ()
+    adapter_layer_names: Tuple[str, ...] = ()
     # All names of other parameters that may contain adapter-related parameters
-    other_param_names: tuple[str] = ()
+    other_param_names: Tuple[str, ...] = ()
 
     # indicates whether all adapters should be disabled
     _disable_adapters: bool = False
 
     # the currently active adapter(s)
-    _active_adapter: str | list[str] = "default"
+    _active_adapter: Union[str, List[str]] = "default"
 
     # List all merged adapters
-    merged_adapters: list[str] = []
+    merged_adapters: List[str] = []
 
     def get_base_layer(self) -> nn.Module:
         """
@@ -72,7 +72,7 @@ def weight(self) -> flow.Tensor:
         weight = base_layer.weight
         return weight
 
-    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
+    def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
         raise NotImplementedError
 
     def unmerge(self) -> None:
@@ -119,7 +119,7 @@ def enable_adapters(self, enabled: bool) -> None:
                 layer.requires_grad_(False)
             self._disable_adapters = True
 
-    def set_adapter(self, adapter_names: str | list[str]) -> None:
+    def set_adapter(self, adapter_names: Union[str, List[str]]) -> None:
         """Set the active adapter(s).
 
         Args:
@@ -142,7 +142,7 @@ def set_adapter(self, adapter_names: str | list[str]) -> None:
 
         self._active_adapter = adapter_names
 
-    def _all_available_adapter_names(self) -> list[str]:
+    def _all_available_adapter_names(self) -> List[str]:
         """Return a sorted list of all available adapter names"""
         adapter_names = set()
         for name in self.adapter_layer_names + self.other_param_names:
diff --git a/projects/ChatGLM/lora/lora_model.py b/projects/ChatGLM/lora/lora_model.py
index 2a19c6675..941289ba0 100644
--- a/projects/ChatGLM/lora/lora_model.py
+++ b/projects/ChatGLM/lora/lora_model.py
@@ -22,7 +22,7 @@
 from dataclasses import asdict
 from enum import Enum
 from itertools import chain
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
 
 from oneflow import nn
 from tqdm import tqdm
@@ -50,7 +50,7 @@ def __init__(self, model, peft_config, adapter_name: str) -> None:
         self.inject_adapter(self.model, adapter_name)
 
     @property
-    def active_adapters(self) -> list[str]:
+    def active_adapters(self) -> List[str]:
         if isinstance(self.active_adapter, str):
             return [self.active_adapter]
         # is already a list of str
@@ -192,7 +192,7 @@ def inject_adapter(self, model: nn.Module, adapter_name: str):
                 if adapter_name in n:
                     p.requires_grad = False
 
-    def merge_adapter(self, safe_merge=False, adapter_names: Optional[list[str]] = None) -> None:
+    def merge_adapter(self, safe_merge=False, adapter_names: Optional[List[str]] = None) -> None:
         """
         This method merges the adapter layers into the base model.
 
@@ -404,7 +404,7 @@ def disable_adapter_layers(self) -> None:
                 warnings.warn(msg)
         self._set_adapter_layers(enabled=False)
 
-    def set_adapter(self, adapter_name: str | list[str]) -> None:
+    def set_adapter(self, adapter_name: Union[str, List[str]]) -> None:
         """Set the active adapter(s).
 
         Args:
diff --git a/projects/ChatGLM/lora/utils.py b/projects/ChatGLM/lora/utils.py
index a1c195547..648ad510a 100644
--- a/projects/ChatGLM/lora/utils.py
+++ b/projects/ChatGLM/lora/utils.py
@@ -15,14 +15,14 @@
 # limitations under the License.
 
 import re
-from typing import List
+from typing import List, Optional, Union
 
 import oneflow as flow
 
 COMMON_LAYERS_PATTERN = ["layers", "h", "block", "blocks", "layer"]
 
 
-def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None:
+def check_target_module_exists(config, key: str) -> Union[bool, Optional[re.Match]]:
     """A helper method to check if the passed module's key name matches
        any of the target modules in the adapter_config.
 
diff --git a/projects/ChatGLM/pipeline.py b/projects/ChatGLM/pipeline.py
index 0505cc855..c238f46dc 100644
--- a/projects/ChatGLM/pipeline.py
+++ b/projects/ChatGLM/pipeline.py
@@ -12,8 +12,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
+from pathlib import Path
+from typing import Union
 
+import click
+
+from libai.config import try_get_key
+from libai.engine import DefaultTrainer
 from libai.inference.basic import BasePipeline
 from libai.utils import distributed as dist
 
@@ -81,7 +86,7 @@ def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
             return model
 
         elif mode == "random":
-            from libai.engine import DefaultTrainer
+            # from libai.engine import DefaultTrainer
 
             return DefaultTrainer.build_model(self.cfg)
         else:
@@ -94,19 +99,32 @@ def _parse_parameters(self, **pipeline_parameters):
 
         return preprocess_params, forward_params, postprocess_params
 
-    def preprocess(self, sentence: str | list, **kwargs) -> dict:
+    def preprocess(self, sentence: Union[str, list], **kwargs) -> dict:
         #
         if type(sentence) is str:
             inputs = {
                 "inputs": sentence,
             }
         else:
-            inputs = self.tokenizer.encode(sentence, return_tensors="of", is_global=True)
+            inputs = self.tokenizer.encode(
+                sentence, return_tensors="of", is_global=True, device=self.device
+            )
             inputs = {
                 "input_ids": inputs,
             }
         return inputs
 
+    def build_tokenizer(self, cfg):
+        tokenizer = None
+        if try_get_key(cfg, "tokenization") is not None:
+            tokenizer_cfg = cfg.tokenization.tokenizer
+            if "vocab_file" not in tokenizer_cfg:
+                # If "vocab_file" does not exist in the tokenizer's config,
+                # set it to default as f"{model_path}/tokenizer.model"
+                tokenizer_cfg.vocab_file = str(Path(self.model_path).joinpath("tokenizer.model"))
+            tokenizer = DefaultTrainer.build_tokenizer(cfg)
+        return tokenizer
+
     def forward(self, inputs, **kwargs) -> dict:
         if "input_ids" not in inputs:
             if "history" in kwargs:
@@ -143,85 +161,53 @@ def reset_conversation(self):
         self.history = []
 
 
-if __name__ == "__main__":
-    # ----- load huggingface checkpoint -----
+@click.command()
+@click.option(
+    "--config_file",
+    default="projects/ChatGLM/configs/chatglm_config.py",
+    help="Path to the configuration file.",
+)
+@click.option("--model_path", default=None, help="Path to the model checkpoint.")
+@click.option(
+    "--mode",
+    default="libai",
+    help="Mode for the dataloader pipeline, e.g., 'libai' or 'huggingface'.",
+)
+@click.option(
+    "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'."
+)
+def main(config_file, model_path, mode, device):
     text = "浏览器输入www.baidu.com 并且显示网页,从计算机网络的角度说明实现的全过程"
     text2 = (
         "5600分为A、B、C三部分,如果A比C的比例是1/7:1/7:1/14,那么A比C多多少?\n"
         "选项:\n(A) 300\n(B) 992 \n(C) 1120\n(D) 552\n(E) 312 让我们先想想。一些随机推理:"
     )
     texts = [
+        text,
+        text2,
         "a dog is flying on the sky",
         "Wikipedia is a free online",
         "what is beam search?",
         "what is beam search?",
     ]
     pipeline = TextGenerationPipeline(
-        "projects/ChatGLM/configs/chatglm_config.py",
+        config_file,
         data_parallel=1,
         tensor_parallel=1,
         pipeline_parallel=1,
         pipeline_num_layers=28,
-        model_path=os.environ["CHATGLM_HF_DIR"],
-        mode="huggingface",
+        model_path=model_path,
+        mode=mode,
+        device=device,
     )
     pipeline.model = pipeline.model.half()
 
     if isinstance(texts, list):
-        output = pipeline(inputs=texts, do_sample=False, max_length=50)
+        output = pipeline(inputs=texts, do_sample=False, max_length=400)
         if dist.is_main_process():
             for text, record in zip(texts, output):
                 print(f"Q:{text}||A:{record}")
 
-    # if isinstance(text, str):
-    #     output = pipeline(inputs=text, do_sample=False, max_length=400)
-    #     if dist.is_main_process():
-    #         for record in output:
-    #             print(record["generated_text"])
-    #     pipeline.reset_conversation()
-    #     output = pipeline(inputs=text2, do_sample=False, max_length=400)
-    #     if dist.is_main_process():
-    #         for record in output:
-    #             print(record["generated_text"])
-
-    # # ----- load libai checkpoint -----
-    # pipeline = TextGenerationPipeline(
-    #     "projects/ChatGLM/configs/chatglm_config.py",
-    #     data_parallel=1,
-    #     tensor_parallel=1,
-    #     pipeline_parallel=1,
-    #     pipeline_num_layers=28,
-    #     model_path="/home/lixin/codes/libai/lora_sft_result/model_final/model",
-    #     mode="libai",
-    # )
-    # pipeline.model = pipeline.model.half()
-
-    # if isinstance(texts, list):
-    #     output = pipeline(inputs=texts, do_sample=False, max_length=50)
-    #     if dist.is_main_process():
-    #         for text, record in zip(texts, output):
-    #             print(f"Q:{text}||A:{record}")
-
-    # if isinstance(text, str):
-    #     output = pipeline(inputs=text, do_sample=False, max_length=400)
-    #     if dist.is_main_process():
-    #         for record in output:
-    #             print(record['generated_text'])
-    #     pipeline.reset_conversation()
-    #     output = pipeline(inputs=text2, do_sample=False, max_length=400)
-    #     if dist.is_main_process():
-    #         for record in output:
-    #             print(record['generated_text'])
-
-    # ----- pure huggingface predict -----
-    # from transformers import AutoModel, AutoTokenizer
-
-    # tokenizer = AutoTokenizer.from_pretrained(glm_model_path, trust_remote_code=True)
-    # model = AutoModel.from_pretrained(glm_model_path, trust_remote_code=True).half().cuda()
-    # model = model.eval()
-    # history = []
-    # for _ in range(1):
-    #     response, history = model.chat(
-    #         tokenizer, text, history=history, do_sample=False, max_length=400
-    #     )
-    #     print(response)
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/Eval_LLM/README.md b/projects/Eval_LLM/README.md
new file mode 100644
index 000000000..7cf0af530
--- /dev/null
+++ b/projects/Eval_LLM/README.md
@@ -0,0 +1,49 @@
+# LLM Evaluation
+
+A tool for evaluating OneFlow models based on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/)
+
+## Environment
+
+Follow this [Installation Instruction](https://libai.readthedocs.io/en/latest/tutorials/get_started/Installation.html) to install OneFlow (1.0.0) and LiBai first. Conda is recommended.  
+**Make sure you have Python >= 3.10 to run evaluation for GLM.**
+Then run `pip install -r ./projects/Eval_LLM/requirements.txt` to install the dependencies.
+
+## Run Eval
+
+### Set the parameters in ./projects/Eval_LLM/config.py
+
+- `pretrained_model_path`: The path to your model weights; either HuggingFace or LiBai weights are accepted.
+- `hf_tokenizer_path`: The path to the HuggingFace tokenizer.
+- `model_type`: The type of your model; this argument is needed for loading the model. All choices are listed in ./projects/Eval_LLM/special_arguments.json.
+- `model_weight_type`: Whether your weights are HuggingFace weights or LiBai weights.
+- `eval_tasks`: The tasks you want to evaluate your model on.
+- `batch_size_per_gpu`: Batch size on a single GPU. Increasing it speeds up evaluation but may lead to an OOM error.
+
+Tasks for Evaluation are listed [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks).
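+
+As a minimal sketch, an `eval_config` for evaluating a HuggingFace Llama checkpoint might look like the following (the paths are illustrative placeholders; replace them with your own checkpoint and tokenizer directories):
+
+```python
+from omegaconf import DictConfig
+
+eval_config = DictConfig(
+    dict(
+        pretrained_model_path="/path/to/Llama-2-7b-hf",  # model weights
+        hf_tokenizer_path="/path/to/Llama-2-7b-hf",  # huggingface tokenizer directory
+        model_type="llama",  # see ./projects/Eval_LLM/special_arguments.json
+        model_weight_type="huggingface",  # "huggingface" or "libai"
+        eval_tasks=["lambada_openai", "gsm8k"],
+        batch_size_per_gpu=1,
+    )
+)
+```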
+
+### Run the following command to start eval
+```
+bash tools/infer.sh projects/Eval_LLM/main.py 1
+```
+Note: The number stands for how many GPUs you want to use.
+
+If you want to evaluate GLM (ChatGLM), run this:
+```
+CHATGLM_HF_DIR=YOUR_MODEL_PATH bash tools/infer.sh projects/Eval_LLM/main.py 1
+```
+
+Note: Running a model with 6B parameters requires more than 24GB of VRAM. You can use tensor or pipeline parallelism across multiple devices.
+
+To learn more about distributed inference, see: https://docs.oneflow.org/en/master/parallelism/04_launch.html
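+
+For example, a sketch of a 2-GPU tensor-parallel setting in `./projects/Eval_LLM/config.py` (illustrative values; keep `pipeline_num_layers` consistent with your model's layer count):
+
+```python
+from omegaconf import DictConfig
+
+parallel_config = DictConfig(
+    dict(
+        data_parallel_size=1,
+        tensor_parallel_size=2,  # split each layer across 2 GPUs
+        pipeline_parallel_size=1,
+        pipeline_num_layers=32,  # must match the model's number of layers
+        device_type="cuda",
+    )
+)
+```
+
+Then launch with `bash tools/infer.sh projects/Eval_LLM/main.py 2` so the number of processes matches the configured parallel sizes.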
+
+## Example of Eval Result
+Using Llama2-7b
+```
+{'sciq': 
+    {'acc,none': 0.794, 'acc_stderr,none': 0.012795613612786583, 'acc_norm,none': 0.707, 'acc_norm_stderr,none': 0.014399942998441271, 'alias': 'sciq'}, 
+'lambada_openai': 
+    {'perplexity,none': 28.778403569948463, 'perplexity_stderr,none': 1.0792474430271395, 'acc,none': 0.33980205705414324, 'acc_stderr,none': 0.006598757339311441, 'alias': 'lambada_openai'}, 
+'gsm8k': 
+    {'exact_match,strict-match': 0.001516300227445034, 'exact_match_stderr,strict-match': 0.0010717793485492675, 'exact_match,flexible-extract': 0.01061410159211524, 'exact_match_stderr,flexible-extract': 0.002822713322387704, 'alias': 'gsm8k'}
+}
+```
\ No newline at end of file
diff --git a/projects/Eval_LLM/config.py b/projects/Eval_LLM/config.py
new file mode 100644
index 000000000..cb1d180a3
--- /dev/null
+++ b/projects/Eval_LLM/config.py
@@ -0,0 +1,22 @@
+from omegaconf import DictConfig
+
+parallel_config = DictConfig(
+    dict(
+        data_parallel_size=1,
+        tensor_parallel_size=1,
+        pipeline_parallel_size=1,
+        pipeline_num_layers=32,
+        device_type="cuda",
+    )
+)
+
+eval_config = DictConfig(
+    dict(
+        pretrained_model_path="",
+        hf_tokenizer_path="",
+        model_type="llama",
+        model_weight_type="libai",  # libai or huggingface
+        eval_tasks=["lambada_openai", "gsm8k"],
+        batch_size_per_gpu=1,
+    )
+)
diff --git a/projects/Eval_LLM/eval_harness.py b/projects/Eval_LLM/eval_harness.py
new file mode 100644
index 000000000..814e0a01e
--- /dev/null
+++ b/projects/Eval_LLM/eval_harness.py
@@ -0,0 +1,342 @@
+import json
+import os
+from pathlib import Path
+from typing import Dict, List, Optional, TypeVar
+
+import oneflow as flow
+import oneflow.nn.functional as F
+
+flow.mock_torch.enable(lazy=True)
+
+import oneflow as torch  # noqa
+from lm_eval import evaluator, tasks, utils  # noqa
+from lm_eval.api.model import LM  # noqa
+from lm_eval.models.utils import chunks  # noqa
+from tqdm import tqdm  # noqa
+
+import libai.utils.distributed as dist  # noqa
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+T = TypeVar("T")
+
+
+class EvalHarnessBase(LM):
+    def __init__(self, model, tokenizer, model_name, batch_size: int, cfg: dict):
+        super().__init__()
+        self.model = model
+        self.tokenizer = tokenizer
+        self.model_name = model_name
+        self.batch_size_per_gpu = batch_size
+        self.cfg = cfg
+
+    @classmethod
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        pass
+
+    @property
+    def eos_token_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_token_id(self):
+        return self.tokenizer.pad_token_id
+
+    @property
+    def max_length(self):
+        return self.cfg.max_position_embeddings
+
+    @property
+    def vocab_size(self):
+        return self.cfg.vocab_size
+
+    @property
+    def max_gen_toks(self):
+        return self.cfg.get("max_length", 64)
+
+    @property
+    def batch_size(self):
+        return self.batch_size_per_gpu * dist.get_world_size()
+
+    @property
+    def device(self):
+        return flow.device("cuda:0")
+
+    def tok_encode(self, string: str) -> List[int]:
+        return self.tokenizer.encode(string, add_special_tokens=False)
+
+    def tok_decode(self, tokens: List[int]) -> str:
+        return self.tokenizer.decode(tokens)
+
+    def batch_encode(self, strings: List[str]) -> Dict:
+        return self.tokenizer.batch_encode_plus(strings, padding=True)
+
+    @flow.inference_mode()
+    def _model_call(self, inps):
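+        # Convert the local input batch to a global (broadcast) tensor on the first
+        # stage's placement, run the model, and return the logits as a local
+        # float32 tensor for scoring.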
+        inps = inps.to_global(
+            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
+            placement=dist.get_layer_placement(0),
+        )
+        return self.model(inps)["logits"].to_local().to(flow.float32)
+
+    def _model_generate(self, context, max_length, eos_token_id) -> flow.Tensor:
+        context = dist.convert_to_distributed_default_setting(context)
+        out = self.model.generate(
+            context,
+            max_length,
+            eos_token_id=eos_token_id,
+        )
+        return out.unsqueeze(0)
+
+    def loglikelihood(self, requests, disable_tqdm=False):
+        new_reqs = []
+        for request in tqdm(requests, disable=disable_tqdm):
+            context, continuation = request.arguments
+            if context == "":
+                # end of text as context
+                context_enc = [self.eos_token_id]
+            else:
+                context_enc = self.tok_encode(context)
+
+            continuation_enc = self.tok_encode(continuation)[: self.max_length]
+
+            new_reqs.append(((context, continuation), context_enc, continuation_enc))
+        return self._loglikelihood_tokens(new_reqs)
+
+    def loglikelihood_rolling(self, requests):
+        # TODO: Implement caching once we've confirmed the perplexity implementation
+        # TODO: automatic batch size detection for vectorization
+
+        loglikelihoods = []
+        for (string,) in tqdm(requests):
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tok_encode(string),
+                        prefix_token=self.eos_token_id,
+                        max_seq_len=self.max_length,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
+
+            # discard is_greedy
+            string_nll = [x[0] for x in string_nll]
+
+            string_nll = sum(string_nll)
+            loglikelihoods.append(string_nll)
+
+        return loglikelihoods
+
+    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
+        res = []
+
+        def _collate(x):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates,
+            #   which is more useful for planning
+            # - to know the size of a batch when going through the list,
+            #   you know the first one is always the batch
+            #   padded context length. this is useful to simplify
+            #   the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+
+            toks = x[1] + x[2]
+            return -len(toks), tuple(toks)
+
+        # TODO: automatic (variable) batch size detection for vectorization
+        re_ord = utils.Reorderer(requests, _collate)
+        for chunk in chunks(tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size):
+            inps = []
+            cont_toks_list = []
+            inplens = []
+
+            padding_length = None
+
+            # because vectorizing is annoying,
+            # we first convert each (context, continuation) pair to padded tensors,
+            # then we pack them together into a batch, call the model,
+            # and then pick it all apart again because vectorizing is annoying
+
+            for _, context_enc, continuation_enc in chunk:
+                # sanity check
+                assert len(context_enc) > 0
+                assert len(continuation_enc) > 0
+                assert len(continuation_enc) <= self.max_length
+
+                # how this all works:
+                #          CTX      CONT
+                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+                # gpt2    \               \
+                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+                # cont_toks      4 5 6 7 8 9   [:, -len(continuation_enc):, :self.vocab_size] slice
+
+                # when too long to fit in context, truncate from the left
+                inp = torch.tensor(
+                    (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
+                    dtype=torch.long,
+                ).to(self.device)
+                (inplen,) = inp.shape
+
+                cont = continuation_enc
+
+                # since in _collate we make sure length is descending,
+                # the longest is always the first one.
+                padding_length = padding_length if padding_length is not None else inplen
+
+                # pad length from seq to padding_length
+                inp = torch.cat(
+                    [
+                        inp,  # [seq]
+                        torch.zeros(padding_length - inplen, dtype=torch.long).to(
+                            inp.device
+                        ),  # [padding_length - seq]
+                    ],
+                    dim=0,
+                )
+
+                inps.append(inp.unsqueeze(0))  # [1, padding_length]
+                cont_toks_list.append(cont)
+                inplens.append(inplen)
+
+            batched_inps = torch.cat(inps, dim=0)  # [batch, padding_length]
+            multi_logits = F.log_softmax(
+                self._model_call(batched_inps), dim=-1
+            ).cpu()  # [batch, padding_length, vocab]
+
+            for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(
+                chunk, multi_logits, inps, inplens, cont_toks_list
+            ):
+
+                # Slice to original seq length
+                contlen = len(cont_toks)
+                logits = logits[inplen - contlen : inplen].unsqueeze(0)  # [1, seq, vocab]
+
+                # Check if per-token argmax is exactly equal to continuation
+                greedy_tokens = logits.argmax(dim=-1)
+                cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)  # [1, seq]
+                max_equal = (greedy_tokens == cont_toks).all()
+
+                # Obtain log-probs at the corresponding continuation token indices
+                # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
+                logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)  # [1, seq]
+
+                # Answer: (log prob, is-exact-match)
+                answer = (float(logits.sum()), bool(max_equal))
+
+                # partial caching
+                if cache_key is not None:
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+
+                res.append(answer)
+
+        return re_ord.get_original(res)
+
+    def generate_until(self, requests, disable_tqdm=False) -> List[str]:
+        res = []
+
+        for chunk in chunks(
+            tqdm(requests, disable=disable_tqdm, desc="Running generate_until requests"),
+            self.batch_size,
+        ):
+            _, until = chunk[0].arguments
+            if isinstance(until, dict):
+                until = until["until"]
+            if isinstance(until, str):
+                until = [until]
+            primary_until = self.tok_encode(until[0])
+            reqs = []
+            for request in chunk:
+                reqs.append(request.arguments[0])
+            context_enc = torch.tensor(self.batch_encode(reqs)["input_ids"]).to(self.device)[
+                :, self.max_gen_toks - self.max_length :
+            ]
+            cont = self._model_generate(
+                context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until[0]
+            )
+
+            for i in range(cont[0].shape[0]):
+                s = self.tok_decode(cont[0].tolist()[i][context_enc.shape[1] :])
+                for term in until:
+                    s = s.split(term)[0]
+
+                res.append(s)
+        return res
+
+    @flow.inference_mode()
+    def run_eval(
+        self,
+        eval_tasks: List[str],
+        limit: Optional[int],
+        bootstrap_iters: int,
+    ) -> Dict:
+        import fnmatch
+
+        task_manager = tasks.TaskManager()
+        all_tasks = task_manager.all_tasks
+
+        def pattern_match(patterns, source_list):
+            task_names = set()
+            for pattern in patterns:
+                for matching in fnmatch.filter(source_list, pattern):
+                    task_names.add(matching)
+            task_names = list(task_names)
+            task_names.sort()
+            return task_names
+
+        eval_tasks = pattern_match(eval_tasks, all_tasks)
+        print(f"Found tasks: {eval_tasks}")
+
+        if dist.is_main_process():
+            tasks.get_task_dict(eval_tasks)
+        dist.synchronize()
+
+        lm = self
+        results = evaluator.evaluate(
+            lm=lm,
+            task_dict=tasks.get_task_dict(task_name_list=eval_tasks),
+            limit=limit,
+            bootstrap_iters=bootstrap_iters,
+        )
+        results["config"] = dict(
+            model=self.model_name,
+            batch_size=self.batch_size,
+            device=str(self.device),
+            limit=limit,
+            bootstrap_iters=bootstrap_iters,
+        )
+        return results
+
+
+@flow.inference_mode()
+def run_eval_harness(
+    model,
+    tokenizer,
+    model_name,
+    eval_tasks: List[str] = [
+        "hellaswag",
+    ],
+    batch_size_per_gpu: int = 1,
+    save_filepath: Optional[Path] = None,
+    limit: Optional[int] = None,
+    bootstrap_iters: int = 100000,
+    dtype=flow.float16,
+    cfg=None,
+):
+    model.eval()
+    model = model.to(dtype)
+    with flow.no_grad():
+        eval_harness = EvalHarnessBase(model, tokenizer, model_name, batch_size_per_gpu, cfg)
+        results = eval_harness.run_eval(eval_tasks, limit, bootstrap_iters)
+    if save_filepath is None:
+        print(results["results"])
+    else:
+        print(f"Saving results to {str(save_filepath)!r}")
+        data = json.dumps(results)
+        with open(save_filepath, "w") as fw:
+            fw.write(data)
diff --git a/projects/Eval_LLM/main.py b/projects/Eval_LLM/main.py
new file mode 100644
index 000000000..487dfe975
--- /dev/null
+++ b/projects/Eval_LLM/main.py
@@ -0,0 +1,86 @@
+import importlib
+import json
+
+from transformers import AutoTokenizer as HF_AutoTokenizer
+
+import libai.utils.distributed as dist  # noqa
+from libai.config import LazyConfig
+from libai.models.utils.model_loader.base_loader import ModelLoaderLiBai  # noqa
+
+
+class LLMLoaderLibai(ModelLoaderLiBai):
+    def __init__(self, model, libai_cfg, pretrained_model_path, base_model_prefix, **kwargs):
+        super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
+        self.base_model_prefix_2 = base_model_prefix
+
+
+def get_special_arguments(cfg):
+    with open("./projects/Eval_LLM/special_arguments.json", "r") as f:
+        arguments = json.load(f)
+    special_arguments = arguments[cfg.eval_config.model_type]
+    return special_arguments
+
+
+def main():
+    cfg = LazyConfig.load("./projects/Eval_LLM/config.py")
+    dist.setup_dist_util(cfg.parallel_config)
+    special_arguments = get_special_arguments(cfg)
+    print("Loading Model...")
+    model_cfg = LazyConfig.load(special_arguments["config_path"])
+    if model_cfg.cfg.max_position_embeddings is None:
+        model_cfg.cfg.max_position_embeddings = 1024
+
+    model_class = getattr(
+        importlib.import_module(special_arguments["model_class_prefix"]),
+        special_arguments["model_class"],
+    )
+
+    assert cfg.eval_config.model_weight_type in [
+        "huggingface",
+        "libai",
+    ], "model_weight_type must be huggingface or libai"
+    if cfg.eval_config.model_weight_type == "huggingface":
+        huggingface_loader = getattr(
+            importlib.import_module(special_arguments["huggingface_loader_prefix"]),
+            special_arguments["huggingface_loader"],
+        )
+        load_func = huggingface_loader(
+            model=model_class,
+            libai_cfg=model_cfg.cfg,
+            pretrained_model_path=cfg.eval_config.pretrained_model_path,
+        )
+    else:
+        load_func = LLMLoaderLibai(
+            model=model_class,
+            libai_cfg=model_cfg.cfg,
+            pretrained_model_path=cfg.eval_config.pretrained_model_path,
+            base_model_prefix=special_arguments["base_model_prefix_2"],
+        )
+
+    tokenizer = HF_AutoTokenizer.from_pretrained(
+        cfg.eval_config.hf_tokenizer_path, trust_remote_code=True
+    )
+    with open(cfg.eval_config.hf_tokenizer_path + "/config.json", "r") as f:
+        generation_config = json.load(f)
+
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = generation_config["pad_token_id"]
+    if tokenizer.eos_token_id is None:
+        tokenizer.eos_token_id = generation_config["eos_token_id"]
+    model = load_func.load()
+    print("Model Loaded!")
+
+    from projects.Eval_LLM.eval_harness import run_eval_harness  # noqa
+
+    run_eval_harness(
+        model,
+        tokenizer,
+        cfg.eval_config.model_type,
+        eval_tasks=cfg.eval_config.eval_tasks,
+        batch_size_per_gpu=cfg.eval_config.batch_size_per_gpu,
+        cfg=model_cfg.cfg,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/Eval_LLM/requirements.txt b/projects/Eval_LLM/requirements.txt
new file mode 100644
index 000000000..6785b5a10
--- /dev/null
+++ b/projects/Eval_LLM/requirements.txt
@@ -0,0 +1,6 @@
+torch>=2.0.0
+tokenizers
+transformers
+datasets
+huggingface-hub
+lm-eval==0.4.2
\ No newline at end of file
diff --git a/projects/Eval_LLM/special_arguments.json b/projects/Eval_LLM/special_arguments.json
new file mode 100644
index 000000000..2c863fb6f
--- /dev/null
+++ b/projects/Eval_LLM/special_arguments.json
@@ -0,0 +1,32 @@
+{
+    "llama":{
+        "n_layers_hf":"num_hidden_layers",
+        "n_layer_libai":"hidden_layers",
+        "base_model_prefix_2":"model",
+        "config_path":"./projects/Llama/configs/llama_config.py",
+        "model_class_prefix":"projects.Llama.llama",
+        "model_class":"LlamaForCausalLM",
+        "huggingface_loader_prefix":"projects.Llama.utils.llama_loader",
+        "huggingface_loader":"LlamaLoaderHuggerFace"
+    },
+    "bloom":{
+        "n_layers_hf":"n_layer",
+        "n_layer_libai":"hidden_layers",
+        "base_model_prefix_2":"transformer",
+        "config_path":"./projects/BLOOM/configs/bloom_inference.py",
+        "model_class_prefix":"projects.BLOOM.modeling.bloom_model",
+        "model_class":"BloomForCausalLM",
+        "huggingface_loader_prefix":"projects.BLOOM.utils.model_loader",
+        "huggingface_loader":"BlooMLoaderHuggerFace"
+    },
+    "glm":{
+        "n_layers_hf":"num_layers",
+        "n_layer_libai":"num_layers",
+        "base_model_prefix_2":"model",
+        "config_path":"./projects/ChatGLM/configs/chatglm_config.py",
+        "model_class_prefix":"projects.ChatGLM.chatglm",
+        "model_class":"ChatGLMForConditionalGeneration",
+        "huggingface_loader_prefix":"projects.ChatGLM.utils.chatglm_loader",
+        "huggingface_loader":"ChatGLMLoaderHuggerFace"
+    }
+}
\ No newline at end of file
diff --git a/projects/Llama/readme.md b/projects/Llama/README.md
similarity index 84%
rename from projects/Llama/readme.md
rename to projects/Llama/README.md
index 9adb3d925..f58e416c1 100644
--- a/projects/Llama/readme.md
+++ b/projects/Llama/README.md
@@ -44,4 +44,17 @@ python projects/Llama/utils/eval_adapter.py
 - Adjust the parameters in the `projects/Llama/pipeline.py`, and running:
 ```bash
 bash tools/infer.sh projects/Llama/pipeline.py 8
-```
\ No newline at end of file
+```
+
+## npu/xpu example
+
+- npu
+```bash
+python projects/Llama/pipeline.py --device=npu --mode=huggingface --model_path /your/model/path
+```
+
+- xpu
+```bash
+python projects/Llama/pipeline.py --device=xpu --mode=huggingface --model_path /your/model/path
+```
+
diff --git a/projects/Llama/adapter/adapter_config.py b/projects/Llama/adapter/adapter_config.py
index 7381e64af..80f13cb71 100644
--- a/projects/Llama/adapter/adapter_config.py
+++ b/projects/Llama/adapter/adapter_config.py
@@ -11,7 +11,7 @@
     hidden_size=4096,
     initializer_range=0.02,
     intermediate_size=11008,
-    max_position_embeddings=4096,
+    max_position_embeddings=2048,
     num_attention_heads=32,
     hidden_layers=32,
     pretraining_tp=1,
diff --git a/projects/Llama/configs/llama_config.py b/projects/Llama/configs/llama_config.py
index 58b86ecd6..36f95d126 100644
--- a/projects/Llama/configs/llama_config.py
+++ b/projects/Llama/configs/llama_config.py
@@ -12,7 +12,7 @@
     hidden_size=4096,
     initializer_range=0.02,
     intermediate_size=11008,
-    max_position_embeddings=4096,
+    max_position_embeddings=2048,
     num_attention_heads=32,
     hidden_layers=32,
     pretraining_tp=1,
@@ -57,5 +57,5 @@
 tokenization = OmegaConf.create()
 tokenization.make_vocab_size_divisible_by = 1
 tokenization.tokenizer = LazyCall(LlamaTokenizer)(
-    pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model"
+    # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model"
 )
diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py
index bea4a2f56..4b65d2895 100644
--- a/projects/Llama/pipeline.py
+++ b/projects/Llama/pipeline.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import click
+
 from libai.inference.basic import BasePipeline
 from libai.utils import distributed as dist
 
@@ -67,7 +69,7 @@ def _parse_parameters(self, **pipeline_parameters):
 
     def preprocess(self, inputs, **kwargs) -> dict:
         # tokenizer encoderW
-        inputs = self.tokenizer.tokenize(inputs, add_bos=True, padding=True)
+        inputs = self.tokenizer.tokenize(inputs, add_bos=True, padding=True, device=self.device)
         inputs = {
             "input_ids": inputs,
         }
@@ -87,31 +89,31 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
         return records
 
 
-if __name__ == "__main__":
-    # ----- load huggingface checkpoint -----
-    # pipeline = TextGenerationPipeline(
-    #     "projects/Llama/configs/llama_config.py",
-    #     data_parallel=1,
-    #     tensor_parallel=1,
-    #     pipeline_parallel=1,
-    #     pipeline_num_layers=32,
-    #     model_path="",
-    #     mode="huggingface",
-    # )
-
-    # output = pipeline(inputs=text)
-    # if dist.is_main_process():
-    #     print(output)
-
-    # ----- load libai checkpoint -----
+@click.command()
+@click.option(
+    "--config_file",
+    default="projects/Llama/configs/llama_config.py",
+    help="Path to the configuration file.",
+)
+@click.option("--model_path", default=None, help="Path to the model checkpoint.")
+@click.option(
+    "--mode",
+    default="libai",
+    help="Mode for the dataloader pipeline, e.g., 'libai' or 'huggingface'.",
+)
+@click.option(
+    "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'."
+)
+def main(config_file, model_path, mode, device):
     pipeline = TextGenerationPipeline(
-        "projects/Llama/configs/llama_config.py",
+        config_file,
         data_parallel=1,
         tensor_parallel=1,
         pipeline_parallel=1,
         pipeline_num_layers=32,
-        model_path="",
-        mode="libai",
+        model_path=model_path,
+        mode=mode,
+        device=device,
     )
 
     text = [
@@ -120,3 +122,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
     output = pipeline(inputs=text)
     if dist.is_main_process():
         print(output)
+
+
+if __name__ == "__main__":
+    main()
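
Together with the npu/xpu commands earlier in this diff, the click entry point simply forwards its flags into the existing constructor. For programmatic use (run from the repository root; the import path, checkpoint path, and prompt are illustrative), the equivalent call looks like:

```python
from libai.utils import distributed as dist
from projects.Llama.pipeline import TextGenerationPipeline

# Equivalent to: python projects/Llama/pipeline.py --device=npu --mode=huggingface --model_path ...
pipeline = TextGenerationPipeline(
    "projects/Llama/configs/llama_config.py",
    data_parallel=1,
    tensor_parallel=1,
    pipeline_parallel=1,
    pipeline_num_layers=32,
    model_path="/your/model/path",  # placeholder checkpoint path
    mode="huggingface",
    device="npu",
)

output = pipeline(inputs=["Give three tips for staying healthy."])  # example prompt
if dist.is_main_process():
    print(output)
```
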
diff --git a/projects/Llama/tokenizer.py b/projects/Llama/tokenizer.py
index 56aca8336..1598a1dbe 100644
--- a/projects/Llama/tokenizer.py
+++ b/projects/Llama/tokenizer.py
@@ -75,9 +75,9 @@ def tokenize(
         if add_eos:
             tokens = [token + [self.eos_token_id] for token in tokens]
 
-        if device == "cuda":
+        if device:
             sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
-            placement = kwargs.get("placement", flow.placement("cuda", [0]))
+            placement = kwargs.get("placement", flow.placement(device, [0]))
             return_token_ids = flow.tensor(tokens, sbp=sbp, placement=placement, dtype=flow.long)
         else:
             return_token_ids = flow.tensor(tokens, dtype=flow.long)
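
The tokenizer change means any truthy device string, not only "cuda", now yields a global tensor placed on that device. A self-contained sketch of the same placement logic (the token ids are made up):

```python
import oneflow as flow

from libai.utils import distributed as dist


def ids_to_global(tokens, device="cuda"):
    # Mirrors the branch above: a truthy device ("cuda", "npu", "xpu", ...)
    # produces a global tensor on that device; otherwise a local tensor.
    if device:
        sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
        placement = flow.placement(device, [0])
        return flow.tensor(tokens, sbp=sbp, placement=placement, dtype=flow.long)
    return flow.tensor(tokens, dtype=flow.long)


token_ids = ids_to_global([[1, 15043, 3186]])  # made-up token ids
```
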
diff --git a/projects/Llama/utils/llama_loader.py b/projects/Llama/utils/llama_loader.py
index 20b9ba258..c46cb480a 100644
--- a/projects/Llama/utils/llama_loader.py
+++ b/projects/Llama/utils/llama_loader.py
@@ -26,6 +26,8 @@ def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
 
         self.base_model_prefix_1 = "model"
         self.base_model_prefix_2 = "model"
+        if not pretrained_model_path:
+            self.pretrained_model_path = libai_cfg.pretrained_model_path
 
     def _convert_state_dict(self, flow_state_dict, cfg):
         """Convert state_dict's keys to match model.
@@ -104,3 +106,5 @@ class LlamaLoaderLiBai(ModelLoaderLiBai):
     def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
         super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
         self.base_model_prefix_2 = "model"
+        if not pretrained_model_path:
+            self.pretrained_model_path = libai_cfg.pretrained_model_path
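
Both loader additions encode the same fallback rule: an explicit model path (e.g. from --model_path) wins, otherwise the path already present in the LazyConfig is used. Condensed into one illustrative helper (not a project API):

```python
def resolve_pretrained_path(pretrained_model_path, libai_cfg):
    # Prefer the explicitly passed path; fall back to the config value.
    if pretrained_model_path:
        return pretrained_model_path
    return libai_cfg.pretrained_model_path
```
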
diff --git a/projects/mock_transformers/mock_tokenization.py b/projects/mock_transformers/mock_tokenization.py
index ab70cd362..22f42e693 100644
--- a/projects/mock_transformers/mock_tokenization.py
+++ b/projects/mock_transformers/mock_tokenization.py
@@ -16,13 +16,19 @@
 import os
 
 import oneflow as flow
+import oneflow.mock_torch as mock
 
 from libai.utils import distributed as dist
 import oneflow.mock_torch as mock
 
 with mock.enable(lazy=True):
-
-    from transformers import BertTokenizer, GPT2Tokenizer, MT5Tokenizer, T5Tokenizer, Qwen2Tokenizer  # noqa
+    from transformers import (  # noqa
+        BertTokenizer,
+        GPT2Tokenizer,
+        MT5Tokenizer,
+        Qwen2Tokenizer,
+        T5Tokenizer,
+    )
     from transformers.tokenization_utils_base import *  # noqa
     from transformers.utils import generic  # noqa
     from transformers.utils.generic import TensorType  # noqa
@@ -36,10 +42,8 @@ class TensorType(ExplicitEnum):  # noqa
         NUMPY = "np"
         JAX = "jax"
 
-
     generic.TensorType = TensorType
 
-
     # ---------------- mock convert_to_tensors ------------------
     def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False):
         if tensor_type is None:
@@ -74,7 +78,10 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False):
             try:
                 import oneflow  # noqa
             except ImportError as e:
-                msg = "Unable to convert output to OneFlow tensors format, OneFlow is not installed."
+
+                msg = (
+                    "Unable to convert output to OneFlow tensors format, OneFlow is not installed."
+                )
                 raise ImportError(msg) from e
             as_tensor = flow.tensor
             is_tensor = flow.is_tensor
@@ -100,7 +107,8 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False):
                 if not is_tensor(value):
                     tensor = as_tensor(value)
 
-                    # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
+                    # Removing this for now in favor of controlling the shape
+                    # with `prepend_batch_axis`
                     # # at-least2d
                     # if tensor.ndim > 2:
                     #     tensor = tensor.squeeze(0)
@@ -111,9 +119,9 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False):
             except Exception as e:
                 if key == "overflowing_tokens":
                     raise ValueError(
-                        "Unable to create tensor returning overflowing tokens of different lengths. "
-                        "Please see if a fast version of this tokenizer is available to have this "
-                        "feature available."
+                        "Unable to create tensor returning overflowing tokens of different "
+                        "lengths. Please see if a fast version of this tokenizer is "
+                        "available to have this feature available."
                     ) from e
                 raise ValueError(
                     "Unable to create tensor, you should probably activate truncation and/or "
diff --git a/requirements.txt b/requirements.txt
index dbad4c6fa..ad2675928 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ wget
 hydra-core
 nltk
 numpy
-omegaconf==2.1.0
+omegaconf==2.1.2
 Pygments
 PyYAML
 jieba
@@ -27,3 +27,4 @@ black==21.4b2
 autoflake
 tensorboardX<=2.5.1
 pytest
+safetensors
diff --git a/setup.py b/setup.py
index bee065fcc..1fb5a2829 100644
--- a/setup.py
+++ b/setup.py
@@ -120,7 +120,7 @@ def get_libai_configs() -> List[str]:
             "hydra-core",
             "nltk",
             "numpy",
-            "omegaconf==2.1.0",
+            "omegaconf==2.1.2",
             "Pygments",
             "PyYAML",
             "jieba",
@@ -140,6 +140,7 @@ def get_libai_configs() -> List[str]:
             "autoflake",
             "tensorboardX<=2.5.1",
             "pytest",
+            "safetensors",
         ],
         packages=find_packages(),
         package_data={"libai.config": get_libai_configs()},
diff --git a/tools/train_net.py b/tools/train_net.py
index 458849c75..eb63bc64b 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -36,7 +36,8 @@ def main(args):
 
     seed_for_rank = cfg.train.seed + flow.env.get_rank()
     flow.manual_seed(seed_for_rank)
-    flow.cuda.manual_seed(seed_for_rank)
+    if flow.cuda.is_available():
+        flow.cuda.manual_seed(seed_for_rank)
     np.random.seed(seed_for_rank)
     random.seed(seed_for_rank)
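
The availability guard keeps seeding from failing on CUDA-less backends such as npu/xpu. The full per-rank seeding sequence, condensed into one illustrative helper:

```python
import random

import numpy as np
import oneflow as flow


def seed_all_ranks(base_seed):
    # Offset by rank so each process gets a distinct but reproducible stream.
    seed_for_rank = base_seed + flow.env.get_rank()
    flow.manual_seed(seed_for_rank)
    if flow.cuda.is_available():  # only touch the CUDA generator when CUDA is present
        flow.cuda.manual_seed(seed_for_rank)
    np.random.seed(seed_for_rank)
    random.seed(seed_for_rank)
```
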