From f3bd8e681f8bf70141012c5fe621416a36edca46 Mon Sep 17 00:00:00 2001 From: void-main Date: Sun, 23 Apr 2023 10:27:34 +0000 Subject: [PATCH 01/27] get llama coded --- examples/cpp/CMakeLists.txt | 2 + examples/cpp/llama/CMakeLists.txt | 22 + examples/cpp/llama/bad_words.csv | 2 + examples/cpp/llama/check_with_huggingface.py | 15 + .../cpp/llama/huggingface_llama_convert.py | 187 +++ examples/cpp/llama/llama_config.ini | 32 + examples/cpp/llama/llama_example.cc | 503 +++++++ examples/cpp/llama/llama_triton_example.cc | 457 +++++++ examples/cpp/llama/model_config.json | 1 + examples/cpp/llama/start_ids.csv | 8 + examples/cpp/llama/stop_words.csv | 2 + src/fastertransformer/models/CMakeLists.txt | 2 + .../models/llama/CMakeLists.txt | 69 + src/fastertransformer/models/llama/Llama.cc | 1211 +++++++++++++++++ src/fastertransformer/models/llama/Llama.h | 218 +++ .../models/llama/LlamaContextDecoder.cc | 504 +++++++ .../models/llama/LlamaContextDecoder.h | 117 ++ .../models/llama/LlamaDecoder.cc | 381 ++++++ .../models/llama/LlamaDecoder.h | 104 ++ .../models/llama/LlamaDecoderLayerWeight.cc | 225 +++ .../models/llama/LlamaDecoderLayerWeight.h | 62 + .../models/llama/LlamaWeight.cc | 301 ++++ .../models/llama/LlamaWeight.h | 106 ++ .../triton_backend/CMakeLists.txt | 2 + 24 files changed, 4533 insertions(+) create mode 100644 examples/cpp/llama/CMakeLists.txt create mode 100644 examples/cpp/llama/bad_words.csv create mode 100644 examples/cpp/llama/check_with_huggingface.py create mode 100644 examples/cpp/llama/huggingface_llama_convert.py create mode 100644 examples/cpp/llama/llama_config.ini create mode 100644 examples/cpp/llama/llama_example.cc create mode 100644 examples/cpp/llama/llama_triton_example.cc create mode 100644 examples/cpp/llama/model_config.json create mode 100644 examples/cpp/llama/start_ids.csv create mode 100644 examples/cpp/llama/stop_words.csv create mode 100644 src/fastertransformer/models/llama/CMakeLists.txt create mode 100644 src/fastertransformer/models/llama/Llama.cc create mode 100644 src/fastertransformer/models/llama/Llama.h create mode 100644 src/fastertransformer/models/llama/LlamaContextDecoder.cc create mode 100644 src/fastertransformer/models/llama/LlamaContextDecoder.h create mode 100644 src/fastertransformer/models/llama/LlamaDecoder.cc create mode 100644 src/fastertransformer/models/llama/LlamaDecoder.h create mode 100644 src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc create mode 100644 src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h create mode 100644 src/fastertransformer/models/llama/LlamaWeight.cc create mode 100644 src/fastertransformer/models/llama/LlamaWeight.h diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index da24d72c6..64da9d2e7 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -28,6 +28,8 @@ add_subdirectory(gptj) add_subdirectory(gptneox) add_subdirectory(multi_gpu_gpt) +add_subdirectory(llama) + if(ENABLE_FP8) add_subdirectory(gpt_fp8) add_subdirectory(bert_fp8) diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt new file mode 100644 index 000000000..0495d3bf2 --- /dev/null +++ b/examples/cpp/llama/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(llama_example llama_example.cc) +target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart + Llama nvtx_utils gpt_example_utils word_list mpi_utils nccl_utils) + +# add_executable(llama_triton_example llama_triton_example.cc) +# target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread +# LlamaTritonBackend TransformerTritonBackend custom_ar_comm +# gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils) diff --git a/examples/cpp/llama/bad_words.csv b/examples/cpp/llama/bad_words.csv new file mode 100644 index 000000000..6a1126ebd --- /dev/null +++ b/examples/cpp/llama/bad_words.csv @@ -0,0 +1,2 @@ +7768,3908 +1,2 diff --git a/examples/cpp/llama/check_with_huggingface.py b/examples/cpp/llama/check_with_huggingface.py new file mode 100644 index 000000000..0ba69036e --- /dev/null +++ b/examples/cpp/llama/check_with_huggingface.py @@ -0,0 +1,15 @@ +import transformers +import torch + +from transformers import LlamaForCausalLM, LlamaTokenizer + +tokenizer = LlamaTokenizer.from_pretrained('/data/llama-7b-hf') +prompt = "Hey" +inputs = tokenizer(prompt, return_tensors='pt') +print(inputs) + +model = LlamaForCausalLM.from_pretrained("/data/llama-7b-hf") +generated_ids = model.generate(inputs.input_ids, max_length=10) +print(generated_ids) +output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +print(output) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py new file mode 100644 index 000000000..28a552a28 --- /dev/null +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
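+
+# Example invocation (illustrative paths and names, based on the defaults used
+# elsewhere in this patch; adjust them to your own checkpoint layout):
+#
+#   python huggingface_llama_convert.py -saved_dir /data/llama-7b-hf-converted \
+#       -in_file /data/llama-7b-hf -infer_gpu_num 1 \
+#       -weight_data_type fp16 -model_name llama_7b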
+ +import argparse +import configparser +import numpy as np +from pathlib import Path + +import os +from transformers import LlamaForCausalLM + +def get_weight_data_type(data_type): + if data_type == "fp32": + return np.float32 + elif data_type == "fp16": + return np.float16 + else: + assert False, f"Invalid weight data type {data_type}" + + +def split_and_convert_process(saved_dir, factor, key, val): + if key.find("input_layernorm.weight") != -1 or key.find("post_attention_layernorm.weight") != -1: + # shared weights, only need to convert the weights of rank 0 + saved_path = saved_dir + "/" + key + ".bin" + val.tofile(saved_path) + elif key.find("attention.dense.weight") != -1 or key.find("mlp.down_proj.weight") != -1: + split_vals = np.split(val, factor, axis=0) + for j in range(factor): + saved_path = saved_dir + "/" + key + ".%d.bin" % j + split_vals[j].tofile(saved_path) + elif key.find("mlp.gate_proj.weight") != -1 or key.find("mlp.up_proj.weight") != -1: + split_vals = np.split(val, factor, axis=-1) + for j in range(factor): + saved_path = saved_dir + "/" + key + ".%d.bin" % j + split_vals[j].tofile(saved_path) + elif key.find("attention.query_key_value.weight") != -1: + split_vals = np.split(val, factor, axis=-1) + for j in range(factor): + saved_path = saved_dir + "/" + key + ".%d.bin" % j + split_vals[j].tofile(saved_path) + else: + print("[ERROR] cannot find key '{}'".format(key)) + +def split_and_convert(args): + saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num + + if(os.path.exists(saved_dir) == False): + os.makedirs(saved_dir) + + t_gpu_num = args.trained_gpu_num + i_gpu_num = args.infer_gpu_num + assert(i_gpu_num % t_gpu_num == 0) + + factor = (int)(i_gpu_num / t_gpu_num) + + # load position_embedding from rank 0 + # model = torch.load(ckpt_name) + model = LlamaForCausalLM.from_pretrained(args.in_file) + hf_config = vars(model.config) + print(f"hf_config: {hf_config}") + + print("named parameters:") + for name, param in model.named_parameters(): + print(f"- {name}") + + hidden_size = hf_config["hidden_size"] + head_num = hf_config["num_attention_heads"] + head_size = hidden_size // head_num + num_layers = hf_config["num_hidden_layers"] + + + np_weight_data_type = get_weight_data_type(args.weight_data_type) + + try: + model_name = args.model_name + config = configparser.ConfigParser() + config['llama'] = {} + config['llama']['model_name'] = model_name + config['llama']["head_num"] = str(head_num) + config['llama']["size_per_head"] = str(head_size) + config['llama']["inter_size"] = str(hf_config["intermediate_size"]) + config['llama']["num_layer"] = str(num_layers) + config['llama']["rotary_embedding"] = str(head_size) + config['llama']["vocab_size"] = str(hf_config["vocab_size"]) + config['llama']["start_id"] = str(hf_config["bos_token_id"]) + config['llama']["end_id"] = str(hf_config["eos_token_id"]) + config['llama']["weight_data_type"] = args.weight_data_type + + with open((Path(saved_dir) / f"config.ini").as_posix(), 'w') as configfile: + config.write(configfile) + except Exception as e: + print(f"Fail to save the config in config.ini.") + print(e) + + param_to_weights = lambda param: param.detach().cpu().numpy().astype(np_weight_data_type) + + # layer-wise weights, example: + # - model.layers.0.self_attn.q_proj.weight + # - model.layers.0.self_attn.k_proj.weight + # - model.layers.0.self_attn.v_proj.weight + # - model.layers.0.self_attn.o_proj.weight + # - model.layers.0.mlp.gate_proj.weight + # - model.layers.0.mlp.down_proj.weight + # - 
model.layers.0.mlp.up_proj.weight + # - model.layers.0.input_layernorm.weight + # - model.layers.0.post_attention_layernorm.weight + for l in range(num_layers): + print(f"converting layer {l}") + # first merge QKV into a single weight + # concat direct to FT shape: [hidden_size, 3, head_num, head_size] + qkv_weights = np.empty((hidden_size, 3, head_num, head_size), dtype=np_weight_data_type) + q_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight']) + k_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight']) + v_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight']) + qkv_weights[:, 0, :, :] = q_weight.reshape(hidden_size, head_num, head_size) + qkv_weights[:, 1, :, :] = k_weight.reshape(hidden_size, head_num, head_size) + qkv_weights[:, 2, :, :] = v_weight.reshape(hidden_size, head_num, head_size) + qkv_weights_base_name = f'model.layers.{l}.attention.query_key_value.weight' + split_and_convert_process(saved_dir, factor, qkv_weights_base_name, qkv_weights) + + # attention dense + o_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']) + o_weight_base_name = f'model.layers.{l}.attention.dense.weight' + split_and_convert_process(saved_dir, factor, o_weight_base_name, o_weight) + + # MLP + mlp_down_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']) + mlp_down_base_name = f'model.layers.{l}.mlp.down_proj.weight' + split_and_convert_process(saved_dir, factor, mlp_down_base_name, mlp_down_weight) + + mlp_gate_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']) + mlp_gate_base_name = f'model.layers.{l}.mlp.gate_proj.weight' + split_and_convert_process(saved_dir, factor, mlp_gate_base_name, mlp_gate_weight) + + mlp_up_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']) + mlp_up_base_name = f'model.layers.{l}.mlp.up_proj.weight' + split_and_convert_process(saved_dir, factor, mlp_up_base_name, mlp_up_weight) + + # LayerNorm + input_ln_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.input_layernorm.weight']) + input_ln_base_name = f'model.layers.{l}.input_layernorm.weight' + split_and_convert_process(saved_dir, factor, input_ln_base_name, input_ln_weight) + + post_attn_ln_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.post_attention_layernorm.weight']) + post_attn_ln_base_name = f'model.layers.{l}.post_attention_layernorm.weight' + split_and_convert_process(saved_dir, factor, post_attn_ln_base_name, post_attn_ln_weight) + + print(f"done layer {l}") + + + # final common weights + for name, param in model.named_parameters(): + if name == 'model.embed_tokens.weight': + param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.wte.weight.bin") + elif name == 'model.norm.weight': + param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.final_layernorm.weight.bin") + elif name == 'lm_head.weight': + param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "model.lm_head.weight.bin") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('-saved_dir', '-o', type=str, help='file name of output file', required=True) + parser.add_argument('-in_file', '-i', type=str, help='file name of input checkpoint file', required=True) + parser.add_argument('-trained_gpu_num', '-t_g', 
type=int, help='How many gpus for training', default=1)
+    parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True)
+    parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"])
+    parser.add_argument('-model_name', '-m_n', type=str, help='model name', required=True)
+
+    args = parser.parse_args()
+    print("\n=============== Argument ===============")
+    for key in vars(args):
+        print("{}: {}".format(key, vars(args)[key]))
+    print("========================================")
+
+    split_and_convert(args)
diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
new file mode 100644
index 000000000..5882e03a4
--- /dev/null
+++ b/examples/cpp/llama/llama_config.ini
@@ -0,0 +1,32 @@
+[ft_instance_hyperparameter]
+data_type=fp16
+enable_custom_all_reduce=0
+
+tensor_para_size=1
+pipeline_para_size=1
+
+model_name=llama_7b
+model_dir=/data/llama-7b-hf-converted
+
+[request]
+beam_width=1 # beam width for beam search
+top_k=1 ; k value for top k sampling
+top_p=0.0 ; p value for top p sampling
+temperature=1.0 ; Use for sampling
+repetition_penalty=1.0 ; Use for sampling
+presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
+len_penalty=0.0
+beam_search_diversity_rate=0.0
+request_batch_size=8 # determine by the request
+request_output_len=32 # determine by the request
+
+[llama_7b]
+head_num = 32
+size_per_head = 128
+inter_size = 11008
+num_layer = 32
+rotary_embedding = 128
+vocab_size = 32000
+start_id = 0
+end_id = 1
+weight_data_type = fp16
diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
new file mode 100644
index 000000000..c72a7a8b7
--- /dev/null
+++ b/examples/cpp/llama/llama_example.cc
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "3rdparty/INIReader.h" +#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" + +#include +#include +#include +#include +#include +#include + +using namespace fastertransformer; + +template +void llama_example(const INIReader reader); + +int main(int argc, char* argv[]) +{ + mpi::initialize(&argc, &argv); + srand(0); + + std::string ini_name; + if (argc == 2) { + ini_name = std::string(argv[1]); + } + else { + ini_name = "../examples/cpp/llama/llama_config.ini"; + } + + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + return -1; + } + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + + if (data_type == "fp32") { + llama_example(reader); + } + else if (data_type == "fp16") { + llama_example(reader); + } + else { + FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); + return -1; + } + mpi::finalize(); + return 0; +} + +template +void llama_example(const INIReader reader) +{ + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + + int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + + const size_t head_num = reader.GetInteger(model_name, "head_num"); + const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); + const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); + const size_t decoder_layers = reader.GetInteger(model_name, "num_layer"); + const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const int start_id = reader.GetInteger(model_name, "start_id"); + const int end_id = reader.GetInteger(model_name, "end_id"); + + const size_t hidden_units = head_num * size_per_head; + const size_t inter_size = reader.GetInteger(model_name, "inter_size"); + + const size_t beam_width = reader.GetInteger("request", "beam_width"); + const uint top_k = (uint)reader.GetInteger("request", "top_k"); + const float top_p = reader.GetFloat("request", "top_p"); + const float temperature = reader.GetFloat("request", "temperature"); + const float repetition_penalty = reader.GetFloat("request", "repetition_penalty", 1.0f); + const float presence_penalty = reader.GetFloat("request", "presence_penalty", 0.0f); + const float len_penalty = reader.GetFloat("request", "len_penalty"); + const float beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); + const int min_length = reader.GetInteger("request", "min_length", 0); + const size_t request_batch_size = 1; // reader.GetInteger("request", "request_batch_size"); + // The length of tokens we hope this model to generate + const int request_output_len = reader.GetInteger("request", "request_output_len"); + + FT_CHECK(head_num % tensor_para_size == 0); + FT_CHECK(decoder_layers % pipeline_para_size == 0); + FT_CHECK_WITH_INFO( + repetition_penalty == 1.0f || presence_penalty == 0.0f, + fmtstr("Found ambiguous parameters repetition_penalty (%f) and presence_penalty (%f) " + "which are mutually 
exclusive. Please remove one of repetition_penalty or presence_penalty " + "or set to a default value.", + repetition_penalty, + presence_penalty)); + + // Prepare the parallelism parameters + int rank = mpi::getCommWorldRank(); + int world_size = mpi::getCommWorldSize(); + if (rank == 0) { + printf("Total ranks: %d.\n", world_size); + } + int device, device_count; + check_cuda_error(cudaGetDeviceCount(&device_count)); + check_cuda_error(cudaSetDevice(rank % device_count)); + check_cuda_error(cudaGetDevice(&device)); + + struct cudaDeviceProp prop; + check_cuda_error(cudaGetDeviceProperties(&prop, device)); + printf("Device %s\n", prop.name); + + printf("P%d is running with GPU #%d.\n", rank, device); + if (tensor_para_size * pipeline_para_size != world_size) { + if (world_size % pipeline_para_size) { + printf("[ERROR] tensor_para_size * pipeline_para_size should equal to world_size \n"); + exit(-1); + } + tensor_para_size = world_size / pipeline_para_size; + printf("[INFO] Setting tensor_para_size to %d \n", tensor_para_size); + } + + const int layers_per_group = decoder_layers / pipeline_para_size; + if (layers_per_group * pipeline_para_size != (int)decoder_layers) { + printf("[ERROR] layers_per_group (%d) * pipeline_para_size (%d) should equal to decoder_layers (%ld) \n", + layers_per_group, + pipeline_para_size, + decoder_layers); + exit(-1); + } + + // assume gpu_num = k * n, + // tensor parallelism group size is n + // pipeline parallelism group size is k + NcclParam tensor_para; + NcclParam pipeline_para; + ftNcclInitialize(tensor_para, pipeline_para, tensor_para_size, pipeline_para_size); + + // Handle bad_words dictionary + std::vector bad_words; + read_word_list("../examples/cpp/llama/bad_words.csv", bad_words); + + int* d_bad_words = nullptr; + deviceMalloc(&d_bad_words, bad_words.size(), false); + cudaH2Dcpy(d_bad_words, bad_words.data(), bad_words.size()); + + // Handle stop_words dictionary + std::vector stop_words; + read_word_list("../examples/cpp/llama/stop_words.csv", stop_words); + + const size_t stop_words_len = stop_words.size() / 2; + // Tile with same dict for each element + std::vector tiled_stop_words; + for (int i = 0; i < request_batch_size; i++) { + tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); + } + + int* d_stop_words = nullptr; + deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); + cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); + + // Read ids of request from file. + size_t max_input_len = -1; + std::vector v_start_lengths; + std::vector v_start_ids; + read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + end_id, + 1, + "../examples/cpp/llama/start_ids.csv"); + + int* d_input_ids; + int* d_input_lengths; + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + } + else { + // conditional case. 
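+        // Copy the tokenized prompts (request_batch_size rows of max_input_len ids)
+        // and their per-sequence lengths from the host vectors onto the GPU.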
+ deviceMalloc(&d_input_ids, request_batch_size * max_input_len, false); + deviceMalloc(&d_input_lengths, request_batch_size, false); + cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); + cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); + } + std::vector start_ids(request_batch_size, start_id); + std::vector end_ids(request_batch_size, end_id); + + // Prompt Learning Configurations + // NOTE: if you don't need prefix prompts, remember to set max_prefix_len to 0 and others to nullptr + int prompt_learning_start_id = reader.GetInteger(model_name, "prompt_learning_start_id", end_id + 1); + fastertransformer::PromptLearningType prompt_learning_type = + static_cast(reader.GetInteger(model_name, "prompt_learning_type", 0)); + + // NOTE: specify task names, take name id, prompt length in order to load those prompt learning tables. + // NOTE: Please make sure task ids are continuous and start from 0 + // for example: + // std::map> prefix_prompt_table_pair{{"no_prompt", {0, 0}}, + // {"prompt_1", {1, 1}}, + // {"prompt_2", {2, 2}}, + // {"prompt_3", {3, 3}}, + // {"prompt_4", {4, 4}}, + // {"prompt_5", {5, 5}}}; + + std::map> prefix_prompt_table_pair; + + // NOTE: get prompt table pairs from configuration files + const int num_tasks = reader.GetInteger(model_name, "num_tasks", 0); + for (int task_name_id = 0; task_name_id < num_tasks; task_name_id++) { + std::string config_task_name = model_name + "_task_" + std::to_string(task_name_id); + std::string task_name = reader.Get(config_task_name, "task_name"); + const int prompt_length = reader.GetInteger(config_task_name, "prompt_length", 0); + prefix_prompt_table_pair.insert({task_name, {task_name_id, prompt_length}}); + } + + // NOTE: task_name_ids for each sequence in one batch + // Each sequence can have different prompt learning task ids + std::vector prefix_prompt_task_ids(request_batch_size, 0); + + // Set different task ids + for (int i = 0; i < request_batch_size; i++) { + prefix_prompt_task_ids[i] = (num_tasks > 0) ? 
i % num_tasks : 0; + } + + const int total_output_len = max_input_len + request_output_len; + + cudaStream_t stream; + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + cudaStreamCreate(&stream); + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + cublasAlgoMap* cublas_algo_map = new cublasAlgoMap("gemm_config.in"); + + Allocator allocator(getDevice()); + + std::mutex* cublas_wrapper_mutex = new std::mutex(); + cublasMMWrapper cublas_wrapper = + cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); + fastertransformer::LlamaWeight gpt_weights(hidden_units, + inter_size, + vocab_size, + decoder_layers, + 0, // max_seq_len, deprecated + tensor_para.world_size_, + tensor_para.rank_, + pipeline_para.world_size_, + pipeline_para.rank_, + use_gptj_residual, + prompt_learning_type, + prefix_prompt_table_pair); + + model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; + gpt_weights.loadModel(model_dir); + unsigned long long random_seed; + if (rank == 0) { + random_seed = (unsigned long long)(0); + } + if (world_size > 1) { + mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); + } + + AttentionType attention_type = getAttentionType(size_per_head, + getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + Llama gpt = Llama(head_num, + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + start_id, + end_id, + prompt_learning_start_id, + prompt_learning_type, + use_gptj_residual, + 0.0f, + top_k, + top_p, + random_seed, + temperature, + len_penalty, + repetition_penalty, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, + &prop, + attention_type); + + int* d_output_ids; + int* d_sequence_lengths; + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); + deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + std::vector output_seq_len(request_batch_size, total_output_len); + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, d_input_ids}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size}, d_input_lengths}}, + // NOTE: if you need prefix prompts, remember to add prefix_prompt_task_ids here + // {"prompt_learning_task_name_ids", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, + // prefix_prompt_task_ids.data()}}, + {"output_seq_len", + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size}, output_seq_len.data()}}, + {"bad_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {2, bad_words.size() / 2}, d_bad_words}}, + {"stop_words_list", Tensor{MEMORY_GPU, TYPE_INT32, {request_batch_size, 2, stop_words_len}, d_stop_words}}, + {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &temperature}}, + {"len_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &len_penalty}}, + {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{1}, 
&min_length}}, + {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, start_ids.data()}}, + {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, end_ids.data()}}}; + + if (repetition_penalty != 1.0f) { + input_tensors.insert( + {"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &repetition_penalty}}); + } + if (presence_penalty != 0.0f) { + input_tensors.insert( + {"presence_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &presence_penalty}}); + } + + if (num_tasks > 0) { + // Prefix Prompt Task Name Ids here + input_tensors.insert( + {"prompt_learning_task_name_ids", + Tensor{MEMORY_CPU, TYPE_INT32, std::vector{request_batch_size}, prefix_prompt_task_ids.data()}}); + } + + if (top_k == 0 && top_p == 0.0f) { + FT_CHECK(beam_width > 1); + input_tensors.insert({"beam_search_diversity_rate", + Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &beam_search_diversity_rate}}); + } + else { + input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector{1}, &random_seed}}); + if (top_p != 0.0f) { + input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector{1}, &top_p}}); + } + if (top_k != 0) { + input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &top_k}}); + } + } + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + Tensor{MEMORY_GPU, + TYPE_INT32, + std::vector{request_batch_size, beam_width, (size_t)total_output_len}, + d_output_ids}}, + {"sequence_length", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths}}, + {"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{(size_t)request_output_len, request_batch_size, beam_width}, + nullptr}}}; + + print_mem_usage(); + + int ite = 1; + cudaDeviceSynchronize(); + mpi::barrier(); + + cudaProfilerStart(); + // warm up + ite = 1; + ft_nvtx::setScope("warmup_time"); + PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { + gpt.forward(&output_tensors, &input_tensors, &gpt_weights); + } + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + + if (rank == 0) { + + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = total_output_len * request_batch_size * beam_width; + int* hBuf = new int[outCount]; + cudaD2Hcpy(hBuf, d_output_ids, outCount); + + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + for (size_t i = 0; i < outCount; i++) { + if (hBuf[i] == int(0)) { + zeroCount++; + } + outFile << hBuf[i] << " "; + if ((i + 1) % (total_output_len) == 0) { + outFile << std::endl; + } + + if (i < 10) { + printf("%5d ", hBuf[i]); + } + if ((i + 1) % (total_output_len) == 0 && i < 10) { + std::cout << std::endl; + } + } + std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; + } + delete[] hBuf; + } + } + + // test time + struct timeval start, end; + mpi::barrier(); + cudaDeviceSynchronize(); + gettimeofday(&start, NULL); + + ft_nvtx::setScope("total_time"); + PUSH_RANGE("total time") + for (int i = 0; i < ite; ++i) { + gpt.forward(&output_tensors, &input_tensors, &gpt_weights); + } + + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + gettimeofday(&end, NULL); + + cudaProfilerStop(); + + printf("[INFO] request_batch_size %ld beam_width %ld head_num %ld 
size_per_head %ld total_output_len %d" + " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", + request_batch_size, + beam_width, + head_num, + size_per_head, + total_output_len, + decoder_layers, + vocab_size, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ftNcclParamDestroy(tensor_para); + ftNcclParamDestroy(pipeline_para); + + delete cublas_algo_map; + delete cublas_wrapper_mutex; + + cudaFree(d_bad_words); + cudaFree(d_stop_words); + if (d_input_ids != nullptr) { + cudaFree(d_input_ids); + } + if (d_input_lengths != nullptr) { + cudaFree(d_input_lengths); + } + if (d_output_ids != nullptr) { + deviceFree(d_output_ids); + } + if (d_sequence_lengths != nullptr) { + deviceFree(d_sequence_lengths); + } + + return; +} diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc new file mode 100644 index 000000000..1840035a2 --- /dev/null +++ b/examples/cpp/llama/llama_triton_example.cc @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "3rdparty/INIReader.h" +#include "examples/cpp/multi_gpu_gpt/gpt_example_utils.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" + +#include +#include + +namespace ft = fastertransformer; + +struct RequestParam { + int beam_width; + int request_output_len; + float beam_search_diversity_rate; + uint runtime_top_k; + float runtime_top_p; + float temperature; + float len_penalty; + float repetition_penalty; + float presence_penalty; + int min_length; + unsigned long long int random_seed; + int start_id; + int end_id; +}; + +std::vector>> +broadCastRequest(const std::vector& v_start_ids, + const std::vector& v_start_lengths, + const std::vector& v_bad_words, + const int node_id, + const int gpu_count, + const RequestParam param, + std::vector* pointer_record) +{ + // broadcast the request to all nodes, and copy "gpu_count" copies on different gpu + int size_1 = v_start_ids.size(); + int size_2 = v_start_lengths.size(); + int size_bad_words = v_bad_words.size(); + ft::mpi::bcast(&size_1, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(&size_2, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(&size_bad_words, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + + std::vector v_input_ids(size_1); + std::vector v_input_lengths(size_2); + std::vector v_input_bad_words(size_bad_words); + + if (node_id == 0) { + memcpy(v_input_ids.data(), v_start_ids.data(), size_1 * sizeof(int)); + memcpy(v_input_lengths.data(), v_start_lengths.data(), 
size_2 * sizeof(int)); + memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int)); + } + ft::mpi::barrier(); + + int request_batch_size = size_2; + int max_input_len = size_1 / size_2; + + ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(v_input_lengths.data(), size_2, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(v_input_bad_words.data(), size_bad_words, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + + std::vector>> request_list; + for (int device_id = 0; device_id < gpu_count; device_id++) { + ft::check_cuda_error(cudaSetDevice(device_id)); + + int* d_input_ids; + int* d_input_lengths; + int* d_input_bad_words; + + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + max_input_len = 0; + } + else { + // conditional case. + ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_lengths, size_2, false); + ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); + ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); + } + ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); + + uint32_t* request_output_len_ptr = (uint32_t*)malloc(request_batch_size * sizeof(uint32_t)); + for (int i = 0; i < request_batch_size; i++) { + request_output_len_ptr[i] = param.request_output_len; + } + + int* start_ids_ptr = (int*)malloc(request_batch_size * sizeof(int)); + int* end_ids_ptr = (int*)malloc(request_batch_size * sizeof(int)); + for (int i = 0; i < request_batch_size; i++) { + start_ids_ptr[i] = param.start_id; + end_ids_ptr[i] = param.end_id; + } + pointer_record->push_back(start_ids_ptr); + pointer_record->push_back(end_ids_ptr); + + std::shared_ptr> input_tensors( + new std::unordered_map{ + {"input_ids", + triton::Tensor{triton::MEMORY_GPU, + triton::TYPE_INT32, + std::vector{(size_t)request_batch_size, (size_t)max_input_len}, + d_input_ids}}, + {"input_lengths", + triton::Tensor{triton::MEMORY_GPU, + triton::TYPE_INT32, + std::vector{(size_t)request_batch_size}, + d_input_lengths}}, + // NOTE: add prefix prompt task ids here if you need + // {"prefix_prompt_task_ids", triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, + // std::vector{request_batch_size}, task_name_ids}}, + {"request_output_len", + triton::Tensor{triton::MEMORY_CPU, + triton::TYPE_UINT32, + std::vector{(size_t)request_batch_size}, + request_output_len_ptr}}, + {"start_id", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}}, + {"end_id", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}}); + if (!v_input_bad_words.empty()) { + input_tensors->insert( + {"bad_words_list", + triton::Tensor{ + triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}}); + } + request_list.push_back(input_tensors); + + int* beam_width_ptr = new int(param.beam_width); + pointer_record->push_back(beam_width_ptr); + request_list[device_id]->insert( + {"beam_width", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, beam_width_ptr}}); + if (param.beam_width > 1) { + float* beam_search_diversity_rate_ptr = new float(param.beam_search_diversity_rate); + pointer_record->push_back(beam_search_diversity_rate_ptr); + request_list[device_id]->insert( + {"beam_search_diversity_rate", + 
triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, beam_search_diversity_rate_ptr}}); + } + else { + if (param.runtime_top_p != 0.0f) { + float* runtime_top_p_ptr = new float(param.runtime_top_p); + pointer_record->push_back(runtime_top_p_ptr); + request_list[device_id]->insert( + {"runtime_top_p", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, runtime_top_p_ptr}}); + } + if (param.runtime_top_k != 0) { + uint* runtime_top_k_ptr = new uint(param.runtime_top_k); + pointer_record->push_back(runtime_top_k_ptr); + request_list[device_id]->insert( + {"runtime_top_k", + triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector{1}, runtime_top_k_ptr}}); + } + } + float* temperature_ptr = new float(param.temperature); + pointer_record->push_back(temperature_ptr); + request_list[device_id]->insert( + {"temperature", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, temperature_ptr}}); + float* len_penalty_ptr = new float(param.len_penalty); + pointer_record->push_back(len_penalty_ptr); + request_list[device_id]->insert( + {"len_penalty", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, len_penalty_ptr}}); + if (param.repetition_penalty != 1.0f) { + float* repetition_penalty_ptr = new float(param.repetition_penalty); + pointer_record->push_back(repetition_penalty_ptr); + request_list[device_id]->insert( + {"repetition_penalty", + triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, repetition_penalty_ptr}}); + } + if (param.presence_penalty != 0.0f) { + float* presence_penalty_ptr = new float(param.presence_penalty); + pointer_record->push_back(presence_penalty_ptr); + request_list[device_id]->insert( + {"presence_penalty", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, presence_penalty_ptr}}); + } + int* min_length_ptr = new int(param.min_length); + pointer_record->push_back(min_length_ptr); + request_list[device_id]->insert( + {"min_length", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, min_length_ptr}}); + unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed); + pointer_record->push_back(random_seed_ptr); + request_list[device_id]->insert( + {"random_seed", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector{1}, random_seed_ptr}}); + + pointer_record->push_back(d_input_ids); + pointer_record->push_back(d_input_lengths); + pointer_record->push_back(d_input_bad_words); + pointer_record->push_back(request_output_len_ptr); + } + + return request_list; +} + +std::vector>> +prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record) +{ + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + ft::FT_CHECK(false); + } + + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + const int start_id = reader.GetInteger(model_name, "start_id"); + const int end_id = reader.GetInteger(model_name, "end_id"); + + std::vector v_start_ids; + std::vector v_start_lengths; + + size_t max_input_len = 0; + ft::read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + end_id, + 1, + "../examples/cpp/gptj/start_ids.csv"); + + std::vector v_bad_words; + ft::read_word_list("../examples/cpp/gptj/bad_words.csv", v_bad_words); + 
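+    // Read the per-request generation parameters: sampling / beam-search settings
+    // come from [ft_instance_hyperparameter], the output length from [request].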
+ RequestParam param; + param.beam_width = reader.GetInteger("ft_instance_hyperparameter", "beam_width"); + param.request_output_len = reader.GetInteger("request", "request_output_len"); + param.beam_search_diversity_rate = reader.GetFloat("ft_instance_hyperparameter", "beam_search_diversity_rate"); + param.runtime_top_k = (uint)reader.GetInteger("ft_instance_hyperparameter", "top_k"); + param.runtime_top_p = reader.GetFloat("ft_instance_hyperparameter", "top_p"); + param.temperature = reader.GetFloat("ft_instance_hyperparameter", "temperature"); + param.len_penalty = reader.GetFloat("ft_instance_hyperparameter", "len_penalty"); + param.repetition_penalty = reader.GetFloat("ft_instance_hyperparameter", "repetition_penalty", 1.0f); + param.presence_penalty = reader.GetFloat("ft_instance_hyperparameter", "presence_penalty", 0.0f); + param.min_length = reader.GetInteger("ft_instance_hyperparameter", "min_length", 0); + param.random_seed = (unsigned long long int)0; + param.start_id = start_id; + param.end_id = end_id; + + auto request_list = + broadCastRequest(v_start_ids, v_start_lengths, v_bad_words, node_id, gpu_count, param, pointer_record); + return request_list; +} + +int threadCreateModelInstances(std::shared_ptr model, + std::vector>* model_instances, + const int device_id, + const int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) +{ + printf("[INFO] rank = %d \n", rank); + ft::check_cuda_error(cudaSetDevice(device_id)); + cudaStream_t stream; + ft::check_cuda_error(cudaStreamCreate(&stream)); + model->createSharedWeights(device_id, rank); + auto model_instance = model->createModelInstance(device_id, rank, stream, nccl_params, custom_all_reduce_comm); + model_instances->at(device_id) = std::move(model_instance); + printf("model instance %d is created \n", device_id); + ft::print_mem_usage(); + return 0; +} + +int threadForward(std::unique_ptr* model_instance, + std::shared_ptr> request, + std::shared_ptr>* output_tensors, + const int device_id) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + *output_tensors = (*model_instance)->forward(request); + return 0; +} + +int main(int argc, char* argv[]) +{ + /* + Prepare the nccl ids, node id, device id and world size + by MPI or triton + */ + + ft::mpi::initialize(&argc, &argv); + int node_id = ft::mpi::getCommWorldRank(); + int node_num = ft::mpi::getCommWorldSize(); + + // Note: Only supports that all nodes have same gpu count + const int gpu_count = ft::getDeviceCount(); + const int world_size = node_num * gpu_count; + std::string ini_name = argc >= 2 ? 
std::string(argv[1]) : "../examples/cpp/gptj/gptj_config.ini"; + + // step 1: Create model + std::shared_ptr model = AbstractTransformerModel::createGptJModel(ini_name); + int tensor_para_size = model->getTensorParaSize(); + int pipeline_para_size = model->getPipelineParaSize(); + FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size), + "World Size != Tensor Parallel Size * Pipeline Parallel Size !"); + + std::cout << model->toString(); + + // step 2: Initialize the NCCL + std::pair, std::vector> nccl_params = model->createNcclParams(node_id); + cudaDeviceSynchronize(); + + // Optional Step: create custom all reduce comm + std::vector> custom_all_reduce_comms; + model->createCustomComms(&custom_all_reduce_comms, world_size); + + // step 3: Create model instances + std::vector> model_instances((size_t)gpu_count); + std::vector threads; + for (int device_id = 0; device_id < gpu_count; device_id++) { + const int rank = node_id * gpu_count + device_id; + threads.push_back(std::thread(threadCreateModelInstances, + model, + &model_instances, + device_id, + rank, + nccl_params, + custom_all_reduce_comms[rank])); + } + for (auto& t : threads) { + t.join(); + } + + // step 4: prepare request + std::vector pointer_record; // Used to prevent the pointers are release after leaving functions + std::vector>> request_list = + prepareRequest(ini_name, node_id, gpu_count, &pointer_record); + printf("[INFO] request is created \n"); + + // step 5: Forward + std::vector>> output_tensors_lists( + (size_t)gpu_count); + for (int i = 0; i < 2; i++) { + threads.clear(); + for (int device_id = 0; device_id < gpu_count; device_id++) { + threads.push_back(std::thread(threadForward, + &model_instances[device_id], + request_list[device_id], + &output_tensors_lists[device_id], + device_id)); + } + for (auto& t : threads) { + t.join(); + } + } + printf("[INFO] forward is completed. 
\n"); + + const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data; + const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0]; + const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; + const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; + // step 6: check results + if (node_id == 0) { + + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = batch_size * beam_width * seq_len; + int* hBuf = new int[outCount]; + ft::cudaD2Hcpy(hBuf, d_output_ids, outCount); + + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + for (size_t i = 0; i < outCount; i++) { + if (hBuf[i] == int(0)) { + zeroCount++; + } + outFile << hBuf[i] << " "; + if ((i + 1) % (seq_len) == 0) { + outFile << std::endl; + } + + if (i < 10) { + printf("%5d ", hBuf[i]); + } + if ((i + 1) % (seq_len) == 0 && i < 10) { + std::cout << std::endl; + } + } + std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; + } + delete[] hBuf; + } + } + + // test time + struct timeval start, end; + ft::mpi::barrier(); + cudaDeviceSynchronize(); + gettimeofday(&start, NULL); + + const int ite = 1; + for (int i = 0; i < ite; i++) { + threads.clear(); + for (int device_id = 0; device_id < gpu_count; device_id++) { + threads.push_back(std::thread(threadForward, + &model_instances[device_id], + request_list[device_id], + &output_tensors_lists[device_id], + device_id)); + } + for (auto& t : threads) { + t.join(); + } + } + + cudaDeviceSynchronize(); + ft::mpi::barrier(); + + gettimeofday(&end, NULL); + + printf("[INFO] batch_size %d beam_width %d seq_len %d" + " FT-CPP-GPT-Triton-time %.2f ms\n", + batch_size, + beam_width, + seq_len, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ft::mpi::finalize(); + return 0; +} diff --git a/examples/cpp/llama/model_config.json b/examples/cpp/llama/model_config.json new file mode 100644 index 000000000..70266f26b --- /dev/null +++ b/examples/cpp/llama/model_config.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "max_position_embeddings": 2048, "hidden_size": 4096, "intermediate_size": 11008, "num_hidden_layers": 32, "num_attention_heads": 32, "hidden_act": "silu", "initializer_range": 0.02, "rms_norm_eps": 1e-06, "use_cache": True, "return_dict": True, "output_hidden_states": False, "output_attentions": False, "torchscript": False, "torch_dtype": torch.float16, "use_bfloat16": False, "tf_legacy_loss": False, "pruned_heads": {}, "tie_word_embeddings": False, "is_encoder_decoder": False, "is_decoder": False, "cross_attention_hidden_size": None, "add_cross_attention": False, "tie_encoder_decoder": False, "max_length": 20, "min_length": 0, "do_sample": False, "early_stopping": False, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": None, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": False, "return_dict_in_generate": False, "forced_bos_token_id": None, "forced_eos_token_id": None, "remove_invalid_values": False, "exponential_decay_length_penalty": None, "suppress_tokens": None, "begin_suppress_tokens": None, "architectures": 
["LLaMAForCausalLM"], "finetuning_task": None, "id2label": {0: "LABEL_0", 1: "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": None, "prefix": None, "bos_token_id": 0, "pad_token_id": -1, "eos_token_id": 1, "sep_token_id": None, "decoder_start_token_id": None, "task_specific_params": None, "problem_type": None, "_name_or_path": "/data/llama-7b-hf/", "_commit_hash": None, "transformers_version": "4.27.0.dev0", "max_sequence_length": 2048, "model_type": "llama"} diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv new file mode 100644 index 000000000..35ddd915d --- /dev/null +++ b/examples/cpp/llama/start_ids.csv @@ -0,0 +1,8 @@ +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 +0, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973 diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv new file mode 100644 index 000000000..9b9b09eba --- /dev/null +++ b/examples/cpp/llama/stop_words.csv @@ -0,0 +1,2 @@ +287, 4346, 12 +3, -1, -1 diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index 248b4af3d..d55782717 100644 --- a/src/fastertransformer/models/CMakeLists.txt +++ b/src/fastertransformer/models/CMakeLists.txt @@ -37,3 +37,5 @@ add_subdirectory(vit) add_subdirectory(vit_int8) add_subdirectory(wenet) + +add_subdirectory(llama) diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt new file mode 100644 index 000000000..ec836068d --- /dev/null +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -0,0 +1,69 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.8) + +add_library(LlamaDecoderLayerWeight STATIC LlamaDecoderLayerWeight.cc) +set_property(TARGET LlamaDecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaDecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaDecoderLayerWeight PUBLIC memory_utils cuda_utils logger) + +add_library(LlamaDecoder STATIC LlamaDecoder.cc) +set_property(TARGET LlamaDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaDecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelDecoderSelfAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + LlamaDecoderLayerWeight + tensor + nccl_utils + cuda_utils + logger) + +add_library(LlamaContextDecoder STATIC LlamaContextDecoder.cc) +set_property(TARGET LlamaContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaContextDecoder PUBLIC -lcudart cublasMMWrapper + TensorParallelGptContextAttentionLayer + TensorParallelGeluFfnLayer + layernorm_kernels + add_residual_kernels + gpt_kernels + tensor + nccl_utils + cuda_utils + logger) + +add_library(LlamaWeight STATIC LlamaWeight.cc) +set_property(TARGET LlamaWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LlamaWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LlamaWeight PUBLIC LlamaDecoderLayerWeight cuda_utils logger) + +add_library(Llama STATIC Llama.cc) +set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(Llama PUBLIC -lcudart + LlamaDecoder + LlamaContextDecoder + decoding_kernels + gpt_kernels + DynamicDecodeLayer + BaseBeamSearchLayer + bert_preprocess_kernels + tensor + LlamaWeight + cuda_utils + logger) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc new file mode 100644 index 000000000..0091fce48 --- /dev/null +++ b/src/fastertransformer/models/llama/Llama.cc @@ -0,0 +1,1211 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" +#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" +#include + +namespace fastertransformer { + +template +void Llama::initialize() +{ + gpt_context_decoder_ = new LlamaContextDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + is_context_qk_buf_float_, + attention_type_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + gpt_decoder_ = new LlamaDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + neox_rotary_style_, + use_gptj_residual_, + layernorm_eps_, + tensor_para_, + pipeline_para_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, + vocab_size_padded_, + 0, // end_id, deprecated + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + cuda_device_prop_); +} + +template +void Llama::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void Llama::allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + const size_t batchxbeam = batch_size * beam_width; + const size_t self_cache_size = (num_layer_ / pipeline_para_.world_size_) * batchxbeam * max_cache_seq_len + * hidden_units_ / tensor_para_.world_size_; + + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ = + (T*)(allocator_->reMalloc(padded_embedding_kernel_, sizeof(T) * hidden_units_ * vocab_size_padded_, true)); + padded_embedding_kernel_ptr_ = padded_embedding_kernel_; + + padded_embedding_bias_ = + (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); + } + + input_attention_mask_ = (T*)(allocator_->reMalloc( + input_attention_mask_, sizeof(T) * batchxbeam * max_seq_len * max_cache_seq_len, false)); + decoder_input_buf_ = (T*)(allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + decoder_output_buf_ = + (T*)(allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units_, false)); + logits_buf_ = (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + nccl_logits_buf_ = + (float*)(allocator_->reMalloc(nccl_logits_buf_, sizeof(float) * batchxbeam * vocab_size_padded_, false)); + cum_log_probs_ = (float*)(allocator_->reMalloc(cum_log_probs_, sizeof(float) * batchxbeam, false)); + finished_buf_ = (bool*)(allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false)); + h_finished_buf_ = new bool[batchxbeam]; + sequence_lengths_ = (int*)(allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false)); + + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, true)); + value_cache_ = key_cache_ + self_cache_size; + if (beam_width > 1) { + cache_indirections_[0] = + 
(int*)(allocator_->reMalloc(cache_indirections_[0], sizeof(int) * batchxbeam * max_seq_len * 2, true)); + cache_indirections_[1] = cache_indirections_[0] + batchxbeam * max_seq_len; + } + + // prompt_learning weight batch ptrs + prompt_learning_weight_batch_ = + (const T**)(allocator_->reMalloc(prompt_learning_weight_batch_, sizeof(T*) * batchxbeam, false)); + tiled_prompt_lengths_buf_ = + (int*)(allocator_->reMalloc(tiled_prompt_lengths_buf_, sizeof(int) * batchxbeam, false)); + + tiled_input_ids_buf_ = + (int*)(allocator_->reMalloc(tiled_input_ids_buf_, sizeof(int) * batchxbeam * max_input_len, true)); + tiled_input_lengths_buf_ = (int*)(allocator_->reMalloc(tiled_input_lengths_buf_, sizeof(int) * batchxbeam, true)); + tiled_total_padding_count_ = + (int*)allocator_->reMalloc(tiled_total_padding_count_, batchxbeam * sizeof(int), false); + + transposed_output_ids_buf_ = + (int*)(allocator_->reMalloc(transposed_output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + output_ids_buf_ = (int*)(allocator_->reMalloc(output_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + parent_ids_buf_ = (int*)(allocator_->reMalloc(parent_ids_buf_, sizeof(int) * batchxbeam * max_seq_len, true)); + seq_limit_len_ = (uint32_t*)(allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false)); + masked_tokens_ = (bool*)(allocator_->reMalloc(masked_tokens_, sizeof(bool) * batchxbeam * max_cache_seq_len, true)); + + start_ids_buf_ = (int*)(allocator_->reMalloc(start_ids_buf_, sizeof(int) * batch_size, false)); + end_ids_buf_ = (int*)(allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false)); + + context_decoder_input_buf_ = (T*)(allocator_->reMalloc( + context_decoder_input_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + context_decoder_output_buf_ = (T*)(allocator_->reMalloc( + context_decoder_output_buf_, sizeof(T) * batchxbeam * max_input_len * hidden_units_, false)); + output_log_probs_buf_ = + (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); + + generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + + is_allocate_buffer_ = true; +} + +template +void Llama::freeBuffer() +{ + if (is_allocate_buffer_) { + if (vocab_size_ != vocab_size_padded_) { + padded_embedding_kernel_ptr_ = nullptr; + allocator_->free((void**)(&padded_embedding_kernel_)); + allocator_->free((void**)(&padded_embedding_bias_)); + } + + allocator_->free((void**)(&input_attention_mask_)); + allocator_->free((void**)(&decoder_input_buf_)); + allocator_->free((void**)(&decoder_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&nccl_logits_buf_)); + allocator_->free((void**)(&cum_log_probs_)); + allocator_->free((void**)(&finished_buf_)); + delete[] h_finished_buf_; + allocator_->free((void**)(&sequence_lengths_)); + + allocator_->free((void**)(&key_cache_)); + if (cache_indirections_[0] != nullptr) { + allocator_->free((void**)(&cache_indirections_)[0]); + } + + allocator_->free((void**)(&prompt_learning_weight_batch_)); + allocator_->free((void**)(&tiled_prompt_lengths_buf_)); + + allocator_->free((void**)(&tiled_input_ids_buf_)); + allocator_->free((void**)(&tiled_input_lengths_buf_)); + allocator_->free((void**)(&tiled_total_padding_count_)); + + allocator_->free((void**)(&transposed_output_ids_buf_)); + allocator_->free((void**)(&output_ids_buf_)); + 
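+        // parent_ids_buf_ is only meaningful for beam search (it stays zero-filled when sampling),
+        // but it is allocated and freed unconditionally so that freeBuffer() mirrors allocateBuffer().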
allocator_->free((void**)(&parent_ids_buf_)); + allocator_->free((void**)(&seq_limit_len_)); + allocator_->free((void**)(&masked_tokens_)); + + allocator_->free((void**)(&start_ids_buf_)); + allocator_->free((void**)(&end_ids_buf_)); + + allocator_->free((void**)(&context_decoder_input_buf_)); + allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&output_log_probs_buf_)); + + allocator_->free((void**)(&generation_should_stop_), true); + + is_allocate_buffer_ = false; + } +} + +template +Llama::Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + local_head_num_(head_num / 1), + attention_type_(attention_type) +{ + tensor_para_.world_size_ = 1; + tensor_para_.rank_ = 0; + pipeline_para_.world_size_ = 1; + pipeline_para_.rank_ = 0; + + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +Llama::Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + start_id_(start_id), + end_id_(end_id), + prompt_learning_start_id_(prompt_learning_start_id), + prompt_learning_type_(prompt_learning_type), + use_gptj_residual_(use_gptj_residual), + hidden_units_(head_num * size_per_head), + 
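+    // The NcclParam arguments carry the tensor- and pipeline-parallel process groups; head_num and
+    // num_layer are assumed to be divisible by the respective world sizes.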
tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + local_head_num_(head_num / tensor_para.world_size_), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce), + attention_type_(attention_type) +{ + int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); + if (std::is_same::value) { + local_vacab_size = ceil(local_vacab_size / 8.f) * 8; + } + vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; + initialize(); +} + +template +Llama::Llama(Llama const& gpt): + BaseLayer(gpt), + head_num_(gpt.head_num_), + size_per_head_(gpt.size_per_head_), + inter_size_(gpt.inter_size_), + num_layer_(gpt.num_layer_), + vocab_size_(gpt.vocab_size_), + rotary_embedding_dim_(gpt.rotary_embedding_dim_), + start_id_(gpt.start_id_), + end_id_(gpt.end_id_), + prompt_learning_start_id_(gpt.prompt_learning_start_id_), + prompt_learning_type_(gpt.prompt_learning_type_), + use_gptj_residual_(gpt.use_gptj_residual_), + hidden_units_(gpt.hidden_units_), + tensor_para_(gpt.tensor_para_), + pipeline_para_(gpt.pipeline_para_), + local_head_num_(gpt.local_head_num_), + vocab_size_padded_(gpt.vocab_size_padded_), + custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), + enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), + attention_type_(gpt.attention_type_) +{ + initialize(); +} + +template +Llama::~Llama() +{ + delete gpt_decoder_; + delete dynamic_decode_layer_; + delete gpt_context_decoder_; + freeBuffer(); +} + +template +void Llama::registerCallback(callback_sig* fn, void* ctx) +{ + token_generated_cb_ = fn; + token_generated_ctx_ = ctx; +} + +template +void Llama::unRegisterCallback() +{ + token_generated_cb_ = nullptr; + token_generated_ctx_ = nullptr; +} + +template +void Llama::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LlamaWeight* gpt_weights) +{ + FT_CHECK(false); +} + +template +void Llama::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LlamaWeight* gpt_weights) +{ + // input_tensors: + // input_ids [batch_size, max_input_length] + // input_lengths [batch_size] + // prompt_learning_task_name_ids [batch_size] on cpu, optional + // output_seq_len [batch_size] on cpu + // start_id [batch_size] on cpu, optional + // end_id [batch_size] on cpu, optional + // stop_words_list [batch_size, 2, stop_words_length], optional + // bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional + // runtime_top_k [1] or [batch_size] on cpu, optional, uint. + // runtime_top_p [1] or [batch_size] on cpu, optional, float. + // beam_search_diversity_rate [1] or [batch_size] on cpu, optional, float. + // temperature [1] or [batch_size] on cpu, optional, float. + // len_penalty [1] or [batch_size] on cpu, optional, float. + // repetition_penalty [1] or [batch_size] on cpu, optional, float. + // min_length [1] or [batch_size] on cpu, optional, int + // random_seed [1] or [batch_size] on cpu, optional, unsigned long long int. 
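+    //      Note: the optional sampling tensors above are not interpreted here; they are passed through
+    //      to DynamicDecodeLayer::setup() and forward() unchanged.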
+ // request_prompt_lengths [batch_size], optional + // request_prompt_embedding [batch_size, max_prompt_length, hidden_units], float, optional + // requst_prompt_type [batch_size], int, optional + // top_p_decay [batch_size] on gpu, float, optional + // top_p_min [batch_size] on gpu, float, optional + // top_p_reset_ids [batch_size] on gpu, uint32, optional + + // output_tensors: + // output_ids [batch_size, beam_width, max_output_seq_len] + // sequence_length [batch_size, beam_width] + // output_log_probs [batch_size, beam_width, request_output_seq_len], must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + // cum_log_probs [batch_size, beam], optional, must be float*. + // optional. It leads to additional computing cost. If we don't need this result, don't put it. + + // Step is from max_input_length ~ max_output_seq_len, + // When step = k, we put output ids and caches at step k, and the sequence_length would be k - 1 before + // complete this step. + // When there is no input_ids, put the start token at step 0 of output_ids_buf_. After forward, only copy + // the step 1 ~ max_output_seq_len of output_ids_buf_ to output_tensors->at(0).data + + FT_CHECK_WITH_INFO(input_tensors->size() >= 3, "input_tensors->size() >= 3"); + FT_CHECK_WITH_INFO(output_tensors->size() >= 2, "output_tensors->size() >= 2"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->find("output_seq_len") != input_tensors->end() + && input_tensors->at("output_seq_len").shape.size() == 1); + FT_CHECK(output_tensors->at("output_ids").shape.size() == 3); + FT_CHECK(output_tensors->at("sequence_length").shape.size() == 2); + FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape[0] == output_tensors->at("output_ids").shape[0], + "input_tensors->at(\"input_ids\").shape[0] == output_tensors->at(\"output_ids\").shape[0]"); + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + + PromptLearningType request_prompt_type = PromptLearningType::no_prompt; + int valid_prompt_inputs = input_tensors->count("request_prompt_type") + + input_tensors->count("request_prompt_lengths") + + input_tensors->count("request_prompt_embedding"); + + if (valid_prompt_inputs == 3) { + request_prompt_type = static_cast(input_tensors->at("request_prompt_type").getVal()); + FT_LOG_INFO("Apply prompt embedding from input, will ignore task name ids"); + } + else if (valid_prompt_inputs > 0) { + FT_LOG_WARNING( + "Prompts not applied: request_prompt_embedding, request_prompt_lengths, request_prompt_type are all needed!"); + } + if (request_prompt_type == PromptLearningType::prefix_prompt) { + FT_LOG_WARNING("Request prompt doesn't support prefix prompt currently!"); + } + + // Prefix Prompt Inputs + // Padding works as follows: p p x x i i i x x --> p p i i i x x x x (p denotes prompt, i denotes input, x denotes + // pad) + // TODO (perkzz): move unnecessary paddings + const int* prompt_learning_task_name_ids = + input_tensors->count("prompt_learning_task_name_ids") ? 
+ input_tensors->at("prompt_learning_task_name_ids").getPtr() : + nullptr; + has_prefix_prompt_ = + (prompt_learning_task_name_ids != nullptr) && (prompt_learning_type_ == PromptLearningType::prefix_prompt); + int max_prefix_prompt_length = 0; + + FT_CHECK_WITH_INFO( + !(prompt_learning_task_name_ids != nullptr + && (prompt_learning_type_ == PromptLearningType::no_prompt + || prompt_learning_type_ == PromptLearningType::soft_prompt)), + "prompt_learning_type is prefix_prompt either p_prompt_tuning when prompt_learning_task_name_ids are provided."); + + // NOTE: Prefix Prompt PreProcessing + // get prefix_prompt_weight for each batch --> shape [batch, beam_width] + // --> ptrs with shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + std::vector prefix_prompt_weight_batch_ptrs; + std::vector prefix_prompt_lengths; + if (has_prefix_prompt_) { + for (int bs_id = 0; bs_id < batch_size; ++bs_id) { + int task_id = prompt_learning_task_name_ids[bs_id]; + // throw errors when prompt task_name_ids are not found + std::pair prefix_prompt_weight_length_pair; + try { + prefix_prompt_weight_length_pair = gpt_weights->prompt_learning_table.at(task_id); + } + catch (const std::out_of_range& oor) { + FT_LOG_ERROR("prefix_prompt_weights_lengths not found for prompt task id: " + task_id); + throw oor; + } + for (int bw_id = 0; bw_id < beam_width; ++bw_id) { + prefix_prompt_weight_batch_ptrs.push_back(prefix_prompt_weight_length_pair.first); + prefix_prompt_lengths.push_back(prefix_prompt_weight_length_pair.second); + } + } + + max_prefix_prompt_length = *max_element(prefix_prompt_lengths.begin(), prefix_prompt_lengths.end()); + + FT_LOG_DEBUG("max_prefix_prompt_length: %d", max_prefix_prompt_length); + + if (max_prefix_prompt_length == 0) { + has_prefix_prompt_ = false; + FT_LOG_DEBUG("prompts are not applied !"); + } + } + + int max_input_length = input_tensors->at("input_ids").shape[1]; + FT_CHECK_WITH_INFO(!(max_input_length == 0 && max_prefix_prompt_length > 0), + "Prefix Prompt should come with inputs!"); + + // Prefix Soft Prompt + has_prefix_soft_prompt_ = request_prompt_type == PromptLearningType::soft_prompt; + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + const size_t limit_len_offset = max_prefix_soft_prompt_length + (max_input_length == 0 ? 1 : 0); + const size_t max_output_seq_len = input_tensors->at("output_seq_len").max() + limit_len_offset; + const size_t max_seq_len = max_output_seq_len; + // max cache seq len should include max prefix prompt length as it has k/v states + const size_t max_cache_seq_len = max_output_seq_len + max_prefix_prompt_length; + if (max_cache_seq_len < max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is less than max_seq_len (%d). " + "Note that this reduces the memory cost of k/v cache, but may hurt the accuracy.", + max_cache_seq_len, + max_seq_len); + } + else if (max_cache_seq_len > max_seq_len) { + FT_LOG_WARNING("max_cache_seq_len (%d) is larger than max_seq_len (%d). " + "This may lead to additional memory cost. 
Suggest to use smaller max_cache_seq_len.", + max_cache_seq_len, + max_seq_len); + } + const cudaDataType_t gemm_data_type = getCudaDataType(); + allocateBuffer( + batch_size, beam_width, max_seq_len, max_cache_seq_len, max_input_length + max_prefix_soft_prompt_length); + setSeqLimitLen(seq_limit_len_, input_tensors->at("output_seq_len"), limit_len_offset, batch_size); + + sync_check_cuda_error(); + { + TensorMap input_map(*input_tensors); + dynamic_decode_layer_->setup(batch_size, beam_width, &input_map); + handleOptArg(&input_map, "start_id", start_ids_buf_, start_id_, batch_size); + handleOptArg(&input_map, "end_id", end_ids_buf_, end_id_, batch_size); + } + + const DataType data_type = getTensorType(); + + const std::vector self_k_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + size_per_head_ / (16 / sizeof(T)), + max_cache_seq_len, + 16 / sizeof(T)}; + const std::vector self_v_cache_shape = {num_layer_ / pipeline_para_.world_size_, + batch_size * beam_width, + local_head_num_, + max_cache_seq_len, + size_per_head_}; + + // initialize the output ids and parent ids + cudaMemsetAsync(output_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(parent_ids_buf_, 0, sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + cudaMemsetAsync(masked_tokens_, false, sizeof(bool) * batch_size * beam_width * max_cache_seq_len, stream_); + cudaMemsetAsync(tiled_total_padding_count_, 0, sizeof(int) * batch_size * beam_width, stream_); + if (beam_width > 1) { + cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); + } + + // Prefix prompts + if (has_prefix_prompt_) { + cudaMemcpyAsync(prompt_learning_weight_batch_, + prefix_prompt_weight_batch_ptrs.data(), + sizeof(T*) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + cudaMemcpyAsync(tiled_prompt_lengths_buf_, + prefix_prompt_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyDefault, + stream_); + } + + sync_check_cuda_error(); + + // handle first step + if (has_prefix_prompt_ || has_prefix_soft_prompt_ || max_input_length > 1) { + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + if (has_prefix_soft_prompt_) { + inputIdsEmbeddingLookupPosEncodingSoftPromptParam param; + param.from_tensor = context_decoder_input_buf_; + param.output_ids = output_ids_buf_; + param.input_lengths = tiled_input_lengths_buf_; + param.embedding_table = gpt_weights->pre_decoder_embedding_table; + param.pos_table = gpt_weights->position_encoding_table; + param.prefix_soft_prompt_embedding = input_tensors->at("request_prompt_embedding").getPtr(); + param.prefix_soft_prompt_lengths = input_tensors->at("request_prompt_lengths").getPtr(); + param.input_ids = tiled_input_ids_buf_; + param.start_step = 1; + param.max_input_length = max_input_length; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.hidden_units = hidden_units_; + param.stream = stream_; + + invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt(param); + sync_check_cuda_error(); + max_input_length += max_prefix_soft_prompt_length; // view soft_prompt as input + } + else { + invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf_, 
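+                                                     // embeds the tiled input ids into context_decoder_input_buf_
+                                                     // and also writes the ids into the time-major output_ids_buf_,
+                                                     // so generated tokens can be appended after the prompt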
+ output_ids_buf_, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + pPromptTuningParam{}, // no p/prompt tuning + tiled_input_ids_buf_, + 1, + max_input_length, + max_input_length, + batch_size * beam_width, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + + invokeBuildDecoderAttentionMask(input_attention_mask_, + tiled_input_lengths_buf_, + tiled_prompt_lengths_buf_, + batch_size * beam_width, + max_input_length, + max_prefix_prompt_length, + stream_); + sync_check_cuda_error(); + + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_input_buf_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, + 1, + (size_t)max_input_length, + (size_t)(max_input_length + max_prefix_prompt_length)}, + input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, tiled_input_lengths_buf_}}, + {"d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width}, + has_prefix_prompt_ ? prompt_learning_weight_batch_ : nullptr}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {batch_size * beam_width}, + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {batch_size * beam_width, (size_t)max_input_length, hidden_units_}, + context_decoder_output_buf_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}, + {"last_token_hidden_units", + Tensor{MEMORY_GPU, data_type, {batch_size * beam_width, hidden_units_}, decoder_output_buf_}}}; + + gpt_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + sync_check_cuda_error(); + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 0) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + max_input_length++; + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + output_ids_buf_, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + std::vector h_input_lengths(batch_size * beam_width, 1); + cudaMemcpyAsync(tiled_input_lengths_buf_, + h_input_lengths.data(), + sizeof(int) * batch_size * beam_width, + cudaMemcpyHostToDevice, + stream_); + sync_check_cuda_error(); + } + else if (max_input_length == 1) { + FT_CHECK(prompt_learning_type_ == PromptLearningType::no_prompt + && request_prompt_type == PromptLearningType::no_prompt); // Not support prompts in this case + invokeDecodingInitialize(finished_buf_, + sequence_lengths_, + nullptr, + cum_log_probs_, + start_ids_buf_, + batch_size, + beam_width, + max_input_length - 1, + stream_); + sync_check_cuda_error(); + invokeTileGptInputs(tiled_input_ids_buf_, + tiled_input_lengths_buf_, + input_tensors->at("input_ids").getPtr(), + input_tensors->at("input_lengths").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + sync_check_cuda_error(); + + cudaMemcpyAsync(output_ids_buf_, + 
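+                            // single-token prompt: copy the batch_size * beam_width input ids into
+                            // step 0 of the time-major output_ids_buf_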
tiled_input_ids_buf_, + sizeof(int) * batch_size * beam_width, + cudaMemcpyDeviceToDevice, + stream_); + } + + if (vocab_size_ == vocab_size_padded_) { + padded_embedding_kernel_ptr_ = gpt_weights->post_decoder_embedding.kernel; + } + else { + cudaMemcpyAsync(padded_embedding_kernel_, + gpt_weights->post_decoder_embedding.kernel, + sizeof(T) * vocab_size_ * hidden_units_, + cudaMemcpyDeviceToDevice, + stream_); + cudaMemcpyAsync(padded_embedding_bias_, + gpt_weights->post_decoder_embedding.bias, + sizeof(T) * vocab_size_, + cudaMemcpyDeviceToDevice, + stream_); + sync_check_cuda_error(); + } + + invokeMaskPaddingTokens(masked_tokens_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + tiled_prompt_lengths_buf_, + max_cache_seq_len, + max_input_length + max_prefix_prompt_length, + 0, + batch_size, + beam_width, + stream_); + + for (int step = max_input_length; step < (int)max_output_seq_len; step++) { + const int src_indir_idx = (step - max_input_length) % 2; + const int tgt_indir_idx = 1 - src_indir_idx; + + const size_t local_batch_size = getLocalBatchSize(batch_size, 1, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const size_t iteration_num = batch_size / local_batch_size; + *generation_should_stop_ = true; + + for (uint ite = 0; ite < iteration_num; ++ite) { + const int id_offset = ite * local_batch_size * beam_width; + const int hidden_units_offset = id_offset * hidden_units_; + const int vocab_size_units_offset = id_offset * vocab_size_padded_; + + if (!(max_input_length > 1 && step == max_input_length)) { + if (pipeline_para_.rank_ == 0) { + invokeEmbeddingLookupPosEncodingPadCount(decoder_input_buf_ + hidden_units_offset, + gpt_weights->pre_decoder_embedding_table, + gpt_weights->position_encoding_table, + output_ids_buf_ + id_offset, + tiled_total_padding_count_ + id_offset, + local_batch_size * beam_width, + hidden_units_, + (T)(1.0f), + step - 1, + batch_size * beam_width, + 0, + stream_); + sync_check_cuda_error(); + } + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_input_buf_ + hidden_units_offset}}, + {"finished", + Tensor{MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, finished_buf_ + id_offset}}, + {"sequence_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, sequence_lengths_ + id_offset}}, + {"total_padding_tokens", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size * beam_width}, + tiled_total_padding_count_ + id_offset}}, + {"d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size}, + has_prefix_prompt_ ? (tiled_prompt_lengths_buf_ + id_offset) : nullptr}}, + {"max_prefix_prompt_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_prefix_prompt_length}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}, + {"cache_indirection", + Tensor{MEMORY_GPU, + TYPE_INT32, + {local_batch_size, beam_width, max_output_seq_len}, + beam_width > 1 ? 
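+                            // sampling (beam_width == 1) never reorders the K/V cache, so no indirection is passed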
cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len : + nullptr}}, + {"masked_tokens", + Tensor{MEMORY_GPU, + TYPE_BOOL, + {local_batch_size * beam_width, max_cache_seq_len}, + masked_tokens_ + id_offset * max_cache_seq_len}}}; + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size * beam_width, hidden_units_}, + decoder_output_buf_ + hidden_units_offset}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}}; + gpt_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights); + } + + if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { + invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + gpt_weights->post_decoder_layernorm.gamma, + gpt_weights->post_decoder_layernorm.beta, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + (float*)nullptr, + 0, + stream_); + sync_check_cuda_error(); + + if (tensor_para_.world_size_ == 1) { + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_padded_, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_padded_, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + } + else { + FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + local_vocab_size, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_ + + tensor_para_.rank_ * local_vocab_size * hidden_units_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + nccl_logits_buf_ + vocab_size_units_offset + + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + CUDA_R_32F, + local_vocab_size, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + local_batch_size * beam_width * local_vocab_size, + tensor_para_.rank_, + tensor_para_, + stream_); + invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + tensor_para_.world_size_, + local_batch_size * beam_width, + local_vocab_size, + stream_); + } + + int tmp_local_batch_size = local_batch_size; + bool is_initialize_random_table = step == max_input_length; + std::unordered_map dynamic_decode_input_tensors{ + {"logits", + Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size_padded_}, logits_buf_}}, + // {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size_padded_}, nullptr}}, + {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, + {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}}, + {"input_lengths", + Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf_}}, + {"sequence_limit_length", Tensor{MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len_}}, + {"ite", 
+                    Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
+                    {"src_cache_indirection",
+                     Tensor{MEMORY_GPU,
+                            TYPE_INT32,
+                            {local_batch_size, beam_width, max_output_seq_len},
+                            cache_indirections_[src_indir_idx] + id_offset * max_output_seq_len}},
+                    {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &tmp_local_batch_size}},
+                    {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids_buf_}},
+                    {"is_initialize_random_table", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_initialize_random_table}}};
+
+                for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
+                    if (dynamic_decode_input_tensors.find(t->first) == dynamic_decode_input_tensors.end()) {
+                        dynamic_decode_input_tensors.insert(*t);
+                    }
+                }
+
+                // common outputs
+                bool subbatch_should_stop = false;
+                std::unordered_map<std::string, Tensor> dynamic_decode_output_tensors{
+                    {"output_ids",
+                     Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids_buf_}},
+                    {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, finished_buf_}},
+                    // cum_log_probs is necessary for beam search, while it is optional for sampling.
+                    {"cum_log_probs",
+                     Tensor{MEMORY_GPU,
+                            TYPE_FP32,
+                            {batch_size * beam_width},
+                            ((beam_width > 1) || (output_tensors->count("cum_log_probs") > 0)) ? cum_log_probs_ :
+                                                                                                 nullptr}},
+                    {"output_log_probs",
+                     Tensor{MEMORY_GPU,
+                            TYPE_FP32,
+                            {max_seq_len, batch_size, beam_width},
+                            output_tensors->count("output_log_probs") > 0
+                                    && output_tensors->at("output_log_probs").data != nullptr ?
+                                output_log_probs_buf_ :
+                                nullptr}},
+                    {"parent_ids",
+                     Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, parent_ids_buf_}},
+                    {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, sequence_lengths_}},
+                    {"tgt_cache_indirection",
+                     Tensor{MEMORY_GPU,
+                            TYPE_INT32,
+                            {local_batch_size, beam_width, max_output_seq_len},
+                            cache_indirections_[tgt_indir_idx] + id_offset * max_output_seq_len}},
+                    {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &subbatch_should_stop}}};
+
+                for (auto t = output_tensors->begin(); t != output_tensors->end(); ++t) {
+                    // Handle exceptions.
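+                    // cum_log_probs and output_log_probs are skipped here because the internal buffers
+                    // (cum_log_probs_, output_log_probs_buf_) were already registered above and are copied
+                    // back into the caller's tensors in setOutputTensors().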
+ if (t->first == "cum_log_probs" || t->first == "output_log_probs") { + continue; + } + dynamic_decode_output_tensors.insert(*t); + } + + dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); + *generation_should_stop_ &= subbatch_should_stop; + } + } + + if (pipeline_para_.world_size_ > 1) { + ftNcclGroupStart(); + ftNcclBroadCast(output_ids_buf_ + step * batch_size * beam_width, + batch_size * beam_width, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + + ftNcclBroadCast( + sequence_lengths_, batch_size * beam_width, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + ftNcclBroadCast(generation_should_stop_, 1, pipeline_para_.world_size_ - 1, pipeline_para_, stream_); + + if (beam_width > 1) { + ftNcclBroadCast(cache_indirections_[tgt_indir_idx], + batch_size * beam_width * max_output_seq_len, + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + sync_check_cuda_error(); + } + + if (*generation_should_stop_) { + break; + } + if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); + + if (pipeline_para_.rank_ == 0 && tensor_para_.rank_ == 0) { + token_generated_cb_(output_tensors, token_generated_ctx_); + } + } + if (step == max_input_length) { + /* We have just finished processing input: update the padding count: + * total_padding_count += (max_input_length - input_lengths) + * if has prefix prompts, += (max_prefix_prompt_length - prompt_length) + */ + invokeUpdatePaddingCount(tiled_total_padding_count_, + input_tensors->at("input_lengths").getPtr(), // not_tiled + has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : (const int*)nullptr, + max_input_length, + has_prefix_prompt_ ? 
max_prefix_prompt_length : 0, + batch_size, + beam_width, + stream_); + } + } + + setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); + sendTensorsToFirstPipelineNode(output_tensors, input_tensors); +} + +template +void Llama::sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.world_size_ == 1) { + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); + return; + } + + const auto pp_rank = pipeline_para_.rank_; + + ftNcclGroupStart(); + for (auto const& it : *output_tensors) { + if (it.second.data == nullptr) { + continue; + } + + if (pp_rank == pipeline_para_.world_size_ - 1) { + ftNcclSend(it.second.getPtr(), it.second.sizeBytes(), 0, pipeline_para_, stream_); + } + else if (pp_rank == 0) { + ftNcclRecv(it.second.getPtr(), + it.second.sizeBytes(), + pipeline_para_.world_size_ - 1, + pipeline_para_, + stream_); + } + } + ftNcclGroupEnd(); + // throw errors when detected + ftNcclStreamSynchronize(tensor_para_, pipeline_para_, stream_); +} + +template +void Llama::setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_output_seq_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + if (pipeline_para_.rank_ != pipeline_para_.world_size_ - 1) { + return; + } + + const size_t batch_size = output_tensors->at("output_ids").shape[0]; + const size_t beam_width = output_tensors->at("output_ids").shape[1]; + uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); + const size_t max_prefix_soft_prompt_length = + has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + + if (input_tensors->at("input_ids").shape[1] == 0) { + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + // TODO: D2D sequence_lenghts + if (beam_width > 1) { + // For beam search, do gather_tree + // take output_parent_ids as inter buffer + invokeGatherTree(transposed_output_ids_buf_, + sequence_lengths_, + max_output_seq_len, + batch_size, + beam_width, + output_ids_buf_ + batch_size * beam_width, + parent_ids_buf_ + batch_size * beam_width, + end_ids_buf_, + stream_); + + // transpose and take output_parent_ids as inter buffer + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + transposed_output_ids_buf_, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + else { + // For sampling, only copy the results to output_tensor + invokeTransposeAxis01(output_tensors->at("output_ids").getPtr(), + output_ids_buf_ + batch_size * beam_width, + max_output_seq_len - 1, + batch_size * beam_width, + 1, + stream_); + } + } + else { + + // For sampling, it is equivalent to all parent ids are 0. + gatherTreeParam param; + param.beams = transposed_output_ids_buf_; + param.max_sequence_lengths = sequence_lengths_; + // add sequence_length 1 here because the sequence_length of time step t is t - 1 + param.max_sequence_length_final_step = 1; + param.max_time = max_output_seq_len; + param.batch_size = batch_size; + param.beam_width = beam_width; + param.step_ids = output_ids_buf_; + param.parent_ids = beam_width == 1 ? nullptr : parent_ids_buf_; + param.end_tokens = end_ids_buf_; + param.max_input_length = max_input_length; + param.prefix_soft_prompt_lengths = + has_prefix_soft_prompt_ ? 
input_tensors->at("request_prompt_lengths").getPtr() : nullptr; + param.input_lengths = tiled_input_lengths_buf_; + param.max_prefix_soft_prompt_length = max_prefix_soft_prompt_length; + param.max_input_without_prompt_length = max_input_length; + param.stream = stream_; + param.output_ids = output_tensors->at("output_ids").getPtr(); + invokeGatherTree(param); + invokeCudaD2DcpyConvert( + sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + sync_check_cuda_error(); + } + if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { + invokeTransposeAxis01(output_tensors->at("output_log_probs").getPtr(), + output_log_probs_buf_, + input_tensors->at("output_seq_len").max() - max_input_length, + batch_size * beam_width, + 1, + stream_); + } + // Return the cumulative log probability if requested. + if (output_tensors->count("cum_log_probs") > 0) { + Tensor cum_log_probs = output_tensors->at("cum_log_probs"); + FT_CHECK_WITH_INFO(cum_log_probs.size() == batch_size * beam_width, + "The shape of cum_log_probs does not match with batch_size x beam_width."); + cudaAutoCpy(cum_log_probs.getPtr(), cum_log_probs_, cum_log_probs.size(), stream_); + } +} + +template +size_t Llama::getPipelineParallelRank() +{ + return pipeline_para_.rank_; +} + +template +size_t Llama::getPipelineParallelSize() +{ + return pipeline_para_.world_size_; +} + +template +size_t Llama::getTensorParallelRank() +{ + return tensor_para_.rank_; +} + +template +size_t Llama::getTensorParallelSize() +{ + return tensor_para_.world_size_; +} + +template +bool* Llama::getFinishBuffer() +{ + return finished_buf_; +} + +template class Llama; +template class Llama; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h new file mode 100644 index 000000000..cd79ac0b8 --- /dev/null +++ b/src/fastertransformer/models/llama/Llama.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "src/fastertransformer/layers/DynamicDecodeLayer.h" +#include "src/fastertransformer/models/llama/LlamaContextDecoder.h" +#include "src/fastertransformer/models/llama/LlamaDecoder.h" +#include "src/fastertransformer/models/llama/LlamaWeight.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +class Llama: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + + static constexpr bool neox_rotary_style_ = true; + static constexpr float layernorm_eps_ = 1e-5f; + + int start_id_; + int end_id_; + size_t hidden_units_; + + size_t local_head_num_; + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + size_t vocab_size_padded_; + const bool is_context_qk_buf_float_ = + (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || + std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + + // Residual Type + const bool use_gptj_residual_ = true; + + // Prompt Learning Parameters + PromptLearningType prompt_learning_type_; + int prompt_learning_start_id_; // start_id for prompt_learning (only needed by prefix prompts) + bool has_prefix_prompt_; + bool has_prefix_soft_prompt_; + + LlamaDecoder* gpt_decoder_; + LlamaContextDecoder* gpt_context_decoder_; + DynamicDecodeLayer* dynamic_decode_layer_; + + void allocateBuffer() override; + void allocateBuffer( + size_t batch_size, size_t beam_width, size_t max_seq_len, size_t max_cache_seq_len, size_t max_input_len); + void freeBuffer() override; + + void initialize(); + +protected: + T* padded_embedding_kernel_; + T* padded_embedding_bias_; + const T* padded_embedding_kernel_ptr_; + + T* input_attention_mask_; + + T* decoder_input_buf_; + T* decoder_output_buf_; + T* normed_decoder_output_buf_; + + float* logits_buf_; + float* nccl_logits_buf_; + float* cum_log_probs_; + + bool* finished_buf_; + bool* h_finished_buf_; + int* sequence_lengths_ = nullptr; + int* tiled_total_padding_count_ = nullptr; + uint32_t* seq_limit_len_ = nullptr; + + T* key_cache_; + T* value_cache_; + int* cache_indirections_[2] = {nullptr, nullptr}; + + // prompt_learning weight_batch ptrs + const T** prompt_learning_weight_batch_; + int* tiled_prompt_lengths_buf_; // only needed by prefix prompts + + int* tiled_input_ids_buf_; + int* tiled_input_lengths_buf_; + int* transposed_output_ids_buf_; + int* output_ids_buf_; + int* parent_ids_buf_; + int* start_ids_buf_; + int* end_ids_buf_; + bool* masked_tokens_ = nullptr; + + bool* generation_should_stop_ = nullptr; + + T* context_decoder_input_buf_; + T* context_decoder_output_buf_; + float* output_log_probs_buf_; + + // function pointer callback + using callback_sig = void(std::unordered_map*, void*); + callback_sig* token_generated_cb_ = nullptr; + void* token_generated_ctx_ = nullptr; + + void setOutputTensors(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const size_t max_input_length, + const size_t max_seq_len); + void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors); + +public: + Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t 
rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + Llama(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + int start_id, + int end_id, + int prompt_learning_start_id, // only needed by p/prompt-tuning + PromptLearningType prompt_learning_type, + bool use_gptj_residual, + float beam_search_diversity_rate, + size_t top_k, + float top_p, + unsigned long long random_seed, + float temperature, + float len_penalty, + float repetition_penalty, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce = 0); + + Llama(Llama const& Llama); + + ~Llama(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LlamaWeight* gpt_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LlamaWeight* gpt_weights); + + size_t getPipelineParallelRank(); + size_t getPipelineParallelSize(); + size_t getTensorParallelRank(); + size_t getTensorParallelSize(); + bool* getFinishBuffer(); + + void registerCallback(callback_sig* fn, void* ctx); + void unRegisterCallback(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc new file mode 100644 index 000000000..f107d38c1 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LlamaContextDecoder.h" +#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/gpt_kernels.h" + +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" + +namespace fastertransformer { + +template +void LlamaContextDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelGptContextAttentionLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + is_qk_buf_float_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 0, // max_seq_len + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + true, // use_gated_activation = true; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void LlamaContextDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + self_attn_output_ = reinterpret_cast( + allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + ffn_output_ = reinterpret_cast( + allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true); + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + is_allocate_buffer_ = true; +} + +template +void LlamaContextDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + allocator_->free((void**)(&h_pinned_token_num_ptr_), true); + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); + is_allocate_buffer_ = false; + } +} + +template +bool LlamaContextDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool LlamaContextDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool LlamaContextDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l 
== local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int LlamaContextDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +LlamaContextDecoder::LlamaContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + is_qk_buf_float_(is_qk_buf_float), + attention_type_(attention_type), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +LlamaContextDecoder::LlamaContextDecoder(LlamaContextDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + is_qk_buf_float_(decoder.is_qk_buf_float_), + attention_type_(decoder.attention_type_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +LlamaContextDecoder::~LlamaContextDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void LlamaContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, + {"attention_mask", input_tensors->at(1)}, + {"input_lengths", input_tensors->at(2)}}; + std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, + {"key_cache", output_tensors->at(1)}, + {"value_cache", output_tensors->at(2)}, + {"last_token_hidden_units", output_tensors->at(3)}}; + + forward(&output_tensors_map, &input_tensors_map, gpt_decoder_layer_weight); +} + +template +void LlamaContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [batch_size, seq_len, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, seq_len + max_prompt_length] + // input_lengths [batch_size] + // d_prefix_prompt_batch [batch_size], + // each element 
contains ptr with buffer shape[2, local_head_num_, prompt_length, size_per_head] + // prefix_prompt_lengths [batch size] + + // output tensors: + // decoder_output [batch_size, seq_len, hidden_dimension], + // key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // last_token_hidden_units [batch_size, hidden_dimension] + + // To use layer/pipeline parallelism, we view the shape of 'batch_size' to 'ite * local_batch_size'. + // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during + // computing. + + FT_CHECK(input_tensors->size() == 5); + FT_CHECK(output_tensors->size() == 4); + + const int batch_size = input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; + const int max_prompt_length = + input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; + const DataType data_type = getTensorType(); + allocateBuffer(batch_size, seq_len); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); + const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); + const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); + FT_CHECK(batch_size % local_batch_size == 0); + const int iteration_num = batch_size / local_batch_size; + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? 
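+                                         // prefix prompts prepend extra K/V entries, so the unfused attention
+                                         // path is forced whenever prefix prompt lengths are provided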
+ getUnfusedAttentionType(attention_type_) : + attention_type_; + const bool is_unpadded_mha = isUnPaddedMHA(attention_type); + + for (int ite = 0; ite < iteration_num; ite++) { + size_t h_token_num = local_batch_size * seq_len; + if (is_unpadded_mha) { + const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, + &h_token_num, + padding_offset_, + cu_seqlens_, + base_input_lengths + ite * local_batch_size, + local_batch_size, + seq_len, + stream_); + } + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + + if (l == 0 && is_unpadded_mha) { + invokeRemovePadding(decoder_layer_output_, + decoder_input + ite * local_batch_size * seq_len * hidden_units_, + padding_offset_, + h_token_num, + hidden_units_, + stream_); + } + + const bool is_final = false; // TODO(bhsueh) remove this flag + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (!is_unpadded_mha) { + if (l == 0) { + layer_input = decoder_input; + layer_input += ite * local_batch_size * seq_len * hidden_units_; + } + if (l == num_layer_ - 1) { + layer_output = decoder_output; + layer_output += ite * local_batch_size * seq_len * hidden_units_; + } + } + + if (isFirstLayerParallelId(l) && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralT5LayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors{ + {"input_query", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, + attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, + {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, + {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; + self_attention_input_tensors.insertIfValid( + "d_prefix_prompt_batch", + Tensor{MEMORY_GPU, + data_type, + {(size_t)local_batch_size}, + d_prefix_prompt_batch != nullptr ? d_prefix_prompt_batch + ite * local_batch_size : nullptr}); + self_attention_input_tensors.insertIfValid("d_prefix_prompt_lengths", + Tensor{MEMORY_GPU, + TYPE_INT32, + {(size_t)local_batch_size}, + d_prefix_prompt_lengths != nullptr ? 
+ d_prefix_prompt_lengths + ite * local_batch_size : + nullptr}); + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {h_token_num}, padding_offset_}); + self_attention_input_tensors.insert( + "cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {size_t(local_batch_size + 1)}, cu_seqlens_}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", + Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + + if (is_final == false) { + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + h_token_num, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddResidualT5PreLayerNorm( + self_attn_output_, + decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + layernorm_eps_, + h_token_num, + hidden_units_, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", + Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {h_token_num, (size_t)hidden_units_}, + use_gptj_residual_ ? 
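// With the GPT-J style parallel residual enabled, the FFN writes into the scratch buffer
// ffn_output_ and is merged with self_attn_output_ and layer_input afterwards; otherwise it can
// write straight into layer_output. The reduceSum formulation described in the comment below is
// valid because, with N = TP_size tensor-parallel ranks each holding a partial sum p_r,
//   sum_r (p_r + layer_input / N) = (sum_r p_r) + layer_input,
// which is what allows layer_input and layer_output to share one buffer.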
ffn_output_ : layer_output}}}); + ffn_layer_->forward( + &ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / + // TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum( + layer_output, layer_output, h_token_num * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + h_token_num, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = h_token_num * hidden_units_ / tensor_para_.world_size_; + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + + if ((l == num_layer_ - 1) && is_unpadded_mha) { + invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + decoder_layer_output_, + padding_offset_, + h_token_num, + head_num_ * size_per_head_, + stream_); + } + } + } + } + + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer + invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), + output_tensors->at("decoder_output").getPtr(), + input_tensors->at("input_lengths").getPtr(), + seq_len, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class LlamaContextDecoder; +template class LlamaContextDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.h b/src/fastertransformer/models/llama/LlamaContextDecoder.h new file mode 100644 index 000000000..788d1d45d --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class LlamaContextDecoder: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + float layernorm_eps_; + + // calculated data + size_t hidden_units_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + AttentionType attention_type_; + + bool is_qk_buf_float_; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len); + void freeBuffer() override; + + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + + void initialize(); + +protected: + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + size_t* h_pinned_token_num_ptr_ = nullptr; + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + +public: + LlamaContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + bool is_qk_buf_float, + AttentionType attention_type = AttentionType::FUSED_MHA, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + LlamaContextDecoder(LlamaContextDecoder const& decoder); + + ~LlamaContextDecoder(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc new file mode 100644 index 000000000..d5fb58fee --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/llama/LlamaDecoder.h" +#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" + +namespace fastertransformer { + +template +void LlamaDecoder::initialize() +{ + self_attention_layer_ = new TensorParallelDecoderSelfAttentionLayer(0, // max_batch_size + head_num_, + size_per_head_, + rotary_embedding_dim_, + neox_rotary_style_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + custom_all_reduce_comm_, + enable_custom_all_reduce_); + + ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + 1, + head_num_, + size_per_head_, + 0, // expert_num + inter_size_, + tensor_para_, + stream_, + cublas_wrapper_, + allocator_, + !use_gptj_residual_, + is_free_buffer_after_forward_, + false, + 0, + true, // use_gated_activation = true; + custom_all_reduce_comm_, + enable_custom_all_reduce_); +} + +template +void LlamaDecoder::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LlamaDecoder::allocateBuffer(size_t batch_size) +{ + decoder_normed_input_ = reinterpret_cast( + allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * hidden_units_, false)); + self_attn_output_ = + reinterpret_cast(allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * hidden_units_, false)); + ffn_output_ = + reinterpret_cast(allocator_->reMalloc(ffn_output_, sizeof(T) * batch_size * hidden_units_, false)); + decoder_layer_output_ = reinterpret_cast( + allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * hidden_units_, false)); + is_allocate_buffer_ = true; +} + +template +void LlamaDecoder::freeBuffer() +{ + if (is_allocate_buffer_ == true) { + allocator_->free((void**)(&decoder_normed_input_)); + allocator_->free((void**)(&self_attn_output_)); + allocator_->free((void**)(&ffn_output_)); + allocator_->free((void**)(&decoder_layer_output_)); + is_allocate_buffer_ = false; + } +} + +template +bool LlamaDecoder::isValidLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l >= local_num_layer * pipeline_para_.rank_) + && (l < local_num_layer * (pipeline_para_.rank_ + 1)); +} + +template +bool LlamaDecoder::isFirstLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * pipeline_para_.rank_); +} + +template +bool LlamaDecoder::isLastLayerParallelId(uint l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return l < num_layer_ && (l == local_num_layer * (pipeline_para_.rank_ + 1) - 1); +} + +template +int LlamaDecoder::getFirstLayerParallelId() +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / pipeline_para_.world_size_)); + return local_num_layer * pipeline_para_.rank_; +} + +template +LlamaDecoder::LlamaDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t 
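// The parallel-id helpers above split the layers evenly across pipeline ranks with
// local_num_layer = ceil(num_layer / pipeline_world_size), so rank r owns the half-open range
// [r * local_num_layer, (r + 1) * local_num_layer). As an illustration only: with num_layer = 32
// and a 4-way pipeline, rank 0 runs layers 0-7, rank 1 runs layers 8-15, and so on.
// isFirstLayerParallelId and isLastLayerParallelId mark the boundaries where activations are
// received from and sent to the neighbouring pipeline ranks in forward().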
num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm, + int enable_custom_all_reduce): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + neox_rotary_style_(neox_rotary_style), + use_gptj_residual_(use_gptj_residual), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num_ * size_per_head), + tensor_para_(tensor_para), + pipeline_para_(pipeline_para), + custom_all_reduce_comm_(custom_all_reduce_comm), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + initialize(); +} + +template +LlamaDecoder::LlamaDecoder(LlamaDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + neox_rotary_style_(decoder.neox_rotary_style_), + use_gptj_residual_(decoder.use_gptj_residual_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + tensor_para_(decoder.tensor_para_), + pipeline_para_(decoder.pipeline_para_), + custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), + enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) +{ + initialize(); +} + +template +LlamaDecoder::~LlamaDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void LlamaDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + FT_CHECK(false); +} + +template +void LlamaDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* gpt_decoder_layer_weight) +{ + // input tensors: + // decoder_input [local_batch_size, hidden_dimension], + // finished [local_batch_size], + // sequence_lengths [local_batch_size] + // total_padding_tokens [local_batch_size], + // max_input_length [1] on cpu + // d_prefix_prompt_lengths [local_batch_size], on GPU + // max_prefix_prompt_length [1] on cpu + // step [1] on cpu + // ite [1] on cpu + // cache_indirection [local_batch_size / beam_width, beam_width, memory_len] + // Here, local_batch_size contains the beam_width, so local_batch_size / beam_width + // is real local_batch_size. 
+ // masked_tokens[local_batch_size, memory_len] + + // output tensors: + // decoder_output [local_batch_size, hidden_dimension], + // key_cache [num_layer, batch_size, head_num, size_per_head // x, memory_len, x] + // value_cache [num_layer, batch_size, head_num, memory_len, size_per_head] + + FT_CHECK(input_tensors->size() == 11); + FT_CHECK(output_tensors->size() == 3); + + const DataType data_type = getTensorType(); + const size_t local_batch_size = input_tensors->at("decoder_input").shape[0]; + allocateBuffer(local_batch_size); + const int ite = input_tensors->at("ite").getVal(); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(local_batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(local_batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + for (uint l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + T* layer_input = (l == 0) ? decoder_input : decoder_layer_output_; + T* layer_output = (l == num_layer_ - 1) ? decoder_output : decoder_layer_output_; + + if (isFirstLayerParallelId(l) == true && pipeline_para_.rank_ != 0 && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclRecv(layer_input, local_batch_size * hidden_units_, pipeline_para_.rank_ - 1, pipeline_para_, + // stream_); + + ftNcclRecv(layer_input + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ - 1, + pipeline_para_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); + } + } + + invokeGeneralT5LayerNorm(decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors(*input_tensors); + self_attention_input_tensors.insert( + "input_query", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}); + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + size_t ite_cache_offset = ite * local_batch_size; + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + ite_cache_offset *= *t; + } + cache_offset += ite_cache_offset; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_gptj_residual_) { + invokeGeneralLayerNorm(decoder_normed_input_, + layer_input, + 
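// Two post-attention paths, as read from this branch and the else branch that follows: with the
// parallel GPT-J residual, the FFN input is a LayerNorm of layer_input (using the post-attention
// norm weights), so the attention and FFN branches both start from the same normalized hidden
// state; in the default path, invokeGeneralAddResidualT5PreLayerNorm instead folds the attention
// residual add together with the RMS-style pre-norm that feeds the FFN. Which variant runs is
// fixed per model by use_gptj_residual_, and patch 02 of this series switches it to false for
// LLaMA.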
gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + layernorm_eps_, + local_batch_size, + hidden_units_, + (float*)nullptr, + 0, + stream_); + } + else { + invokeGeneralAddResidualT5PreLayerNorm( + self_attn_output_, + decoder_normed_input_, + layer_input, + gpt_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + layernorm_eps_, + local_batch_size, + hidden_units_, + stream_); + } + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors({{"ffn_output", + Tensor{MEMORY_GPU, + data_type, + {local_batch_size, hidden_units_}, + use_gptj_residual_ ? ffn_output_ : layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &gpt_decoder_layer_weight->at(l)->ffn_weights); + + if (use_gptj_residual_) { + // Original workflow: + // layer_output = layer_input + reduceSum(ffn_output + self_attn_output + ffn_output_bias) + // Our workflow: + // layer_output = reduceSum(ffn_output + self_attn_output + ffn_output_bias + layer_input / TP_size) + // They are equivalent on math, but we can use same buffer for layer_input and layer_output + invokeAddBiasAttentionFfnResidual(layer_output, + ffn_output_, + self_attn_output_, + layer_input, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + tensor_para_.world_size_, + stream_); + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum(layer_output, layer_output, local_batch_size * hidden_units_, tensor_para_, stream_); + } + } + else { + invokeAddBiasResidual(layer_output, + self_attn_output_, + gpt_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + local_batch_size, + hidden_units_, + stream_); + } + + sync_check_cuda_error(); + + if (isLastLayerParallelId(l) == true && pipeline_para_.rank_ != pipeline_para_.world_size_ - 1 + && pipeline_para_.world_size_ > 1) { + int data_size = local_batch_size * hidden_units_ / tensor_para_.world_size_; + // ftNcclSend(layer_output, local_batch_size * hidden_units_, pipeline_para_.rank_ + 1, pipeline_para_, + // stream_); + + ftNcclSend(layer_output + data_size * tensor_para_.rank_, + data_size, + pipeline_para_.rank_ + 1, + pipeline_para_, + stream_); + } + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class LlamaDecoder; +template class LlamaDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.h b/src/fastertransformer/models/llama/LlamaDecoder.h new file mode 100644 index 000000000..6cdd7df27 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoder.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/add_residual_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/BaseLayer.h" +#include "src/fastertransformer/layers/FfnLayer.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/Tensor.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/utils/cublasMMWrapper.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace fastertransformer { + +template +class LlamaDecoder: public BaseLayer { +private: +protected: + void allocateBuffer() override; + void allocateBuffer(size_t batch_size); + void freeBuffer() override; + bool isValidLayerParallelId(uint l); + bool isFirstLayerParallelId(uint l); + bool isLastLayerParallelId(uint l); + int getFirstLayerParallelId(); + virtual void initialize(); + + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t rotary_embedding_dim_; + bool neox_rotary_style_; + bool use_gptj_residual_; + size_t hidden_units_; + float layernorm_eps_; + + NcclParam tensor_para_; + NcclParam pipeline_para_; + + std::shared_ptr custom_all_reduce_comm_; + int enable_custom_all_reduce_; + + T* decoder_normed_input_ = nullptr; + T* self_attn_output_ = nullptr; + T* ffn_output_ = nullptr; + T* decoder_layer_output_ = nullptr; + + BaseAttentionLayer* self_attention_layer_; + FfnLayer* ffn_layer_; + +public: + LlamaDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + bool neox_rotary_style, + bool use_gptj_residual, + float layernorm_eps, + NcclParam tensor_para, + NcclParam pipeline_para, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + std::shared_ptr custom_all_reduce_comm = nullptr, + int enable_custom_all_reduce_ = 0); + + LlamaDecoder(LlamaDecoder const& decoder); + + virtual ~LlamaDecoder(); + + virtual void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* decoder_layer_weights); + + virtual void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* decoder_layer_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc new file mode 100644 index 000000000..ecd539a75 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" + +namespace fastertransformer { + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size, + const int tensor_para_rank, + const bool use_gptj_residual): + hidden_units_(hidden_units), + inter_size_(inter_size), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + use_gptj_residual_(use_gptj_residual) +{ + mallocWeights(); + setWeightPtr(); +} + +template +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < 12; i++) { + if (!use_gptj_residual_ && i != attention_dense_bias_weight_id) { + cudaFree(weights_ptr[i]); + } + } + + pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.gamma = nullptr; + self_attention_weights.query_weight.kernel = nullptr; + self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.attention_output_weight.kernel = nullptr; + self_attention_weights.attention_output_weight.bias = nullptr; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = nullptr; + + ffn_weights.intermediate_weight.kernel = nullptr; + ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.intermediate_weight2.kernel = nullptr; + ffn_weights.intermediate_weight2.bias = nullptr; + ffn_weights.output_weight.kernel = nullptr; + ffn_weights.output_weight.bias = nullptr; + is_maintain_buffer = false; + } +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); + + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); +} + +template +LlamaDecoderLayerWeight& LlamaDecoderLayerWeight::operator=(const LlamaDecoderLayerWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + tensor_para_size_ = other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + + mallocWeights(); + + cudaD2Dcpy(weights_ptr[0], 
other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + if (!use_gptj_residual_) { + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + } + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); + return *this; +} + +template +void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +{ + FT_CHECK(is_maintain_buffer == true); + const std::string rank_spec = std::to_string(tensor_para_rank_); + + // fill all bias to zeros + deviceFill(weights_ptr[0], (size_t)hidden_units_, (T)0.0); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); + + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + if (!use_gptj_residual_) { + deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); + } + + // FIXME(sunpeng17): check if the weights are correct + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + loadWeightFromBin(weights_ptr[8], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + loadWeightFromBin(weights_ptr[10], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.down_proj.weight." 
+ rank_spec + ".bin", + model_file_type); + deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + + deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); + loadWeightFromBin( + weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); +} + +template +void LlamaDecoderLayerWeight::setWeightPtr() +{ + pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.gamma = weights_ptr[1]; + self_attention_weights.query_weight.kernel = weights_ptr[2]; + self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; + self_attention_weights.attention_output_weight.bias = use_gptj_residual_ ? nullptr : weights_ptr[5]; + + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.intermediate_weight2.kernel = weights_ptr[8]; + ffn_weights.intermediate_weight2.bias = weights_ptr[9]; + ffn_weights.output_weight.kernel = weights_ptr[10]; + ffn_weights.output_weight.bias = weights_ptr[11]; + + post_attention_layernorm_weights.beta = weights_ptr[12]; + post_attention_layernorm_weights.gamma = weights_ptr[13]; + is_maintain_buffer = true; +} + +template +void LlamaDecoderLayerWeight::mallocWeights() +{ + deviceMalloc(&weights_ptr[0], hidden_units_); // pre layernorm beta + deviceMalloc(&weights_ptr[1], hidden_units_); // pre layernorm gamma + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel + deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); // qkv bias + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + if (!use_gptj_residual_) { + deviceMalloc(&weights_ptr[5], hidden_units_); // attention output bias + } + + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); // intermediate_weight bias + deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + deviceMalloc(&weights_ptr[9], inter_size_ / tensor_para_size_); // intermediate_weight2 bias + deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + deviceMalloc(&weights_ptr[11], hidden_units_); // output_weight bias + deviceMalloc(&weights_ptr[12], hidden_units_); // post attn layernorm beta + deviceMalloc(&weights_ptr[13], hidden_units_); // post attn layernorm gamma +} + +template struct LlamaDecoderLayerWeight; +template struct LlamaDecoderLayerWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h new file mode 100644 index 000000000..008e1a3b4 --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/layers/FfnWeight.h" +#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" +#include "src/fastertransformer/utils/cuda_utils.h" + +namespace fastertransformer { + +template +struct LlamaDecoderLayerWeight { +public: + LlamaDecoderLayerWeight() = default; + LlamaDecoderLayerWeight(const int hidden_units, + const int inter_size, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const bool use_gptj_residual = true); + ~LlamaDecoderLayerWeight(); + LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other); + LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other); + + void loadModel(std::string dir_path, FtCudaDataType model_file_type); + + LayerNormWeight pre_layernorm_weights; + AttentionWeight self_attention_weights; + LayerNormWeight post_attention_layernorm_weights; + FfnWeight ffn_weights; + +private: + int hidden_units_; + int inter_size_; + int tensor_para_size_; + int tensor_para_rank_; + bool use_gptj_residual_; + const int attention_dense_bias_weight_id = 5; + bool is_maintain_buffer = false; + T* weights_ptr[14]; + + void setWeightPtr(); + void mallocWeights(); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc new file mode 100644 index 000000000..65c2f762d --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LlamaWeight.h" + +namespace fastertransformer { + +template +LlamaWeight::LlamaWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size, + const int tensor_para_rank, + const int layer_para_size, + const int layer_para_rank, + const bool use_gptj_residual, + PromptLearningType prompt_learning_type, + std::map> prompt_learning_pair): + hidden_units_(hidden_units), + inter_size_(inter_size), + vocab_size_(vocab_size), + num_layer_(num_layer), + max_seq_len_(max_seq_len), + tensor_para_size_(tensor_para_size), + tensor_para_rank_(tensor_para_rank), + layer_para_size_(layer_para_size), + layer_para_rank_(layer_para_rank), + use_gptj_residual_(use_gptj_residual), + prompt_learning_type_(prompt_learning_type), + prompt_learning_pair_(prompt_learning_pair) +{ + FT_CHECK(num_layer_ % layer_para_size_ == 0); + // set prompt weight size + if (prompt_learning_type_ == PromptLearningType::prefix_prompt) { + prompt_token_weight_size_ = 2 * num_layer_ * hidden_units_ / tensor_para_size_; + } + else if (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) { + prompt_token_weight_size_ = hidden_units_; + } + + // set if load and malloc prompt weights + malloc_load_prompt_weights_ = !prompt_learning_pair_.empty() + && (prompt_learning_type_ == PromptLearningType::p_prompt_tuning + || prompt_learning_type_ == PromptLearningType::prefix_prompt); + + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights.push_back(new LlamaDecoderLayerWeight( + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + } + else { + // Layer-parallelism: allocate empty layer because + // this rank does not compute it: + decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(0, 0)); + } + } + + mallocWeights(); + setWeightPtr(); +} + +template +LlamaWeight::~LlamaWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < weights_ptr.size(); i++) { + deviceFree(weights_ptr[i]); + } + + pre_decoder_embedding_table = nullptr; + post_decoder_layernorm.beta = nullptr; + post_decoder_layernorm.gamma = nullptr; + post_decoder_embedding.kernel = nullptr; + is_maintain_buffer = false; + } +} + +template +LlamaWeight::LlamaWeight(const LlamaWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + vocab_size_(other.vocab_size_), + num_layer_(other.num_layer_), + max_seq_len_(other.max_seq_len_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + layer_para_size_(other.layer_para_size_), + layer_para_rank_(other.layer_para_rank_), + use_gptj_residual_(other.use_gptj_residual_), + prompt_token_weight_size_(other.prompt_token_weight_size_), + malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), + prompt_learning_type_(other.prompt_learning_type_), + prompt_learning_pair_(other.prompt_learning_pair_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + 
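// prompt_learning_pair_ maps task_name to (task_name_id, prompt_length); the table entry for a
// task lives at weights_ptr[num_base_weights + task_name_id] and holds prompt_length rows of
// prompt_token_weight_size_ elements each. Per the constructor above, that row size is
// 2 * num_layer * hidden_units / tensor_para_size for prefix prompts (a key and a value per
// layer, split across tensor-parallel ranks) and plain hidden_units for p/prompt-tuning
// embeddings.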
std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } +} + +template +LlamaWeight& LlamaWeight::operator=(const LlamaWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + max_seq_len_ = other.max_seq_len_; + tensor_para_size_ = other.tensor_para_size_; + tensor_para_rank_ = other.tensor_para_rank_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + use_gptj_residual_ = other.use_gptj_residual_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; + malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; + prompt_learning_type_ = other.prompt_learning_type_; + prompt_learning_pair_ = other.prompt_learning_pair_; + + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t prompt_id = num_base_weights + (size_t)task_name_id; + + // cuda device to device memcpy prompt table weights buffer memory + cudaD2Dcpy(weights_ptr[prompt_id], other.weights_ptr[prompt_id], prompt_length * prompt_token_weight_size_); + } + } + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } + return *this; +} + +template +void LlamaWeight::setWeightPtr() +{ + prompt_learning_table.resize(prompt_learning_pair_.size()); + + pre_decoder_embedding_table = weights_ptr[0]; + post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.gamma = weights_ptr[2]; + post_decoder_embedding.kernel = weights_ptr[3]; + + // prompt learning tables: set weight ptr + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // set weight ptr + prompt_learning_table[task_name_id] = {weights_ptr[task_weight_id], prompt_length}; + } + } +} + +template +void LlamaWeight::mallocWeights() +{ + weights_ptr.resize(num_base_weights + prompt_learning_pair_.size()); + + deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_); + deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning tables: malloc weights + if (malloc_load_prompt_weights_) { + for (auto 
const& prompt : prompt_learning_pair_) { + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + // malloc weights + T* prompt_weights_ptr = nullptr; + deviceMalloc(&prompt_weights_ptr, prompt_length * prompt_token_weight_size_); + weights_ptr[task_weight_id] = prompt_weights_ptr; + } + } + is_maintain_buffer = true; +} + +template +void LlamaWeight::loadModel(std::string dir_path) +{ + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin( + weights_ptr[0], {(size_t)(vocab_size_ * hidden_units_)}, dir_path + "/model.wte.weight.bin", model_file_type); + deviceFill(weights_ptr[1], (size_t)hidden_units_, (T)0.0); + loadWeightFromBin( + weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.final_layernorm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.lm_head.weight.bin", + model_file_type); + + // prompt table: load weights from bin + if (malloc_load_prompt_weights_) { + for (auto const& prompt : prompt_learning_pair_) { + std::string task_name = prompt.first; + int task_name_id = prompt.second.first; + int prompt_length = prompt.second.second; + size_t task_weight_id = num_base_weights + (size_t)task_name_id; + + std::string prompt_weight_path_name = (prompt_learning_type_ == PromptLearningType::p_prompt_tuning) ? + (dir_path + "/model.prompt_table." + task_name + ".weight.bin") : + (dir_path + "/model.prefix_prompt." + task_name + ".weight." + + std::to_string(tensor_para_rank_) + ".bin"); + + if (prompt_length > 0) { + loadWeightFromBin(weights_ptr[task_weight_id], + {(size_t)(prompt_length * (int)prompt_token_weight_size_)}, + prompt_weight_path_name, + model_file_type); + } + } + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); + } + } +} + +template +void LlamaWeight::resizeLayer(const int num_layer) +{ + num_layer_ = num_layer; + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(new LlamaDecoderLayerWeight()); + } +} + +template +bool LlamaWeight::isValidLayerParallelId(int l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); + return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) + && (l < local_num_layer * (layer_para_rank_ + 1)); +} + +template struct LlamaWeight; +template struct LlamaWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h new file mode 100644 index 000000000..2f5b2632a --- /dev/null +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +struct LlamaWeight { + + LlamaWeight() = default; + LlamaWeight( + const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int max_seq_len, + const int tensor_para_size = 1, + const int tensor_para_rank = 0, + const int layer_para_size = 1, + const int layer_para_rank = 0, + const bool use_gptj_residual_ = true, + PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, + std::map> prompt_learning_pair = std::map>{}); + + ~LlamaWeight(); + LlamaWeight(const LlamaWeight& other); + LlamaWeight& operator=(const LlamaWeight& other); + + void loadModel(std::string dir_path); + + void resizeLayer(const int num_layer); + + std::vector*> decoder_layer_weights; + const T* pre_decoder_embedding_table = nullptr; + // GPT-J does not use embedding table, but we leave the ptr such that + // GptNeoX::forward and Gpt::forward become identical + const T* position_encoding_table = nullptr; + + /* + prompt_learning_pair = vectors of [weight ptr, prompt length] pair + prompt_length is stored here for compatible prompt learning table + prefix_prompt weights store as shape [num_layers, 2, num_heads, perfix_seq_len, size_per_head] + p/prompt tuning weights store as shape [prompt_len, hidden_units] + idx is the task_name_id of the prompt tables + */ + std::vector> prompt_learning_table = {}; + + LayerNormWeight post_decoder_layernorm; + DenseWeight post_decoder_embedding; + + inline void setMaxSeqLen(size_t max_seq_len) + { + max_seq_len_ = max_seq_len; + } + +private: + void setWeightPtr(); + void mallocWeights(); + bool isValidLayerParallelId(int l); + + int hidden_units_; + int inter_size_; + int vocab_size_; + int num_layer_; + int max_seq_len_; + + int tensor_para_size_; + int tensor_para_rank_; + int layer_para_size_; + int layer_para_rank_; + + // residual type + bool use_gptj_residual_; + + // prompt learning pair (task_name, (task_name_id, prompt_len)) + PromptLearningType prompt_learning_type_; + std::map> prompt_learning_pair_; + bool malloc_load_prompt_weights_ = false; + // each prompt token's weight size + size_t prompt_token_weight_size_ = 0; + + bool is_maintain_buffer = false; + const size_t num_base_weights = 4; + std::vector weights_ptr = std::vector(num_base_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/triton_backend/CMakeLists.txt b/src/fastertransformer/triton_backend/CMakeLists.txt index 0079e087a..9620cc1a3 100644 --- a/src/fastertransformer/triton_backend/CMakeLists.txt +++ b/src/fastertransformer/triton_backend/CMakeLists.txt @@ -26,3 +26,5 @@ if (ENABLE_FP8) add_subdirectory(multi_gpu_gpt_fp8) endif() add_subdirectory(bert) + +# add_subdirectory(llama) From a32fc1def50b2d07f8f131e306bdfb1c005ce0b8 Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 24 Apr 2023 07:32:13 +0000 Subject: [PATCH 02/27] make the code work :yay: --- examples/cpp/llama/check_with_huggingface.py | 15 +++++++------ .../cpp/llama/huggingface_llama_convert.py | 22 +++++++++---------- examples/cpp/llama/llama_example.cc | 2 +- .../models/llama/CMakeLists.txt | 4 ++-- 
src/fastertransformer/models/llama/Llama.h | 4 ++-- .../models/llama/LlamaContextDecoder.cc | 7 +++--- .../models/llama/LlamaDecoder.cc | 7 +++--- .../models/llama/LlamaDecoderLayerWeight.cc | 4 ++-- .../models/llama/LlamaWeight.h | 2 +- 9 files changed, 33 insertions(+), 34 deletions(-) diff --git a/examples/cpp/llama/check_with_huggingface.py b/examples/cpp/llama/check_with_huggingface.py index 0ba69036e..d1f356cc1 100644 --- a/examples/cpp/llama/check_with_huggingface.py +++ b/examples/cpp/llama/check_with_huggingface.py @@ -1,15 +1,16 @@ import transformers -import torch from transformers import LlamaForCausalLM, LlamaTokenizer tokenizer = LlamaTokenizer.from_pretrained('/data/llama-7b-hf') -prompt = "Hey" -inputs = tokenizer(prompt, return_tensors='pt') -print(inputs) +prompt = "Hey, are you consciours? Can you talk to me?" +inputs = tokenizer(prompt, return_tensors='pt') model = LlamaForCausalLM.from_pretrained("/data/llama-7b-hf") -generated_ids = model.generate(inputs.input_ids, max_length=10) +hf_config = vars(model.config) +print(hf_config) +generated_ids = model.forward(inputs.input_ids, output_hidden_states=True) print(generated_ids) -output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] -print(output) + +tokens = [0,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366] +print(tokenizer.decode(tokens)) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py index 28a552a28..37868d0bd 100644 --- a/examples/cpp/llama/huggingface_llama_convert.py +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -119,31 +119,31 @@ def split_and_convert(args): print(f"converting layer {l}") # first merge QKV into a single weight # concat direct to FT shape: [hidden_size, 3, head_num, head_size] - qkv_weights = np.empty((hidden_size, 3, head_num, head_size), dtype=np_weight_data_type) - q_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight']) - k_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight']) - v_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight']) - qkv_weights[:, 0, :, :] = q_weight.reshape(hidden_size, head_num, head_size) - qkv_weights[:, 1, :, :] = k_weight.reshape(hidden_size, head_num, head_size) - qkv_weights[:, 2, :, :] = v_weight.reshape(hidden_size, head_num, head_size) + # copied from huggingface_gptj_ckpt_convert.py + qkv_weights = np.stack([ + param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight']), + param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight']), + param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight']), + ]) + qkv_weights = np.transpose(qkv_weights, (2, 0, 1)) qkv_weights_base_name = f'model.layers.{l}.attention.query_key_value.weight' split_and_convert_process(saved_dir, factor, qkv_weights_base_name, qkv_weights) # attention dense - o_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']) + o_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight']).T o_weight_base_name = f'model.layers.{l}.attention.dense.weight' split_and_convert_process(saved_dir, factor, o_weight_base_name, o_weight) # MLP - mlp_down_weight = 
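# A sketch of the layout produced by the stack/transpose above, stated as an illustration of the
# intent rather than as converter output: q/k/v each leave param_to_weights as
# [hidden_size, hidden_size] arrays in Hugging Face's [out, in] order, np.stack yields
# [3, hidden_size, hidden_size], and transposing with axes (2, 0, 1) gives
# [hidden_size, 3, head_num * head_size], the fused QKV layout that split_and_convert_process
# then presumably slices along its last axis per tensor-parallel rank. The added .T on the dense,
# down, gate and up projections likewise appears to flip [out, in] storage into the [in, out]
# layout the FasterTransformer GEMMs expect.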
param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']) + mlp_down_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight']).T mlp_down_base_name = f'model.layers.{l}.mlp.down_proj.weight' split_and_convert_process(saved_dir, factor, mlp_down_base_name, mlp_down_weight) - mlp_gate_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']) + mlp_gate_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight']).T mlp_gate_base_name = f'model.layers.{l}.mlp.gate_proj.weight' split_and_convert_process(saved_dir, factor, mlp_gate_base_name, mlp_gate_weight) - mlp_up_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']) + mlp_up_weight = param_to_weights(model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight']).T mlp_up_base_name = f'model.layers.{l}.mlp.up_proj.weight' split_and_convert_process(saved_dir, factor, mlp_up_base_name, mlp_up_weight) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index c72a7a8b7..5672fc2cd 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -263,7 +263,7 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - const bool use_gptj_residual = (bool)reader.GetInteger(model_name, "use_gptj_residual", 1); + const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, vocab_size, diff --git a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt index ec836068d..88ad2ad42 100644 --- a/src/fastertransformer/models/llama/CMakeLists.txt +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -24,7 +24,7 @@ set_property(TARGET LlamaDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LlamaDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LlamaDecoder PUBLIC -lcudart cublasMMWrapper TensorParallelDecoderSelfAttentionLayer - TensorParallelGeluFfnLayer + TensorParallelSiluFfnLayer layernorm_kernels add_residual_kernels LlamaDecoderLayerWeight @@ -38,7 +38,7 @@ set_property(TARGET LlamaContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET LlamaContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(LlamaContextDecoder PUBLIC -lcudart cublasMMWrapper TensorParallelGptContextAttentionLayer - TensorParallelGeluFfnLayer + TensorParallelSiluFfnLayer layernorm_kernels add_residual_kernels gpt_kernels diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index cd79ac0b8..8ba6a88df 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -40,7 +40,7 @@ class Llama: public BaseLayer { size_t rotary_embedding_dim_; static constexpr bool neox_rotary_style_ = true; - static constexpr float layernorm_eps_ = 1e-5f; + static constexpr float layernorm_eps_ = 1e-6f; int start_id_; int end_id_; @@ -61,7 +61,7 @@ class Llama: public BaseLayer { std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); // Residual Type - const bool use_gptj_residual_ = true; + const bool use_gptj_residual_ = false; // Prompt Learning Parameters PromptLearningType prompt_learning_type_; diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index f107d38c1..8ea1494fb 100644 --- 
a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -18,7 +18,7 @@ #include "src/fastertransformer/kernels/bert_preprocess_kernels.h" #include "src/fastertransformer/kernels/gpt_kernels.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/TensorParallelSiluFfnLayer.h" #include "src/fastertransformer/layers/attention_layers/TensorParallelGptContextAttentionLayer.h" namespace fastertransformer { @@ -44,7 +44,7 @@ void LlamaContextDecoder::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, size_per_head_, @@ -57,7 +57,6 @@ void LlamaContextDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, true, // use_gated_activation = true; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -333,7 +332,7 @@ void LlamaContextDecoder::forward(std::unordered_map* invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + (const T*)nullptr, layernorm_eps_, h_token_num, hidden_units_, diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index d5fb58fee..23e9b0eec 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -15,7 +15,7 @@ */ #include "src/fastertransformer/models/llama/LlamaDecoder.h" -#include "src/fastertransformer/layers/TensorParallelGeluFfnLayer.h" +#include "src/fastertransformer/layers/TensorParallelSiluFfnLayer.h" #include "src/fastertransformer/layers/attention_layers/TensorParallelDecoderSelfAttentionLayer.h" namespace fastertransformer { @@ -39,7 +39,7 @@ void LlamaDecoder::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); - ffn_layer_ = new TensorParallelGeluFfnLayer(0, // max_batch_size + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 1, head_num_, size_per_head_, @@ -52,7 +52,6 @@ void LlamaDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, true, // use_gated_activation = true; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -264,7 +263,7 @@ void LlamaDecoder::forward(std::unordered_map* invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, - gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.beta, + (const T*)nullptr, layernorm_eps_, local_batch_size, hidden_units_, diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index ecd539a75..39a1d5c1f 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -154,13 +154,13 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType // FIXME(sunpeng17): check if the weights are correct loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + dir_path + ".mlp.gate_proj.weight." 
+ rank_spec + ".bin", model_file_type); deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); loadWeightFromBin(weights_ptr[8], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", model_file_type); deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h index 2f5b2632a..ec909ca49 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.h +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -37,7 +37,7 @@ struct LlamaWeight { const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0, - const bool use_gptj_residual_ = true, + const bool use_gptj_residual_ = false, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); From ce8700fa866b3b2616d8788e6005be731afc9377 Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 24 Apr 2023 09:30:41 +0000 Subject: [PATCH 03/27] fix llama rms ln --- src/fastertransformer/models/llama/Llama.cc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 0091fce48..d0305e84a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -843,16 +843,14 @@ void Llama::forward(std::unordered_map* output_ten } if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) { - invokeGeneralLayerNorm(normed_decoder_output_buf_ + hidden_units_offset, - decoder_output_buf_ + hidden_units_offset, - gpt_weights->post_decoder_layernorm.gamma, - gpt_weights->post_decoder_layernorm.beta, - layernorm_eps_, - local_batch_size * beam_width, - hidden_units_, - (float*)nullptr, - 0, - stream_); + invokeGeneralT5LayerNorm(normed_decoder_output_buf_ + hidden_units_offset, + decoder_output_buf_ + hidden_units_offset, + gpt_weights->post_decoder_layernorm.gamma, + (const T*)nullptr, + layernorm_eps_, + local_batch_size * beam_width, + hidden_units_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ == 1) { From 91989cb6fdf84e048247ffa4396f7ac3f574254d Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 03:08:37 +0000 Subject: [PATCH 04/27] add bf16 support --- examples/cpp/llama/llama_example.cc | 5 +++++ src/fastertransformer/models/llama/Llama.cc | 4 ++++ src/fastertransformer/models/llama/LlamaContextDecoder.cc | 4 ++++ src/fastertransformer/models/llama/LlamaDecoder.cc | 4 ++++ .../models/llama/LlamaDecoderLayerWeight.cc | 4 ++++ src/fastertransformer/models/llama/LlamaWeight.cc | 4 ++++ 6 files changed, 25 insertions(+) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 5672fc2cd..0f8e4a5aa 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -60,6 +60,11 @@ int main(int argc, char* argv[]) else if (data_type == "fp16") { llama_example(reader); } +#ifdef ENABLE_BF16 + else if (data_type == "bf16") { + llama_example<__nv_bfloat16>(reader); + } +#endif else { FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); return -1; diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index d0305e84a..0f547d2c5 100644 --- 
a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1206,4 +1206,8 @@ bool* Llama::getFinishBuffer() template class Llama; template class Llama; +#ifdef ENABLE_BF16 +template class Llama<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index 8ea1494fb..900e1f016 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -500,4 +500,8 @@ void LlamaContextDecoder::forward(std::unordered_map* template class LlamaContextDecoder; template class LlamaContextDecoder; +#ifdef ENABLE_BF16 +template class LlamaContextDecoder<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index 23e9b0eec..dd9a51cee 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -377,4 +377,8 @@ void LlamaDecoder::forward(std::unordered_map* template class LlamaDecoder; template class LlamaDecoder; +#ifdef ENABLE_BF16 +template class LlamaDecoder<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 39a1d5c1f..dc99451b9 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -222,4 +222,8 @@ void LlamaDecoderLayerWeight::mallocWeights() template struct LlamaDecoderLayerWeight; template struct LlamaDecoderLayerWeight; +#ifdef ENABLE_BF16 +template class LlamaDecoderLayerWeight<__nv_bfloat16>; +#endif + } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 65c2f762d..01741e4f2 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -298,4 +298,8 @@ bool LlamaWeight::isValidLayerParallelId(int l) template struct LlamaWeight; template struct LlamaWeight; +#ifdef ENABLE_BF16 +template class LlamaWeight<__nv_bfloat16>; +#endif + } // namespace fastertransformer From 4bc97c333cb60671547de71de30c5aad0c76eae1 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 04:56:41 +0000 Subject: [PATCH 05/27] add triton model for streaming callback --- examples/cpp/llama/CMakeLists.txt | 8 +- examples/cpp/llama/llama_config.ini | 2 +- examples/cpp/llama/llama_example.cc | 1 - examples/cpp/llama/llama_triton_example.cc | 73 +++-- .../triton_backend/CMakeLists.txt | 2 +- .../triton_backend/llama/CMakeLists.txt | 25 ++ .../triton_backend/llama/LlamaTritonModel.cc | 253 +++++++++++++++++ .../triton_backend/llama/LlamaTritonModel.h | 82 ++++++ .../llama/LlamaTritonModelInstance.cc | 265 ++++++++++++++++++ .../llama/LlamaTritonModelInstance.h | 82 ++++++ .../transformer_triton_backend.hpp | 1 + 11 files changed, 746 insertions(+), 48 deletions(-) create mode 100644 src/fastertransformer/triton_backend/llama/CMakeLists.txt create mode 100644 src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc create mode 100644 src/fastertransformer/triton_backend/llama/LlamaTritonModel.h create mode 100644 src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc create mode 
100644 src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt index 0495d3bf2..cdf9033dd 100644 --- a/examples/cpp/llama/CMakeLists.txt +++ b/examples/cpp/llama/CMakeLists.txt @@ -16,7 +16,7 @@ add_executable(llama_example llama_example.cc) target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart Llama nvtx_utils gpt_example_utils word_list mpi_utils nccl_utils) -# add_executable(llama_triton_example llama_triton_example.cc) -# target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread -# LlamaTritonBackend TransformerTritonBackend custom_ar_comm -# gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils) +add_executable(llama_triton_example llama_triton_example.cc) +target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart -lpthread + LlamaTritonBackend TransformerTritonBackend custom_ar_comm + gpt_example_utils word_list mpi_utils nccl_utils nvtx_utils) diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index 5882e03a4..6fa7117ca 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -6,7 +6,7 @@ tensor_para_size=1 pipeline_para_size=1 model_name=llama_7b -model_dir=/data/llama-7b-hf-converted +model_dir=/data/llama-7b-hf-converted/1-gpu [request] beam_width=1 # beam width for beam search diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 0f8e4a5aa..5e84e13d9 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -282,7 +282,6 @@ void llama_example(const INIReader reader) prompt_learning_type, prefix_prompt_table_pair); - model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 1840035a2..e639ae9ca 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -118,7 +118,7 @@ broadCastRequest(const std::vector& v_start_ids, pointer_record->push_back(start_ids_ptr); pointer_record->push_back(end_ids_ptr); - std::shared_ptr> input_tensors( + request_list.push_back(std::shared_ptr>( new std::unordered_map{ {"input_ids", triton::Tensor{triton::MEMORY_GPU, @@ -130,25 +130,18 @@ broadCastRequest(const std::vector& v_start_ids, triton::TYPE_INT32, std::vector{(size_t)request_batch_size}, d_input_lengths}}, - // NOTE: add prefix prompt task ids here if you need - // {"prefix_prompt_task_ids", triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, - // std::vector{request_batch_size}, task_name_ids}}, {"request_output_len", triton::Tensor{triton::MEMORY_CPU, - triton::TYPE_UINT32, + triton::TYPE_INT32, std::vector{(size_t)request_batch_size}, request_output_len_ptr}}, + {"bad_words_list", + triton::Tensor{ + triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}}, {"start_id", triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}}, {"end_id", - triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}}); - if (!v_input_bad_words.empty()) { - input_tensors->insert( - {"bad_words_list", - triton::Tensor{ - triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}}); - 
} - request_list.push_back(input_tensors); + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}})); int* beam_width_ptr = new int(param.beam_width); pointer_record->push_back(beam_width_ptr); @@ -234,10 +227,10 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std ft::FT_CHECK(false); } - const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); - const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); - const int start_id = reader.GetInteger(model_name, "start_id"); - const int end_id = reader.GetInteger(model_name, "end_id"); + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + + const int start_id = reader.GetInteger("llama_7b", "start_id"); + const int end_id = reader.GetInteger("llama_7b", "end_id"); std::vector v_start_ids; std::vector v_start_lengths; @@ -249,22 +242,22 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std max_input_len, end_id, 1, - "../examples/cpp/gptj/start_ids.csv"); + "../examples/cpp/llama/start_ids.csv"); std::vector v_bad_words; - ft::read_word_list("../examples/cpp/gptj/bad_words.csv", v_bad_words); + ft::read_word_list("../examples/cpp/llama/bad_words.csv", v_bad_words); RequestParam param; - param.beam_width = reader.GetInteger("ft_instance_hyperparameter", "beam_width"); + param.beam_width = reader.GetInteger("request", "beam_width"); param.request_output_len = reader.GetInteger("request", "request_output_len"); - param.beam_search_diversity_rate = reader.GetFloat("ft_instance_hyperparameter", "beam_search_diversity_rate"); - param.runtime_top_k = (uint)reader.GetInteger("ft_instance_hyperparameter", "top_k"); - param.runtime_top_p = reader.GetFloat("ft_instance_hyperparameter", "top_p"); - param.temperature = reader.GetFloat("ft_instance_hyperparameter", "temperature"); - param.len_penalty = reader.GetFloat("ft_instance_hyperparameter", "len_penalty"); - param.repetition_penalty = reader.GetFloat("ft_instance_hyperparameter", "repetition_penalty", 1.0f); - param.presence_penalty = reader.GetFloat("ft_instance_hyperparameter", "presence_penalty", 0.0f); - param.min_length = reader.GetInteger("ft_instance_hyperparameter", "min_length", 0); + param.beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); + param.runtime_top_k = reader.GetInteger("request", "top_k"); + param.runtime_top_p = reader.GetFloat("request", "top_p"); + param.temperature = reader.GetFloat("request", "temperature"); + param.len_penalty = reader.GetFloat("request", "len_penalty"); + param.repetition_penalty = reader.GetFloat("request", "repetition_penalty", 1.0f); + param.presence_penalty = reader.GetFloat("request", "presence_penalty", 0.0f); + param.min_length = reader.GetInteger("request", "min_length", 0); param.random_seed = (unsigned long long int)0; param.start_id = start_id; param.end_id = end_id; @@ -310,18 +303,20 @@ int main(int argc, char* argv[]) by MPI or triton */ + MPICHECK(MPI_Init(&argc, &argv)); ft::mpi::initialize(&argc, &argv); int node_id = ft::mpi::getCommWorldRank(); int node_num = ft::mpi::getCommWorldSize(); + std::cout << "node_id: " << node_id << ", node_num: " << node_num << std::endl; // Note: Only supports that all nodes have same gpu count const int gpu_count = ft::getDeviceCount(); const int world_size = node_num * gpu_count; - std::string ini_name = argc >= 2 ? 
std::string(argv[1]) : "../examples/cpp/gptj/gptj_config.ini"; + std::string ini_name = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.ini"; // step 1: Create model - std::shared_ptr model = AbstractTransformerModel::createGptJModel(ini_name); - int tensor_para_size = model->getTensorParaSize(); + std::shared_ptr model = AbstractTransformerModel::createLlamaModel(ini_name); + int tensor_para_size = model->getTensorParaSize(); int pipeline_para_size = model->getPipelineParaSize(); FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size), "World Size != Tensor Parallel Size * Pipeline Parallel Size !"); @@ -329,7 +324,7 @@ int main(int argc, char* argv[]) std::cout << model->toString(); // step 2: Initialize the NCCL - std::pair, std::vector> nccl_params = model->createNcclParams(node_id); + std::pair, std::vector> nccl_comms = model->createNcclParams(node_id); cudaDeviceSynchronize(); // Optional Step: create custom all reduce comm @@ -346,7 +341,7 @@ int main(int argc, char* argv[]) &model_instances, device_id, rank, - nccl_params, + nccl_comms, custom_all_reduce_comms[rank])); } for (auto& t : threads) { @@ -398,20 +393,16 @@ int main(int argc, char* argv[]) std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; for (size_t i = 0; i < outCount; i++) { - if (hBuf[i] == int(0)) { + if (hBuf[i] == int(0)) zeroCount++; - } outFile << hBuf[i] << " "; - if ((i + 1) % (seq_len) == 0) { + if ((i + 1) % (seq_len) == 0) outFile << std::endl; - } - if (i < 10) { + if (i < 10) printf("%5d ", hBuf[i]); - } - if ((i + 1) % (seq_len) == 0 && i < 10) { + if ((i + 1) % (seq_len) == 0 && i < 10) std::cout << std::endl; - } } std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; } diff --git a/src/fastertransformer/triton_backend/CMakeLists.txt b/src/fastertransformer/triton_backend/CMakeLists.txt index 9620cc1a3..63f3526da 100644 --- a/src/fastertransformer/triton_backend/CMakeLists.txt +++ b/src/fastertransformer/triton_backend/CMakeLists.txt @@ -27,4 +27,4 @@ if (ENABLE_FP8) endif() add_subdirectory(bert) -# add_subdirectory(llama) +add_subdirectory(llama) diff --git a/src/fastertransformer/triton_backend/llama/CMakeLists.txt b/src/fastertransformer/triton_backend/llama/CMakeLists.txt new file mode 100644 index 000000000..d5ba5547e --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.8) + +set(parallel_gpt_triton_backend_files + LlamaTritonModel.cc + LlamaTritonModelInstance.cc +) + +add_library(LlamaTritonBackend STATIC ${parallel_gpt_triton_backend_files}) +set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) +target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend Llama tensor memory_utils -lcublasLt) +target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14) diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc new file mode 100644 index 000000000..02cfc6e9e --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "3rdparty/INIReader.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include "src/fastertransformer/utils/allocator.h" + +namespace ft = fastertransformer; + +std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string inifile) +{ + INIReader reader = INIReader(inifile); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << inifile << "'\n"; + return nullptr; + } + + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); + std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir"); + + if (data_type == "half") { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } + if (data_type == "bf16") { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } + else { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } +} + +template +LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir): + tensor_para_size_(tensor_para_size), + pipeline_para_size_(pipeline_para_size), + shared_weights_(std::vector>>(ft::getDeviceCount())), + enable_custom_all_reduce_(enable_custom_all_reduce) +{ + model_dir_ = model_dir; + const 
std::string inifile{model_dir + "/config.ini"}; + INIReader reader = INIReader(inifile); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << inifile << "'\n"; + ft::FT_CHECK(false); + } + + model_name_ = reader.Get("llama", "model_name"); + head_num_ = reader.GetInteger("llama", "head_num"); + size_per_head_ = reader.GetInteger("llama", "size_per_head"); + inter_size_ = reader.GetInteger("llama", "inter_size"); + num_layer_ = reader.GetInteger("llama", "num_layer"); + vocab_size_ = reader.GetInteger("llama", "vocab_size"); + rotary_embedding_dim_ = reader.GetInteger("llama", "rotary_embedding"); + start_id_ = reader.GetInteger("llama", "start_id"); + end_id_ = reader.GetInteger("llama", "end_id"); + use_gptj_residual_ = false; + + num_tasks_ = reader.GetInteger("llama", "num_tasks", 0); + + prompt_learning_start_id_ = reader.GetInteger("llama", "prompt_learning_start_id", end_id_ + 1); + prompt_learning_type_ = + static_cast(reader.GetInteger("llama", "prompt_learning_type", 0)); + + for (int task_name_id = 0; task_name_id < num_tasks_; task_name_id++) { + std::string config_task_name = "task_" + std::to_string(task_name_id); + std::string task_name = reader.Get(config_task_name, "task_name"); + const int prompt_length = reader.GetInteger(config_task_name, "prompt_length", 0); + prompt_learning_table_pair_.insert({task_name, {task_name_id, prompt_length}}); + } +} + +template +std::unique_ptr LlamaTritonModel::createModelInstance( + int device_id, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); + + std::unique_ptr> allocator( + new ft::Allocator(device_id)); + + allocator->setStream(stream); + + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + + std::unique_ptr cublas_algo_map(new ft::cublasAlgoMap("gemm_config.in")); + std::unique_ptr cublas_wrapper_mutex(new std::mutex()); + std::unique_ptr cublas_wrapper(new ft::cublasMMWrapper( + cublas_handle, cublaslt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get())); + + std::unique_ptr cuda_device_prop_ptr(new cudaDeviceProp); + ft::check_cuda_error(cudaGetDeviceProperties(cuda_device_prop_ptr.get(), device_id)); + + if (std::is_same::value) { + cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper->setFP32GemmConfig(); + } + + ft::NcclParam tensor_para = nccl_params.first[comms_rank]; + ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; + + ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, + ft::getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + auto gpt = std::make_unique>( + ft::Llama(head_num_, + size_per_head_, + inter_size_, + num_layer_, + vocab_size_, + rotary_embedding_dim_, + start_id_, + end_id_, + prompt_learning_start_id_, // p/prompt tuning virtual token start id + prompt_learning_type_, + use_gptj_residual_, + 0.0f, // beam_search_diversity_rate_, + 0, // top_k_, + 0.0f, // top_p_, + 0, // random seed, note that all gpus should use same seed + 0.0f, // temperature_, + 0.0f, // len_penalty_, + 0.0f, // repetition_penalty_, + 
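The LlamaTritonModel constructor above reads its hyperparameters from the [llama] section of <model_dir>/config.ini (a separate file from the example's llama_config.ini, which only supplies the ft_instance_hyperparameter section). A hypothetical stand-alone checker for that file, using only the key names the constructor reads; the helper itself is not part of this patch.

import configparser
import sys

# Keys read from the [llama] section of <model_dir>/config.ini by the
# constructor above. Optional prompt-learning keys are not checked here.
REQUIRED_KEYS = ["model_name", "head_num", "size_per_head", "inter_size",
                 "num_layer", "vocab_size", "rotary_embedding",
                 "start_id", "end_id"]

def check_config(model_dir):
    cfg = configparser.ConfigParser()
    if not cfg.read(f"{model_dir}/config.ini"):
        sys.exit(f"[ERROR] cannot read {model_dir}/config.ini")
    missing = [k for k in REQUIRED_KEYS if not cfg.has_option("llama", k)]
    if missing:
        sys.exit(f"[ERROR] [llama] section is missing: {missing}")
    print("config.ini looks complete for LlamaTritonModel")

if __name__ == "__main__":
    check_config(sys.argv[1] if len(sys.argv) > 1 else ".")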
tensor_para, + pipeline_para, + stream, + cublas_wrapper.get(), + allocator.get(), + false, + cuda_device_prop_ptr.get(), + attention_type, + custom_all_reduce_comm, + enable_custom_all_reduce_)); + + return std::unique_ptr>( + new LlamaTritonModelInstance(std::move(gpt), + shared_weights_[device_id], + std::move(allocator), + std::move(cublas_algo_map), + std::move(cublas_wrapper_mutex), + std::move(cublas_wrapper), + std::move(cuda_device_prop_ptr))); +} + +template +void LlamaTritonModel::createSharedWeights(int device_id, int rank) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + const int tensor_para_rank = rank % tensor_para_size_; + const int pipeline_para_rank = rank / tensor_para_size_; + shared_weights_[device_id] = std::make_shared>(head_num_ * size_per_head_, + inter_size_, + vocab_size_, + num_layer_, + 0, // max_seq_len, deprecated + tensor_para_size_, + tensor_para_rank, + pipeline_para_size_, + pipeline_para_rank, + use_gptj_residual_, + prompt_learning_type_, + prompt_learning_table_pair_); + shared_weights_[device_id]->loadModel(model_dir_); + return; +} + +template +std::string LlamaTritonModel::toString() +{ + std::stringstream ss; + ss << "Model: " + << "\nhead_num: " << head_num_ << "\nsize_per_head: " << size_per_head_ << "\ninter_size: " << inter_size_ + << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nstart_id: " << start_id_ + << "\nend_id: " << end_id_ << "\nuse_gptj_residual: " << use_gptj_residual_ + << "\nprompt_learning_type_: " << static_cast(prompt_learning_type_) + << "\nprompt_learning_start_id_: " << prompt_learning_start_id_ << "\ntensor_para_size: " << tensor_para_size_ + << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ + << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << std::endl; + return ss.str(); +} + +template +void LlamaTritonModel::createCustomComms( + std::vector>* custom_all_reduce_comms, int world_size) +{ + using commDataType = typename ft::CustomARCommTypeConverter::Type; + ft::initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); +} + +template +int LlamaTritonModel::getTensorParaSize() +{ + return tensor_para_size_; +} + +template +int LlamaTritonModel::getPipelineParaSize() +{ + return pipeline_para_size_; +} + +template struct LlamaTritonModel; +template struct LlamaTritonModel; + +#ifdef ENABLE_BF16 +template struct LlamaTritonModel<__nv_bfloat16>; +#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h new file mode 100644 index 000000000..0775ed05f --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include + +namespace ft = fastertransformer; + +template +struct LlamaTritonModel: public AbstractTransformerModel { + LlamaTritonModel(size_t tensor_para_size, + size_t pipeline_para_size, + int enable_custom_all_reduce, + std::string model_dir); + + ~LlamaTritonModel() = default; + + virtual std::unique_ptr + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) override; + + virtual void createSharedWeights(int deviceId, int rank) override; + + virtual void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) override; + + virtual std::string toString() override; + virtual int getTensorParaSize() override; + virtual int getPipelineParaSize() override; + +private: + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + int start_id_; + int end_id_; + size_t tensor_para_size_; + size_t pipeline_para_size_; + + // shared weights for each device + std::vector>> shared_weights_; + + // residual type + bool use_gptj_residual_ = false; + + // number of tasks (for prefix-prompt, p/prompt-tuning) + size_t num_tasks_ = 0; + int prompt_learning_start_id_ = 0; + ft::PromptLearningType prompt_learning_type_ = ft::PromptLearningType::no_prompt; + std::map> prompt_learning_table_pair_ = {}; + + bool is_fp16_; + int enable_custom_all_reduce_ = 0; + + std::string model_name_; + std::string model_dir_; +}; diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc new file mode 100644 index 000000000..4e6f841f9 --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include "src/fastertransformer/triton_backend/triton_utils.hpp" +#include "src/fastertransformer/utils/Tensor.h" +#include +#include +#include +#include + +namespace ft = fastertransformer; + +template +void triton_stream_callback(std::unordered_map* output_tensors, void* ctx) +{ + LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); + auto result = LlamaTritonModelInstance::convert_outputs(*output_tensors); + + model->stream_cb_(result, model->stream_ctx_); +} + +template +LlamaTritonModelInstance::LlamaTritonModelInstance( + std::unique_ptr> gpt, + std::shared_ptr> gpt_weight, + std::unique_ptr> allocator, + std::unique_ptr cublas_algo_map, + std::unique_ptr cublas_wrapper_mutex, + std::unique_ptr cublas_wrapper, + std::unique_ptr cuda_device_prop_ptr): + gpt_(std::move(gpt)), + gpt_weight_(gpt_weight), + allocator_(std::move(allocator)), + cublas_algo_map_(std::move(cublas_algo_map)), + cublas_wrapper_mutex_(std::move(cublas_wrapper_mutex)), + cublas_wrapper_(std::move(cublas_wrapper)), + cuda_device_prop_ptr_(std::move(cuda_device_prop_ptr)) +{ +} + +template +std::unordered_map LlamaTritonModelInstance::convert_inputs( + std::shared_ptr> input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + + move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_); + move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_); + + const size_t request_batch_size = input_tensors->at("input_ids").shape[0]; + const size_t input_data_len = input_tensors->at("input_ids").shape[1]; + h_total_output_lengths_ = reinterpret_cast(malloc(request_batch_size * sizeof(uint32_t))); + for (int i = 0; i < request_batch_size; ++i) { + h_total_output_lengths_[i] = + reinterpret_cast(input_tensors->at("request_output_len").data)[i] + input_data_len; + } + + std::unordered_map ft_input_tensors = std::unordered_map{ + {"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)}, + {"input_lengths", as_GPU_tensor(input_tensors->at("input_lengths"), d_input_lengths_)}, + {"output_seq_len", + ft::Tensor{ft::MEMORY_CPU, + ft::TYPE_UINT32, + {input_tensors->at("request_output_len").shape[0]}, + h_total_output_lengths_}}}; + + if (input_tensors->find("bad_words_list") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("bad_words_list"), d_input_bad_words_, &allocator_); + ft_input_tensors.insert( + {"bad_words_list", as_GPU_tensor(input_tensors->at("bad_words_list"), d_input_bad_words_)}); + } + + if (input_tensors->find("stop_words_list") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("stop_words_list"), d_input_stop_words_, &allocator_); + ft_input_tensors.insert( + {"stop_words_list", as_GPU_tensor(input_tensors->at("stop_words_list"), d_input_stop_words_)}); + } + + if (input_tensors->count("request_prompt_embedding") && input_tensors->count("request_prompt_lengths") + && input_tensors->count("request_prompt_type")) { + + move_tensor_H2D(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_, &allocator_); + ft_input_tensors.insert( + {"request_prompt_lengths", + as_GPU_tensor(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_)}); + + move_tensor_H2D(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_, &allocator_); + ft_input_tensors.insert( + {"request_prompt_embedding", + 
as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)}); + } + + if (input_tensors->find("top_p_decay") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("top_p_decay"), d_top_p_decay_, &allocator_); + ft_input_tensors.insert({"top_p_decay", as_GPU_tensor(input_tensors->at("top_p_decay"), d_top_p_decay_)}); + } + if (input_tensors->find("top_p_min") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("top_p_min"), d_top_p_min_, &allocator_); + ft_input_tensors.insert({"top_p_min", as_GPU_tensor(input_tensors->at("top_p_min"), d_top_p_min_)}); + } + if (input_tensors->find("top_p_reset_ids") != input_tensors->end()) { + move_tensor_H2D(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_, &allocator_); + ft_input_tensors.insert( + {"top_p_reset_ids", as_GPU_tensor(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_)}); + } + + for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { + if (t->first.find("input_ids") == std::string::npos && t->first.find("input_lengths") == std::string::npos + && t->first.find("output_seq_len") == std::string::npos + && t->first.find("prefix_soft_prompt_embedding") == std::string::npos + && t->first.find("prefix_soft_prompt_lengths") == std::string::npos) { + if (ft_input_tensors.count(t->first) == 0) { + ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()}); + } + } + } + + return ft_input_tensors; +} + +template +std::shared_ptr> +LlamaTritonModelInstance::convert_outputs(const std::unordered_map& output_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + std::unordered_map* outputs_mapping = + new std::unordered_map(); + + for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) { + outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)}); + } + + return std::shared_ptr>(outputs_mapping); +} + +template +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) +{ + ft::FT_CHECK(false); + return nullptr; +} + +template +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2, + "input_tensors->at(\"input_ids\").shape.size() == 2"); + FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1, + "input_tensors->at(\"input_lengths\").shape.size() == 1"); + + const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0]; + const uint32_t max_request_output_len = (size_t)*std::max_element( + (int*)input_tensors->at("request_output_len").data, + (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]); + const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; + const uint32_t beam_width = + input_tensors->count("beam_width") ? 
(size_t)(*(uint*)input_tensors->at("beam_width").data) : 1; + + allocateBuffer(request_batch_size, beam_width, total_output_len, max_request_output_len); + + std::unordered_map ft_input_tensors = convert_inputs(input_tensors); + + std::unordered_map output_tensors = std::unordered_map{ + {"output_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_UINT32, + std::vector{request_batch_size, beam_width, total_output_len}, + d_output_ids_}}, + {"sequence_length", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_UINT32, + std::vector{request_batch_size, beam_width}, + d_sequence_lengths_}}}; + + if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) { + output_tensors.insert({"output_log_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{request_batch_size, beam_width, max_request_output_len}, + d_output_log_probs_}}); + output_tensors.insert({"cum_log_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{request_batch_size, beam_width}, + d_cum_log_probs_}}); + } + try { + if (stream_cb_ != nullptr) { + gpt_->registerCallback(triton_stream_callback, this); + } + + gpt_->forward(&output_tensors, &ft_input_tensors, gpt_weight_.get()); + + if (stream_cb_ != nullptr) { + gpt_->unRegisterCallback(); + } + } + catch (...) { + h_exception_ = std::current_exception(); + output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}}); + } + + if (h_total_output_lengths_ != nullptr) { + free(h_total_output_lengths_); + h_total_output_lengths_ = nullptr; + } + + return convert_outputs(output_tensors); +} + +template +LlamaTritonModelInstance::~LlamaTritonModelInstance() +{ + freeBuffer(); +} + +template +void LlamaTritonModelInstance::allocateBuffer(const size_t request_batch_size, + const size_t beam_width, + const size_t total_output_len, + const size_t max_request_output_len) +{ + d_output_ids_ = (int*)(allocator_->reMalloc( + d_output_ids_, sizeof(int) * request_batch_size * beam_width * total_output_len, false)); + d_sequence_lengths_ = + (int*)(allocator_->reMalloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width, false)); + d_output_log_probs_ = (float*)(allocator_->reMalloc( + d_output_log_probs_, sizeof(float) * request_batch_size * beam_width * max_request_output_len, false)); + d_cum_log_probs_ = + (float*)(allocator_->reMalloc(d_cum_log_probs_, sizeof(float) * request_batch_size * beam_width, false)); +} + +template +void LlamaTritonModelInstance::freeBuffer() +{ + allocator_->free((void**)(&d_output_ids_)); + allocator_->free((void**)(&d_sequence_lengths_)); + allocator_->free((void**)(&d_output_log_probs_)); + allocator_->free((void**)(&d_cum_log_probs_)); +} + +template struct LlamaTritonModelInstance; +template struct LlamaTritonModelInstance; + +#ifdef ENABLE_BF16 +template struct LlamaTritonModelInstance<__nv_bfloat16>; +#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h new file mode 100644 index 000000000..0a2418641 --- /dev/null +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "src/fastertransformer/models/llama/Llama.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include + +namespace ft = fastertransformer; + +template +struct LlamaTritonModelInstance: AbstractTransformerModelInstance { + + LlamaTritonModelInstance(std::unique_ptr> gpt, + std::shared_ptr> gpt_weight, + std::unique_ptr> allocator, + std::unique_ptr cublas_algo_map, + std::unique_ptr cublas_wrapper_mutex, + std::unique_ptr cublas_wrapper, + std::unique_ptr cuda_device_prop_ptr); + ~LlamaTritonModelInstance(); + + std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; + + std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; + + static std::shared_ptr> + convert_outputs(const std::unordered_map& output_tensors); + +private: + const std::unique_ptr> gpt_; + const std::shared_ptr> gpt_weight_; + const std::unique_ptr> allocator_; + const std::unique_ptr cublas_algo_map_; + const std::unique_ptr cublas_wrapper_mutex_; + const std::unique_ptr cublas_wrapper_; + const std::unique_ptr cuda_device_prop_ptr_; + + std::unordered_map + convert_inputs(std::shared_ptr> input_tensors); + + void allocateBuffer(const size_t request_batch_size, + const size_t beam_width, + const size_t total_output_len, + const size_t max_request_output_len); + void freeBuffer(); + + int* d_input_ids_ = nullptr; + int* d_input_lengths_ = nullptr; + int* d_input_bad_words_ = nullptr; + int* d_input_stop_words_ = nullptr; + int* d_request_prompt_lengths_ = nullptr; + T* d_request_prompt_embedding_ = nullptr; + float* d_top_p_decay_ = nullptr; + float* d_top_p_min_ = nullptr; + int* d_top_p_reset_ids_ = nullptr; + + int* d_output_ids_ = nullptr; + int* d_sequence_lengths_ = nullptr; + float* d_output_log_probs_ = nullptr; + float* d_cum_log_probs_ = nullptr; + + uint32_t* h_total_output_lengths_ = nullptr; + std::exception_ptr h_exception_ = nullptr; +}; diff --git a/src/fastertransformer/triton_backend/transformer_triton_backend.hpp b/src/fastertransformer/triton_backend/transformer_triton_backend.hpp index 47cf6750c..1567b7310 100644 --- a/src/fastertransformer/triton_backend/transformer_triton_backend.hpp +++ b/src/fastertransformer/triton_backend/transformer_triton_backend.hpp @@ -293,6 +293,7 @@ struct AbstractTransformerModel { static std::shared_ptr createGptNeoXModel(std::string inifile); static std::shared_ptr createT5Model(std::string model_dir); static std::shared_ptr createT5EncoderModel(std::string model_dir); + static std::shared_ptr createLlamaModel(std::string inifile); std::pair, std::vector> createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false); From a6d51ec08d428cc27a0e949be3f3c5ea4fd5a0c6 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 05:37:26 +0000 Subject: [PATCH 06/27] register RMS for bf16 --- src/fastertransformer/kernels/layernorm_kernels.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu 
b/src/fastertransformer/kernels/layernorm_kernels.cu index 369030b37..60e6f001a 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -1490,6 +1490,17 @@ template void invokeGeneralAddResidualT5PreLayerNorm(half* output, int n, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeGeneralAddResidualT5PreLayerNorm(__nv_bfloat16* output, + __nv_bfloat16* norm_output, + const __nv_bfloat16* input, + const __nv_bfloat16* gamma, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream); +#endif + template void invokeGeneralAddBiasResidualT5PreLayerNorm(T* output, T* norm_output, From 7a72ca33f49ba901fb093ad3d753561deca8654f Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 12:10:57 +0000 Subject: [PATCH 07/27] revert bf16 --- src/fastertransformer/models/llama/Llama.cc | 4 ---- src/fastertransformer/models/llama/LlamaContextDecoder.cc | 4 ---- src/fastertransformer/models/llama/LlamaDecoder.cc | 4 ---- src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc | 4 ---- src/fastertransformer/models/llama/LlamaWeight.cc | 4 ---- 5 files changed, 20 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 0f547d2c5..d0305e84a 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1206,8 +1206,4 @@ bool* Llama::getFinishBuffer() template class Llama; template class Llama; -#ifdef ENABLE_BF16 -template class Llama<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index 900e1f016..8ea1494fb 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -500,8 +500,4 @@ void LlamaContextDecoder::forward(std::unordered_map* template class LlamaContextDecoder; template class LlamaContextDecoder; -#ifdef ENABLE_BF16 -template class LlamaContextDecoder<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index dd9a51cee..23e9b0eec 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -377,8 +377,4 @@ void LlamaDecoder::forward(std::unordered_map* template class LlamaDecoder; template class LlamaDecoder; -#ifdef ENABLE_BF16 -template class LlamaDecoder<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index dc99451b9..39a1d5c1f 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -222,8 +222,4 @@ void LlamaDecoderLayerWeight::mallocWeights() template struct LlamaDecoderLayerWeight; template struct LlamaDecoderLayerWeight; -#ifdef ENABLE_BF16 -template class LlamaDecoderLayerWeight<__nv_bfloat16>; -#endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 01741e4f2..65c2f762d 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -298,8 +298,4 @@ bool 
LlamaWeight::isValidLayerParallelId(int l) template struct LlamaWeight; template struct LlamaWeight; -#ifdef ENABLE_BF16 -template class LlamaWeight<__nv_bfloat16>; -#endif - } // namespace fastertransformer From 9820565367d9c8f3d89a5769c74ad09cf853e781 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 13:15:42 +0000 Subject: [PATCH 08/27] revert bf16 --- CMakeLists.txt | 10 ++++++++-- .../triton_backend/llama/LlamaTritonModel.cc | 10 ---------- .../triton_backend/llama/LlamaTritonModelInstance.cc | 3 --- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 870e67f0a..5eb40abe8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -348,6 +348,12 @@ add_library(transformer-shared SHARED $ $ $ + $ + $ + $ + $ + $ + $ $ $ $ @@ -428,9 +434,9 @@ target_link_libraries(transformer-shared PUBLIC -lnvToolsExt ) endif() - + if (ENABLE_FP8) -target_link_libraries(transformer-shared PUBLIC +target_link_libraries(transformer-shared PUBLIC $ $ $ diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index 02cfc6e9e..a7f336843 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -41,13 +41,6 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_dir); } - if (data_type == "bf16") { - return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); - } else { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), @@ -248,6 +241,3 @@ int LlamaTritonModel::getPipelineParaSize() template struct LlamaTritonModel; template struct LlamaTritonModel; -#ifdef ENABLE_BF16 -template struct LlamaTritonModel<__nv_bfloat16>; -#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc index 4e6f841f9..593fc6c97 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc @@ -260,6 +260,3 @@ void LlamaTritonModelInstance::freeBuffer() template struct LlamaTritonModelInstance; template struct LlamaTritonModelInstance; -#ifdef ENABLE_BF16 -template struct LlamaTritonModelInstance<__nv_bfloat16>; -#endif From bfeebefaaf486432ca57249392d4db3b85a83570 Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 25 Apr 2023 13:38:12 +0000 Subject: [PATCH 09/27] bugfix --- examples/cpp/llama/llama_example.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 5e84e13d9..0105cb6aa 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -60,11 +60,6 @@ int main(int argc, char* argv[]) else if (data_type == "fp16") { llama_example(reader); } -#ifdef ENABLE_BF16 - else if (data_type == "bf16") { - llama_example<__nv_bfloat16>(reader); - } -#endif else { FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); return -1; From 0379cc5cbf085673a2527ec9fd28f7dd99eec537 Mon Sep 17 00:00:00 2001 From: void-main Date: Fri, 28 Apr 2023 07:31:04 +0000 Subject: 
[PATCH 10/27] add megatron llama convert --- .../gpt/utils/megatron_ckpt_convert_llama.py | 539 ++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py new file mode 100644 index 000000000..02b4cf12a --- /dev/null +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -0,0 +1,539 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import configparser +import datetime +import json +import multiprocessing +import pathlib +import re +import shutil +import sys + +import numpy as np +import torch # pytype: disable=import-error + +# verify if root package is in PYTHONPATH +__root_package_path__ = pathlib.Path(__file__).parent.parent.parent.parent.parent.absolute().as_posix() +if __root_package_path__ not in sys.path: + print( + f"[ERROR] add project root directory to your PYTHONPATH with " + f"'export PYTHONPATH={__root_package_path__}:${{PYTHONPATH}}'" + ) + +from examples.pytorch.gpt.utils.gpt import DEFAULT_START_TAG, DEFAULT_END_TAG, OPENAI_GPT2_START_ID, OPENAI_GPT2_END_ID +from examples.pytorch.utils import torch2np, safe_transpose, cpu_map_location, gpu_map_location, WEIGHT2DTYPE + + +def _inject_model_parallel_rank( + filepath, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + tensor_model_parallel_rank=0, + pipeline_model_parallel_rank=0, +): + """ + Injects tensor/pipeline model parallel ranks into the filepath. + Does nothing if not using model parallelism. + """ + filepath = pathlib.Path(filepath) + if tensor_model_parallel_size > 1 or pipeline_model_parallel_size > 1: + # filepath needs to be updated to include mp_rank + if pipeline_model_parallel_size is None or pipeline_model_parallel_size == 1: + filepath = filepath.parent / f"mp_rank_{tensor_model_parallel_rank:02d}" / filepath.name + else: + filepath = ( + filepath.parent / + f"mp_rank_{tensor_model_parallel_rank:02d}_{pipeline_model_parallel_rank:03d}" / + filepath.name + ) + if not filepath.exists(): + filepath = ( + filepath.parent / + f"tp_rank_{tensor_model_parallel_rank:02d}_pp_rank_{pipeline_model_parallel_rank:03d}" / + filepath.name + ) + return filepath + else: + if filepath.exists(): + return filepath + else: + return filepath.parent / "mp_rank_00" / filepath.name + + +def _create_model_training_args_for_checkpoint_version_0(args, model_00): + model_training_args = argparse.Namespace() + if args.head_num is None or args.trained_tensor_parallel_size is None: + raise ValueError( + "Provided checkpoint have missing training args. 
" + "Thus it is required to provide -head_num and -trained_tensor_parallel_size CLI arguments" + ) + model_training_args.num_attention_heads = args.head_num + model_training_args.tensor_model_parallel_size = args.trained_tensor_parallel_size + # megatron ckpt_ver=0 only supports pipeline_parallel_size = 1 + model_training_args.pipeline_model_parallel_size = 1 + model_training_args.max_position_embeddings = \ + model_00["model"]["language_model"]["embedding"]["position_embeddings"]["weight"].shape[0] + model_training_args.hidden_size = \ + model_00["model"]["language_model"]["embedding"]["position_embeddings"]["weight"].shape[1] + model_training_args.ffn_hidden_size = 4 * model_training_args.hidden_size + + def get_layer_num_from_weights(model_keys): + layer_num = 1 + for key in model_keys: + if re.search(r'\d+', key) is not None: + layer_num = max(int(re.search(r'\d+', key).group()), layer_num) + return layer_num + 1 + + model_training_args.num_layers = \ + get_layer_num_from_weights(model_00["model"]["language_model"]['transformer'].keys()) + + model_training_args.layernorm_epsilon = 1e-6 + + return model_training_args + + +# This tool is used to support the new megatron model trained by pipeline parallel + tensor parallel +def merge_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): + saved_dir = pathlib.Path(saved_dir) + if key.find("layers.") != -1: + layer_index = (int)(key[7 : key.find(".", 7)]) + saved_key = key.replace( + "layers.%d." % layer_index, + "layers.%d." % (layer_index + pipeline_para_rank * model_training_args.num_layers // model_training_args.pipeline_model_parallel_size)) + + if saved_key.find("self_attention") != -1: + saved_key = saved_key.replace("self_attention", "attention") + if saved_key.find("adaptor1") != -1: + saved_key = saved_key.replace("adaptor1", "after_attention_adapter") + if saved_key.find("adaptor2") != -1: + saved_key = saved_key.replace("adaptor2", "after_ffn_adapter") + else: + saved_key = key + major_device = transformer_model_list[0][key].device + + if ( + key.find("input_layernorm.weight") != -1 + or key.find("post_attention_layernorm.weight") != -1 + or key.find("final_layernorm.weight") != -1): + + # shared weights, only need to convert the weights of rank 0 + if i == 0: + saved_path = saved_dir / f"model.{saved_key}.bin" + val = safe_transpose(transformer_model_list[0][key]) + val = torch2np(val, np_weight_data_type) + val = np.squeeze(val) + val.tofile(saved_path) + + elif (key.find("attention.dense.weight") != -1 + or key.find("mlp.dense_4h_to_h.weight") != -1 + or key.find("adaptor1.dense_4h_to_h.weight") != -1 + or key.find("adaptor2.dense_4h_to_h.weight") != -1): + vals = [ + safe_transpose(transformer_model_list[k][key]).float().to(major_device) + for k in range(factor) + ] + val = torch.cat(vals, dim=0) + val = torch2np(val, np_weight_data_type) + saved_path = saved_dir / f"model.{saved_key}.{i:d}.bin" + val.tofile(saved_path) + + elif (key.find("mlp.dense_h_to_4h.weight") != -1 + or key.find("adaptor1.dense_h_to_4h.weight") != -1 + or key.find("adaptor2.dense_h_to_4h.weight") != -1): + vals = [ + safe_transpose(transformer_model_list[k][key]).float().to(major_device) + for k in range(factor) + ] + val = torch.cat(vals, dim=-1) + val = torch2np(val, np_weight_data_type) + saved_path = saved_dir / f"model.{saved_key}.{i:d}.bin" + val.tofile(saved_path) + + + elif key.find("attention.query_key_value.weight") != -1: + vals = [] + for k in 
range(factor): + val = safe_transpose(transformer_model_list[k][key]).float() + hidden_dim = val.shape[0] + local_dim = int(val.shape[-1] / 3) + if ckpt_ver == 3: + num_splits = 3 + head_num = model_training_args.num_attention_heads + size_per_head = hidden_dim // head_num + head_num = head_num // model_training_args.tensor_model_parallel_size + val = val.reshape(hidden_dim, head_num, num_splits, size_per_head) + val = val.permute(0, 2, 1, 3) + val = val.reshape(hidden_dim, 3, local_dim) + vals.append(val.to(major_device)) + val = torch.cat(vals, dim=-1) + val = torch2np(val, np_weight_data_type) + saved_path = saved_dir / f"model.{saved_key}.{i:d}.bin" + val.tofile(saved_path) + + else: + print(f"[ERROR] cannot find key '{key}'") + +def preprocess_h_to_4h(val, training_tp_size, np_weight_data_type): + """ + megatron saved format: [TP_train, 2, inter_size / 2 / TP, hidden_size] + FT needed format: [hidden_size, inter_size], [hidden_size, inter_size] + This function translates megatron weight to FT weight + """ + val_shape = val.shape + val = val.view(training_tp_size, + 2, + val_shape[0] // (2 * training_tp_size), + val_shape[1]) + val = val.transpose(0, 1).contiguous() + val = val.view(*val_shape) + + val = safe_transpose(val) + return torch2np(val, np_weight_data_type) + +def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): + val = safe_transpose(transformer_model_list[0][key]) + val = torch2np(val, np_weight_data_type) + print(f"key: {key}, shape: {val.shape}") + if key.find("layers.") != -1: + layer_index = (int)(key[7 : key.find(".", 7)]) + saved_key = key.replace( + "layers.%d." % layer_index, + "layers.%d." % (layer_index + pipeline_para_rank * model_training_args.num_layers // model_training_args.pipeline_model_parallel_size)) + + if saved_key.find("self_attention") != -1: + saved_key = saved_key.replace("self_attention", "attention") + if saved_key.find("mlp.dense_4h_to_h")!= -1: + saved_key = saved_key.replace("mlp.dense_4h_to_h", "mlp.down_proj") + + else: + saved_key = key + + if ( + key.find("input_layernorm.weight") != -1 + or key.find("post_attention_layernorm.weight") != -1 + or key.find("final_layernorm.weight") != -1 + ): + # shared weights, only need to convert the weights of rank 0 + if i == 0: + saved_path = saved_dir / f"model.{saved_key}.bin" + val.tofile(saved_path.as_posix()) + + elif (key.find("attention.dense.weight") != -1 + or key.find("mlp.dense_4h_to_h.weight") != -1): + split_vals = np.split(val, factor, axis=0) + for j in range(factor): + print(f'saving {saved_key}, shape: {split_vals[j].shape}') + saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" + split_vals[j].tofile(saved_path.as_posix()) + + elif (key.find("mlp.dense_h_to_4h.weight") != -1): + gate_weight, up_weight = np.split(val, 2, axis=-1) + print(f'dense_h_to_4h shape: {val.shape}, gate shape: {gate_weight.shape}, up shape: {up_weight.shape}') + + split_gate_weight = np.split(gate_weight, factor, axis=-1) + proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.gate_proj.weight') + for j in range(factor): + print(f'saving {proj_key}, shape: {split_gate_weight[j].shape}') + saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" + split_gate_weight[j].tofile(saved_path.as_posix()) + + split_up_weight = np.split(up_weight, factor, axis=-1) + proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.up_proj.weight') + for j in range(factor): + print(f'saving 
{proj_key}, shape: {split_up_weight[j].shape}') + saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" + split_up_weight[j].tofile(saved_path.as_posix()) + + + elif key.find("attention.query_key_value.weight") != -1: + hidden_dim = val.shape[0] + local_dim = int(val.shape[-1] / 3) + + if ckpt_ver == 3: + num_splits = 3 + head_num = model_training_args.num_attention_heads + size_per_head = hidden_dim // head_num + head_num = head_num // model_training_args.tensor_model_parallel_size + val = val.reshape(hidden_dim, head_num, num_splits, size_per_head) + val = val.transpose(0, 2, 1, 3) + + val = val.reshape(hidden_dim, 3, local_dim) + split_vals = np.split(val, factor, axis=-1) + + for j in range(factor): + print(f'saving {saved_key}, shape: {split_vals[j].shape}') + saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" + split_vals[j].tofile(saved_path.as_posix()) + + else: + print(f"[ERROR] cannot find key '{key}'") + + +def _get_checkpoint_name(checkpoint_dir): + + checkpoint_dir = pathlib.Path(checkpoint_dir) + patterns = [ + "model_optim_rng.pt", # older megatron checkpoints + "*last.ckpt", # newer format of checkpoints + ] + for pattern in patterns: + model_files = sorted(list(checkpoint_dir.rglob(pattern))) + if model_files: + return model_files[0].name + + raise ValueError(f"Could not find checkpoint files in {checkpoint_dir}") + + +def convert_checkpoint(args): + saved_dir = pathlib.Path(args.saved_dir) / f"{args.infer_gpu_num:d}-gpu" + if saved_dir.exists(): + print(f"[ERROR] Remove {saved_dir} target directory before running conversion") + sys.exit(1) + saved_dir.mkdir(parents=True) + + if args.vocab_path: + shutil.copy(args.vocab_path, (saved_dir / "vocab.json").as_posix()) + if args.merges_path: + shutil.copy(args.merges_path, (saved_dir / "merges.txt").as_posix()) + + load_checkpoints_to_cpu = bool(args.load_checkpoints_to_cpu) + map_location_fn = cpu_map_location if load_checkpoints_to_cpu else gpu_map_location + + checkpoints_dir = pathlib.Path(args.in_file) + checkpoint_name = _get_checkpoint_name(checkpoints_dir) + + # load position_embedding from rank 0 + checkpoints_paths = sorted(checkpoints_dir.rglob(checkpoint_name)) + if not checkpoints_paths: + print(f"[ERROR] Cannot find checkpoint in {checkpoints_dir}.") + exit(1) + model_00 = torch.load(checkpoints_paths[0].as_posix(), map_location=map_location_fn) + + if "hyper_parameters" in list(model_00.keys()): + print("Use nemo_ckpt_converter.py script for conversion of this checkpoint") + exit(1) + elif "args" in list(model_00.keys()): + checkpoint_version = model_00["checkpoint_version"] + model_training_args = model_00["args"] + megatron_gpt_key = "encoder" + else: + checkpoint_version = 0 + model_training_args = _create_model_training_args_for_checkpoint_version_0(args, model_00) + megatron_gpt_key = "transformer" + + with (saved_dir / "args.txt").open("w") as training_args_file: + for k, v in vars(model_training_args).items(): + training_args_file.write(f"{k}:{v}\n") + + np_weight_data_type = WEIGHT2DTYPE[args.weight_data_type] + + del model_00 + w_e_list = [] + w_e_head_list = [] + + training_tensor_para_size = model_training_args.tensor_model_parallel_size + training_pipeline_para_size = model_training_args.pipeline_model_parallel_size + inference_tensor_para_size = args.infer_gpu_num + + model_weights_paths = [ + [ + _inject_model_parallel_rank( + checkpoints_dir / checkpoint_name, + tensor_model_parallel_size=training_tensor_para_size, + 
pipeline_model_parallel_size=training_pipeline_para_size, + tensor_model_parallel_rank=tp_rank, + pipeline_model_parallel_rank=pp_rank, + ) + for pp_rank in range(training_pipeline_para_size) + ] + for tp_rank in range(training_tensor_para_size) + ] + + if training_tensor_para_size > inference_tensor_para_size: + assert training_tensor_para_size % inference_tensor_para_size == 0 + is_merge_ckpt = True + factor = int(training_tensor_para_size / inference_tensor_para_size) + else: + assert inference_tensor_para_size % training_tensor_para_size == 0 + is_merge_ckpt = False + factor = int(inference_tensor_para_size / training_tensor_para_size) + + main_loop = min(training_tensor_para_size, inference_tensor_para_size) + vocab_size_list = [0 for i in range(main_loop)] + + torch.multiprocessing.set_start_method("spawn") + torch.multiprocessing.set_sharing_strategy("file_system") + pool = multiprocessing.Pool(args.processes) + has_adapters = False + for i in range(main_loop): # tp + for j in range(training_pipeline_para_size): # pp + + transformer_models = [] + if is_merge_ckpt: + for k in range(factor): + m = torch.load(model_weights_paths[i * factor + k][j].as_posix(), map_location=map_location_fn) + if not has_adapters: + has_adapters = any("adaptor" in key for key in m['model']['language_model'][megatron_gpt_key].keys()) + transformer_models.append(m["model"]["language_model"][megatron_gpt_key]) + + if j == 0: + vocab_size_list[i] = m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"].shape[0] + w_e_list.append(torch2np(m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"], np_weight_data_type)) + if j == training_pipeline_para_size - 1: + w_e_head_list.append(torch2np(m["model"]["word_embeddings"]["weight"], np_weight_data_type)) + + else: + m = torch.load(model_weights_paths[i][j].as_posix(), map_location=map_location_fn) + + if not has_adapters: + has_adapters = any("adaptor" in key for key in m['model']['language_model'][megatron_gpt_key].keys()) + + if j == 0: + vocab_size_list[i] = m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"].shape[0] + w_e_list.append(torch2np( + m["model"]["language_model"]["embedding"]["word_embeddings"]["weight"], + np_weight_data_type + )) + if j == training_pipeline_para_size - 1: + w_e_head_list.append(torch2np( + m["model"]["word_embeddings_for_head"]["weight"], + np_weight_data_type + )) + transformer_models.append(m["model"]["language_model"][megatron_gpt_key]) + + pool.starmap( + merge_and_convert_process if is_merge_ckpt else split_and_convert_process, + [ + ( + i, + j, + saved_dir, + factor, + k, + model_training_args, + transformer_models, + checkpoint_version, + np_weight_data_type, + ) + for (k, v) in transformer_models[0].items() + ], + ) + + pool.close() + pool.join() + + torch.cuda.synchronize() + + np.concatenate(w_e_list, axis=0).tofile((saved_dir / "model.wte.weight.bin").as_posix()) + np.concatenate(w_e_head_list, axis=0).tofile((saved_dir / "model.lm_head.weight.bin").as_posix()) + + # save vocab_size + full_vocab_size = sum(vocab_size_list) + if not hasattr(model_training_args, "padded_vocab_size"): + model_training_args.padded_vocab_size = full_vocab_size + + # Configuration for the model (load by triton backends) + config = configparser.ConfigParser() + config["llama"] = {} + try: + config["llama"]["model_name"] = "llama" + config["llama"]["head_num"] = str(model_training_args.num_attention_heads) + config["llama"]["size_per_head"] = str(model_training_args.hidden_size // 
model_training_args.num_attention_heads) + config["llama"]["rotary_embedding"] = str(model_training_args.hidden_size // model_training_args.num_attention_heads) + config["llama"]["inter_size"] = str(model_training_args.ffn_hidden_size) + config["llama"]["num_layer"] = str(model_training_args.num_layers) + config["llama"]["max_pos_seq_len"] = str(model_training_args.max_position_embeddings) + config["llama"]["vocab_size"] = str(model_training_args.padded_vocab_size) + config["llama"]["start_id"] = '0' + config["llama"]["end_id"] = '1' + config["llama"]["weight_data_type"] = args.weight_data_type + config["llama"]["tensor_para_size"] = str(args.infer_gpu_num) + with open((saved_dir / f"config.ini").as_posix(), 'w') as configfile: + config.write(configfile) + except Exception as e: + print(f"Fail to save the config in config.ini: {e}") + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("--saved-dir", "-saved_dir", "-o", help="folder name of output files", required=True) + parser.add_argument( + "--in-file", "-in_file", "-i", help="file name of input checkpoint file", required=True + ) + parser.add_argument( + "--infer-gpu-num", "-infer_gpu_num", "-i_g", type=int, help="How many gpus for inference", required=True + ) + # -h_n and -t_g are needed when megatron_ckpt_version = 0, for example the public megatron 345M gpt model + parser.add_argument( + "--head-num", + "-head_num", + "-h_n", + type=int, + help="The number of heads, only needed when weight doesn't contain structure hyperparameters" + ) + parser.add_argument( + "--trained-tensor-parallel-size", + "-trained_tensor_parallel_size", + "-t_g", + type=int, + help="the tensor parallel size for training" + ) + parser.add_argument( + "--processes", + "-processes", + "-p", + type=int, + default=16, + help="How many processes to spawn for conversion", + ) + parser.add_argument( + "--weight-data-type", "-weight_data_type", choices=["fp32", "fp16"], default="fp32", help="" + ) + parser.add_argument( + "--load-checkpoints-to-cpu", + "-load_checkpoints_to_cpu", + "-cpu", + type=int, + choices=[0, 1], + default=1, + help="Whether to load model weights to CPU", + ) + parser.add_argument( + "--vocab-path", + type=str, + help="Path to vocabulary file to embed in FasterTransformer checkpoint", + required=False, + ) + parser.add_argument( + "--merges-path", type=str, help="Path to merges file to embed in FasterTransformer checkpoint", required=False + ) + + args = parser.parse_args() + print("\n=============== Argument ===============") + for key in vars(args): + print(f"{key}: {vars(args)[key]}") + print("========================================") + + print("[INFO] Started to convert the model, normally it takes around 10 minutes.") + + start_time = datetime.datetime.now() + convert_checkpoint(args) + run_time = datetime.datetime.now() - start_time + print(f"[INFO] Spent {run_time} (h:m:s) to convert the model") + + +if __name__ == "__main__": + main() From d65adf119c2ffd30b7804360fe8a2f198dfe0f20 Mon Sep 17 00:00:00 2001 From: Void Main Date: Sat, 29 Apr 2023 01:15:33 +0800 Subject: [PATCH 11/27] Update src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h Co-authored-by: Bram Wasti --- .../triton_backend/llama/LlamaTritonModelInstance.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h index 0a2418641..a75e0692d 
100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h @@ -45,9 +45,9 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { convert_outputs(const std::unordered_map& output_tensors); private: + const std::unique_ptr> allocator_; const std::unique_ptr> gpt_; const std::shared_ptr> gpt_weight_; - const std::unique_ptr> allocator_; const std::unique_ptr cublas_algo_map_; const std::unique_ptr cublas_wrapper_mutex_; const std::unique_ptr cublas_wrapper_; From cf1b9b170c13c8648a49ee464c0037a6d4d61a73 Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 29 Apr 2023 14:34:12 +0000 Subject: [PATCH 12/27] donot callback too frequnetly --- src/fastertransformer/models/llama/Llama.cc | 17 ++++++++++++++++- src/fastertransformer/models/llama/Llama.h | 3 +++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index d0305e84a..f078861e2 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -70,6 +70,21 @@ void Llama::initialize() allocator_, is_free_buffer_after_forward_, cuda_device_prop_); + + // parse env overrides + if (std::getenv("LLAMA_STREAM_CB_STEP") != nullptr) { + try { + int callback_step_from_env = stoi( + std::string(std::getenv("LLAMA_STREAM_CB_STEP")) + ); + token_generated_cb_step_ = callback_step_from_env; + FT_LOG_INFO("Override stream callback step to %d from LLAMA_STREAM_CB_STEP", + token_generated_cb_step_); + } catch (...) { + FT_LOG_WARNING("convert LLAMA_STREAM_CB_STEP err, use default value %d", + token_generated_cb_step_); + } + } } template @@ -1014,7 +1029,7 @@ void Llama::forward(std::unordered_map* output_ten if (*generation_should_stop_) { break; } - if (token_generated_cb_ && step + 1 < (int)max_output_seq_len) { + if (token_generated_cb_ && (step + 1) % token_generated_cb_step_ == 0 && step + 1 < (int)max_output_seq_len) { setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len); sendTensorsToFirstPipelineNode(output_tensors, input_tensors); diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index 8ba6a88df..587a6f72a 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -129,6 +129,9 @@ class Llama: public BaseLayer { callback_sig* token_generated_cb_ = nullptr; void* token_generated_ctx_ = nullptr; + // callback step + size_t token_generated_cb_step_ = 50; // default 50, override by env LLAMA_STREAM_CB_STEP + void setOutputTensors(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, const size_t max_input_length, From 95afed4a569e37942f095fb7bf6659e7641167db Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 29 Apr 2023 23:25:40 +0000 Subject: [PATCH 13/27] add bf16 --- src/fastertransformer/models/llama/Llama.cc | 3 +++ .../models/llama/LlamaContextDecoder.cc | 3 +++ .../models/llama/LlamaDecoder.cc | 3 +++ .../models/llama/LlamaDecoderLayerWeight.cc | 3 +++ .../models/llama/LlamaWeight.cc | 3 +++ .../triton_backend/llama/LlamaTritonModel.cc | 18 +++++++++++++++++- .../llama/LlamaTritonModelInstance.cc | 4 +++- 7 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index f078861e2..2c1c2db5f 100644 --- 
a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1220,5 +1220,8 @@ bool* Llama::getFinishBuffer() template class Llama; template class Llama; +#ifdef ENABLE_BF16 +template class Llama<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index 8ea1494fb..e3afd1780 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -499,5 +499,8 @@ void LlamaContextDecoder::forward(std::unordered_map* template class LlamaContextDecoder; template class LlamaContextDecoder; +#ifdef ENABLE_BF16 +template class LlamaContextDecoder<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index 23e9b0eec..a5cffa731 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -376,5 +376,8 @@ void LlamaDecoder::forward(std::unordered_map* template class LlamaDecoder; template class LlamaDecoder; +#ifdef ENABLE_BF16 +template class LlamaDecoder<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 39a1d5c1f..3e97b67d0 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -221,5 +221,8 @@ void LlamaDecoderLayerWeight::mallocWeights() template struct LlamaDecoderLayerWeight; template struct LlamaDecoderLayerWeight; +#ifdef ENABLE_BF16 +template class LlamaDecoderLayerWeight<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 65c2f762d..6105267ff 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -297,5 +297,8 @@ bool LlamaWeight::isValidLayerParallelId(int l) template struct LlamaWeight; template struct LlamaWeight; +#ifdef ENABLE_BF16 +template class LlamaWeight<__nv_bfloat16>; +#endif } // namespace fastertransformer diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index a7f336843..6816e34e6 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -41,6 +41,15 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_dir); } +#ifdef ENABLE_BF16 + else if (data_type == "bf16") { + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); + } +#endif else { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), @@ -127,6 +136,11 @@ std::unique_ptr LlamaTritonModel::createMod if (std::is_same::value) { cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + 
cublas_wrapper->setBF16GemmConfig(); + } +#endif else if (std::is_same::value) { cublas_wrapper->setFP32GemmConfig(); } @@ -240,4 +254,6 @@ int LlamaTritonModel::getPipelineParaSize() template struct LlamaTritonModel; template struct LlamaTritonModel; - +#ifdef ENABLE_BF16 +template class LlamaTritonModel<__nv_bfloat16>; +#endif diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc index 593fc6c97..e46adf87d 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.cc @@ -259,4 +259,6 @@ void LlamaTritonModelInstance::freeBuffer() template struct LlamaTritonModelInstance; template struct LlamaTritonModelInstance; - +#ifdef ENABLE_BF16 +template class LlamaTritonModelInstance<__nv_bfloat16>; +#endif From 9aee02eb6e54e172d34ca7cac0c908cd778832a2 Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 29 Apr 2023 23:52:01 +0000 Subject: [PATCH 14/27] make sure examples work for bf16 --- examples/cpp/llama/llama_example.cc | 10 ++++++++++ src/fastertransformer/models/llama/Llama.cc | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 0105cb6aa..e80f59b1f 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -60,6 +60,11 @@ int main(int argc, char* argv[]) else if (data_type == "fp16") { llama_example(reader); } +#ifdef ENABLE_BF16 + else if (data_type == "bf16") { + llama_example<__nv_bfloat16>(reader); + } +#endif else { FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); return -1; @@ -259,6 +264,11 @@ void llama_example(const INIReader reader) if (std::is_same::value) { cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + cublas_wrapper.setBF16GemmConfig(); + } +#endif else if (std::is_same::value) { cublas_wrapper.setFP32GemmConfig(); } diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 2c1c2db5f..2fe508803 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -269,7 +269,11 @@ Llama::Llama(size_t head_num, pipeline_para_.rank_ = 0; int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); - if (std::is_same::value) { + if (std::is_same::value +#ifdef ENABLE_BF16 + || std::is_same<__nv_bfloat16, T>::value +#endif + ) { local_vacab_size = ceil(local_vacab_size / 8.f) * 8; } vocab_size_padded_ = (size_t)local_vacab_size * tensor_para_.world_size_; From 8ddac81e2d278f0451d7021c130a7f2c301612e3 Mon Sep 17 00:00:00 2001 From: void-main Date: Sun, 30 Apr 2023 02:54:29 +0000 Subject: [PATCH 15/27] support bf16 conversion with bfloat 16 numpy ext --- examples/cpp/llama/huggingface_llama_convert.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py index 37868d0bd..f45c8b99b 100644 --- a/examples/cpp/llama/huggingface_llama_convert.py +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -20,11 +20,17 @@ import os from transformers import LlamaForCausalLM +# using numpy extension: https://github.com/GreenWaves-Technologies/bfloat16 +# install the library with `pip install bfloat16` +from bfloat16 import bfloat16 + def 
get_weight_data_type(data_type): if data_type == "fp32": return np.float32 elif data_type == "fp16": return np.float16 + elif data_type == "bf16": + return bfloat16 else: assert False, f"Invalid weight data type {data_type}" @@ -175,7 +181,7 @@ def split_and_convert(args): parser.add_argument('-in_file', '-i', type=str, help='file name of input checkpoint file', required=True) parser.add_argument('-trained_gpu_num', '-t_g', type=int, help='How many gpus for inference', default=1) parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True) - parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]) + parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16", "bf16"]) parser.add_argument('-model_name', '-m_n', type=str, help='model name', required=True) args = parser.parse_args() From 40fbe484dcdbc209ff701eb955776152c06847d5 Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 1 May 2023 11:05:17 +0000 Subject: [PATCH 16/27] bugfix --- .../pytorch/gpt/utils/megatron_ckpt_convert_llama.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py index 02b4cf12a..e15c25e27 100644 --- a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -206,7 +206,6 @@ def preprocess_h_to_4h(val, training_tp_size, np_weight_data_type): def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): val = safe_transpose(transformer_model_list[0][key]) val = torch2np(val, np_weight_data_type) - print(f"key: {key}, shape: {val.shape}") if key.find("layers.") != -1: layer_index = (int)(key[7 : key.find(".", 7)]) saved_key = key.replace( @@ -235,25 +234,24 @@ def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod or key.find("mlp.dense_4h_to_h.weight") != -1): split_vals = np.split(val, factor, axis=0) for j in range(factor): - print(f'saving {saved_key}, shape: {split_vals[j].shape}') saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" split_vals[j].tofile(saved_path.as_posix()) elif (key.find("mlp.dense_h_to_4h.weight") != -1): + val = preprocess_h_to_4h(transformer_model_list[0][key], + model_training_args.tensor_model_parallel_size, + np_weight_data_type) gate_weight, up_weight = np.split(val, 2, axis=-1) - print(f'dense_h_to_4h shape: {val.shape}, gate shape: {gate_weight.shape}, up shape: {up_weight.shape}') split_gate_weight = np.split(gate_weight, factor, axis=-1) proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.gate_proj.weight') for j in range(factor): - print(f'saving {proj_key}, shape: {split_gate_weight[j].shape}') saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" split_gate_weight[j].tofile(saved_path.as_posix()) split_up_weight = np.split(up_weight, factor, axis=-1) proj_key = saved_key.replace('mlp.dense_h_to_4h.weight','mlp.up_proj.weight') for j in range(factor): - print(f'saving {proj_key}, shape: {split_up_weight[j].shape}') saved_path = saved_dir / f"model.{proj_key}.{i * factor + j:d}.bin" split_up_weight[j].tofile(saved_path.as_posix()) @@ -274,7 +272,6 @@ def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod split_vals = np.split(val, factor, axis=-1) for j in range(factor): - 
print(f'saving {saved_key}, shape: {split_vals[j].shape}') saved_path = saved_dir / f"model.{saved_key}.{i * factor + j:d}.bin" split_vals[j].tofile(saved_path.as_posix()) From f6cf9da598e01296d055be95113f25d79a93b02e Mon Sep 17 00:00:00 2001 From: void-main Date: Mon, 1 May 2023 23:18:25 +0000 Subject: [PATCH 17/27] load layernorm_eps from config; change cb default to 5 --- examples/cpp/llama/huggingface_llama_convert.py | 1 + examples/cpp/llama/llama_example.cc | 2 ++ src/fastertransformer/models/llama/Llama.cc | 5 +++++ src/fastertransformer/models/llama/Llama.h | 6 ++++-- .../triton_backend/llama/LlamaTritonModel.cc | 6 ++++-- .../triton_backend/llama/LlamaTritonModel.h | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/cpp/llama/huggingface_llama_convert.py b/examples/cpp/llama/huggingface_llama_convert.py index 37868d0bd..92841a9c0 100644 --- a/examples/cpp/llama/huggingface_llama_convert.py +++ b/examples/cpp/llama/huggingface_llama_convert.py @@ -92,6 +92,7 @@ def split_and_convert(args): config['llama']["inter_size"] = str(hf_config["intermediate_size"]) config['llama']["num_layer"] = str(num_layers) config['llama']["rotary_embedding"] = str(head_size) + config['llama']['layernorm_eps'] = str(hf_config["rms_norm_eps"]) config['llama']["vocab_size"] = str(hf_config["vocab_size"]) config['llama']["start_id"] = str(hf_config["bos_token_id"]) config['llama']["end_id"] = str(hf_config["eos_token_id"]) diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index e80f59b1f..a64c86bca 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -87,6 +87,7 @@ void llama_example(const INIReader reader) const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); const size_t decoder_layers = reader.GetInteger(model_name, "num_layer"); const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const float layernorm_eps = reader.GetFloat(model_name, "layernorm_eps"); const int start_id = reader.GetInteger(model_name, "start_id"); const int end_id = reader.GetInteger(model_name, "end_id"); @@ -310,6 +311,7 @@ void llama_example(const INIReader reader) decoder_layers, vocab_size, rotary_embedding_dim, + layernorm_eps, start_id, end_id, prompt_learning_start_id, diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 2fe508803..37ee21b85 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -227,6 +227,7 @@ Llama::Llama(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning @@ -254,6 +255,7 @@ Llama::Llama(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + layernorm_eps_(layernorm_eps), start_id_(start_id), end_id_(end_id), prompt_learning_start_id_(prompt_learning_start_id), @@ -287,6 +289,7 @@ Llama::Llama(size_t head_num, size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning @@ -316,6 +319,7 @@ Llama::Llama(size_t head_num, num_layer_(num_layer), vocab_size_(vocab_size), rotary_embedding_dim_(rotary_embedding_dim), + layernorm_eps_(layernorm_eps), start_id_(start_id), end_id_(end_id), 
prompt_learning_start_id_(prompt_learning_start_id), @@ -346,6 +350,7 @@ Llama::Llama(Llama const& gpt): num_layer_(gpt.num_layer_), vocab_size_(gpt.vocab_size_), rotary_embedding_dim_(gpt.rotary_embedding_dim_), + layernorm_eps_(gpt.layernorm_eps_), start_id_(gpt.start_id_), end_id_(gpt.end_id_), prompt_learning_start_id_(gpt.prompt_learning_start_id_), diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index 587a6f72a..f318e6947 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -38,9 +38,9 @@ class Llama: public BaseLayer { size_t num_layer_; size_t vocab_size_; size_t rotary_embedding_dim_; + float layernorm_eps_; static constexpr bool neox_rotary_style_ = true; - static constexpr float layernorm_eps_ = 1e-6f; int start_id_; int end_id_; @@ -130,7 +130,7 @@ class Llama: public BaseLayer { void* token_generated_ctx_ = nullptr; // callback step - size_t token_generated_cb_step_ = 50; // default 50, override by env LLAMA_STREAM_CB_STEP + size_t token_generated_cb_step_ = 5; // default 5, override by env LLAMA_STREAM_CB_STEP void setOutputTensors(std::unordered_map* output_tensors, const std::unordered_map* input_tensors, @@ -146,6 +146,7 @@ class Llama: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning @@ -173,6 +174,7 @@ class Llama: public BaseLayer { size_t num_layer, size_t vocab_size, size_t rotary_embedding_dim, + float layernorm_eps, int start_id, int end_id, int prompt_learning_start_id, // only needed by p/prompt-tuning diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index 6816e34e6..ea34983e4 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -84,6 +84,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, num_layer_ = reader.GetInteger("llama", "num_layer"); vocab_size_ = reader.GetInteger("llama", "vocab_size"); rotary_embedding_dim_ = reader.GetInteger("llama", "rotary_embedding"); + layernorm_eps_ = reader.GetFloat("llama", "layernorm_eps"); start_id_ = reader.GetInteger("llama", "start_id"); end_id_ = reader.GetInteger("llama", "end_id"); use_gptj_residual_ = false; @@ -162,6 +163,7 @@ std::unique_ptr LlamaTritonModel::createMod num_layer_, vocab_size_, rotary_embedding_dim_, + layernorm_eps_, start_id_, end_id_, prompt_learning_start_id_, // p/prompt tuning virtual token start id @@ -223,8 +225,8 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " << "\nhead_num: " << head_num_ << "\nsize_per_head: " << size_per_head_ << "\ninter_size: " << inter_size_ - << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nstart_id: " << start_id_ - << "\nend_id: " << end_id_ << "\nuse_gptj_residual: " << use_gptj_residual_ + << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nlayernorm_eps: " << layernorm_eps_ + << "\nstart_id: " << start_id_ << "\nend_id: " << end_id_ << "\nuse_gptj_residual: " << use_gptj_residual_ << "\nprompt_learning_type_: " << static_cast(prompt_learning_type_) << "\nprompt_learning_start_id_: " << prompt_learning_start_id_ << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << 
"\nenable_custom_all_reduce: " << enable_custom_all_reduce_ diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h index 0775ed05f..da5a277cd 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h @@ -57,6 +57,7 @@ struct LlamaTritonModel: public AbstractTransformerModel { size_t num_layer_; size_t vocab_size_; size_t rotary_embedding_dim_; + float layernorm_eps_; int start_id_; int end_id_; size_t tensor_para_size_; From da2ad14e33c46b8ec7e5c20e2600c0bf003a2e9e Mon Sep 17 00:00:00 2001 From: void-main Date: Tue, 2 May 2023 02:10:02 +0000 Subject: [PATCH 18/27] update megatron convert script --- .../gpt/utils/megatron_ckpt_convert_llama.py | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py index e15c25e27..6ebec96d3 100644 --- a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -186,23 +186,6 @@ def merge_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod else: print(f"[ERROR] cannot find key '{key}'") -def preprocess_h_to_4h(val, training_tp_size, np_weight_data_type): - """ - megatron saved format: [TP_train, 2, inter_size / 2 / TP, hidden_size] - FT needed format: [hidden_size, inter_size], [hidden_size, inter_size] - This function translates megatron weight to FT weight - """ - val_shape = val.shape - val = val.view(training_tp_size, - 2, - val_shape[0] // (2 * training_tp_size), - val_shape[1]) - val = val.transpose(0, 1).contiguous() - val = val.view(*val_shape) - - val = safe_transpose(val) - return torch2np(val, np_weight_data_type) - def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, model_training_args, transformer_model_list, ckpt_ver, np_weight_data_type): val = safe_transpose(transformer_model_list[0][key]) val = torch2np(val, np_weight_data_type) @@ -238,9 +221,6 @@ def split_and_convert_process(i, pipeline_para_rank, saved_dir, factor, key, mod split_vals[j].tofile(saved_path.as_posix()) elif (key.find("mlp.dense_h_to_4h.weight") != -1): - val = preprocess_h_to_4h(transformer_model_list[0][key], - model_training_args.tensor_model_parallel_size, - np_weight_data_type) gate_weight, up_weight = np.split(val, 2, axis=-1) split_gate_weight = np.split(gate_weight, factor, axis=-1) @@ -454,6 +434,7 @@ def convert_checkpoint(args): config["llama"]["num_layer"] = str(model_training_args.num_layers) config["llama"]["max_pos_seq_len"] = str(model_training_args.max_position_embeddings) config["llama"]["vocab_size"] = str(model_training_args.padded_vocab_size) + config["llama"]["layernorm_eps"] = args.layernorm_eps config["llama"]["start_id"] = '0' config["llama"]["end_id"] = '1' config["llama"]["weight_data_type"] = args.weight_data_type @@ -499,6 +480,9 @@ def main(): parser.add_argument( "--weight-data-type", "-weight_data_type", choices=["fp32", "fp16"], default="fp32", help="" ) + parser.add_argument( + "--layernorm-eps", default="1e-05", type=str, help="rms layernorm eps", required=True + ) parser.add_argument( "--load-checkpoints-to-cpu", "-load_checkpoints_to_cpu", From abd1e4dd9e0feb853fae064ced9666789e109ade Mon Sep 17 00:00:00 2001 From: void-main Date: Sat, 6 May 2023 07:56:48 +0000 Subject: [PATCH 19/27] fix callback issue --- 
src/fastertransformer/models/llama/Llama.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 37ee21b85..8e3f1cedf 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -1114,19 +1114,18 @@ void Llama::setOutputTensors(std::unordered_map* o const size_t batch_size = output_tensors->at("output_ids").shape[0]; const size_t beam_width = output_tensors->at("output_ids").shape[1]; - uint* sequence_lengths = output_tensors->at("sequence_length").getPtr(); + int* sequence_lengths = output_tensors->at("sequence_length").getPtr(); const size_t max_prefix_soft_prompt_length = has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0; + cudaAutoCpy(sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); if (input_tensors->at("input_ids").shape[1] == 0) { - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); // TODO: D2D sequence_lenghts if (beam_width > 1) { // For beam search, do gather_tree // take output_parent_ids as inter buffer invokeGatherTree(transposed_output_ids_buf_, - sequence_lengths_, + sequence_lengths, max_output_seq_len, batch_size, beam_width, @@ -1158,7 +1157,7 @@ void Llama::setOutputTensors(std::unordered_map* o // For sampling, it is equivalent to all parent ids are 0. gatherTreeParam param; param.beams = transposed_output_ids_buf_; - param.max_sequence_lengths = sequence_lengths_; + param.max_sequence_lengths = sequence_lengths; // add sequence_length 1 here because the sequence_length of time step t is t - 1 param.max_sequence_length_final_step = 1; param.max_time = max_output_seq_len; @@ -1176,8 +1175,6 @@ void Llama::setOutputTensors(std::unordered_map* o param.stream = stream_; param.output_ids = output_tensors->at("output_ids").getPtr(); invokeGatherTree(param); - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); sync_check_cuda_error(); } if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { From 50fdb0c603f7740a80f93501d7dc29bda131552a Mon Sep 17 00:00:00 2001 From: void-main Date: Sun, 11 Jun 2023 06:48:57 +0000 Subject: [PATCH 20/27] fix name --- examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py index 6ebec96d3..99f573bb9 100644 --- a/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py +++ b/examples/pytorch/gpt/utils/megatron_ckpt_convert_llama.py @@ -386,7 +386,7 @@ def convert_checkpoint(args): )) if j == training_pipeline_para_size - 1: w_e_head_list.append(torch2np( - m["model"]["word_embeddings_for_head"]["weight"], + m["model"]["language_model"]["output_layer"]["weight"], np_weight_data_type )) transformer_models.append(m["model"]["language_model"][megatron_gpt_key]) From 3c03564ca045918d832c3a9e3889a222f607df95 Mon Sep 17 00:00:00 2001 From: shaoxin Date: Thu, 29 Jun 2023 20:16:02 +0800 Subject: [PATCH 21/27] support int8 & share context --- examples/cpp/llama/llama_example.cc | 36 ++- src/fastertransformer/.DS_Store | Bin 0 -> 6148 bytes src/fastertransformer/models/.DS_Store | Bin 0 -> 8196 bytes 
src/fastertransformer/models/llama/Llama.cc | 186 ++++++++----- src/fastertransformer/models/llama/Llama.h | 15 +- .../models/llama/LlamaContextDecoder.cc | 132 +++++++-- .../models/llama/LlamaContextDecoder.h | 11 +- .../models/llama/LlamaDecoder.cc | 9 +- .../models/llama/LlamaDecoder.h | 3 + .../models/llama/LlamaDecoderLayerWeight.cc | 253 ++++++++++++++---- .../models/llama/LlamaDecoderLayerWeight.h | 9 +- .../models/llama/LlamaWeight.cc | 6 +- .../models/llama/LlamaWeight.h | 3 + 13 files changed, 514 insertions(+), 149 deletions(-) create mode 100644 src/fastertransformer/.DS_Store create mode 100644 src/fastertransformer/models/.DS_Store diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index a64c86bca..49d966772 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,6 +81,7 @@ void llama_example(const INIReader reader) int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0); const size_t head_num = reader.GetInteger(model_name, "head_num"); const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); @@ -177,6 +178,7 @@ void llama_example(const INIReader reader) tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); } + int* d_stop_words = nullptr; deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); @@ -193,6 +195,7 @@ void llama_example(const INIReader reader) 1, "../examples/cpp/llama/start_ids.csv"); + int* d_input_ids; int* d_input_lengths; if (max_input_len == 0) { @@ -274,6 +277,7 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } + printf("******* Enter gpt_weights ********** \n"); const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, @@ -285,9 +289,12 @@ void llama_example(const INIReader reader) pipeline_para.world_size_, pipeline_para.rank_, use_gptj_residual, + int8_mode, prompt_learning_type, prefix_prompt_table_pair); + printf("******* Enter loadModel ********* \n"); + gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { @@ -305,6 +312,8 @@ void llama_example(const INIReader reader) false, // with_relative_position_bias true); // causal_mask + printf("******* Inilize Llama ********* \n"); + Llama gpt = Llama(head_num, size_per_head, inter_size, @@ -331,12 +340,22 @@ void llama_example(const INIReader reader) &allocator, false, &prop, - attention_type); + attention_type, + int8_mode, + nullptr, + 0, + 1.0f); int* d_output_ids; int* d_sequence_lengths; + + printf("******* deviceMalloc start ********* \n"); + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + + printf("******* deviceMalloc end ********* \n"); + std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -402,6 +421,8 @@ void llama_example(const INIReader reader) print_mem_usage(); + printf("******* before cudaDeviceSynchronize ********* \n"); + int ite = 1; cudaDeviceSynchronize(); mpi::barrier(); @@ -411,15 +432,21 @@ void llama_example(const INIReader reader) ite = 1; 
ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") + + printf("******* before gpt.forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } + + printf("******* end gpt.forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); + printf("******* end cudaDeviceSynchronize ********* \n"); + if (rank == 0) { std::string fName = "out"; @@ -430,8 +457,12 @@ void llama_example(const INIReader reader) else { size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; + + printf("******* before cudaD2Hcpy ********* \n"); + cudaD2Hcpy(hBuf, d_output_ids, outCount); + printf("******* end cudaD2Hcpy ********* \n"); { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; @@ -465,10 +496,11 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") + printf("******* before gpt forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - + printf("******* after gpt forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d1845e2dff13bedf5e41cf42a41978e6bf67cb38 GIT binary patch literal 6148 zcmeHK%}T>S5Z<-5O({YT3VK`cS}<)D3SL63FJMFuDm5WNgK4%jsX3HF?)pN$h|lB9 z?&c5-coVTRu=~x<&u->}>pQRquM&4!|OBpn}{f&<68nz7<3F48o>j? zbt<4v<>raObvoFEiE|7V8g)A3YGs(mtXw``xLO_TLWMK#Xr!JPAO@-oH1x2C=l>=A zGPRHVY6^{r0b<~vF~FNscRGQh%-Q;_JUnX!v`1(t7+0VI0(#{V00Z1d4wX~;1?mvz X7%Vj6ENEBhfOHX1giuEe`~m}CV@gb8 literal 0 HcmV?d00001 diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..cd5987b054b2d9ac9e7e3b48fcc55e66056ebd71 GIT binary patch literal 8196 zcmeHML2uJA6n^eHnlgrZ0BIbMB5|F@x@lq(m(q;`5?m-^2S6oR!xomsRg9 zH}DrY^GEnEoZx%5J2~!pMF{N5eu@2kFMgk$rY|KT(H#%kL~BG8pt5bPpt+{-IM=yS zGd)Ze;M22A9>$rBao(Ck`wp*wSHLUa74Qmp1^xyF@XY2CEO_qgQQvw6yaNBF0{s0D zqO$E6JJnWwb)Zox0BjxIs^D1t^oI?(1GZ!AR9p0*2u+36RAEvKVKN7y=@{=A{8U>_ zC!wwkKjzB9WGKQU9wJn55*=-Q>lN?{%qzgTd!1s6Xhfxxzdym)#xGSA@fLM~D=47@ z)Rd0sm`d89V~6+6;j#Z@-oLo-;DQ;B`t*T@v|IHnK0ElPg-tv(r2K~|T%Ies-nb#a~N;K?2CzgESw=Nmq{ZL7})kJrFd5X1{gs;$pkqUJu(_k9IEC!;9VR?e%bHYxDB5 z8QgvF@bSz3Np_k`-exGmJZWfEpI^0KaAbva)*I!S%z3*o<0eFryoA`o5}W>%w|-Sh z!tz%HS_NthUWqN)SBo_^*108lBUG4mY~Kl3dG!i>t5k5!84i9I-YOMF4YS#)dgU!p z!7R}kyg75^TS0uXmOs#_MORj8*!X2clt976^{k?=BF~}Pbx4liMqVwRKky281+)T7 z+N`a>K3)9&-^=9{@Cy9H3W)lCZ@-HGXTNydSUJ}YQQxC-Vcb+(MM0y|aafg(!(RPi bh~p4Y=5&mmYKtBe{~|!i;2W>NA64Kt&SGW% literal 0 HcmV?d00001 diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8e3f1cedf..8f66e7def 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -42,6 +42,7 @@ void Llama::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -59,6 +60,7 @@ void Llama::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -165,6 +167,13 @@ void Llama::allocateBuffer( generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + if (shared_contexts_ratio_ > 0.0f) { + 
shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -216,6 +225,11 @@ void Llama::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -246,8 +260,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -263,7 +279,9 @@ Llama::Llama(size_t head_num, use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -310,8 +328,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -331,7 +351,9 @@ Llama::Llama(size_t head_num, local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -363,7 +385,9 @@ Llama::Llama(Llama const& gpt): vocab_size_padded_(gpt.vocab_size_padded_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) + attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -585,6 +609,23 @@ void Llama::forward(std::unordered_map* output_ten cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) 
{ cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -686,6 +727,14 @@ void Llama::forward(std::unordered_map* output_ten {batch_size * beam_width}, has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, @@ -877,67 +926,71 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - if (tensor_para_.world_size_ == 1) { - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - vocab_size_padded_, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - logits_buf_ + vocab_size_units_offset, - CUDA_R_32F, - vocab_size_padded_, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - } - else { - FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - local_vocab_size, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_ - + tensor_para_.rank_ * local_vocab_size * hidden_units_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - nccl_logits_buf_ + vocab_size_units_offset - + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - CUDA_R_32F, - local_vocab_size, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - local_batch_size * beam_width * local_vocab_size, - tensor_para_.rank_, - tensor_para_, - stream_); - invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - tensor_para_.world_size_, - local_batch_size * beam_width, - local_vocab_size, - stream_); - } - + + // if (tensor_para_.world_size_ == 1) { + // float alpha = 1.0f; + // float beta = 0.0f; + // cublas_wrapper_->Gemm(CUBLAS_OP_T, + // CUBLAS_OP_N, + // vocab_size_padded_, // n + // local_batch_size * beam_width, + // hidden_units_, // k + // &alpha, + // padded_embedding_kernel_ptr_, + // gemm_data_type, + // hidden_units_, // k + // normed_decoder_output_buf_ + hidden_units_offset, + // gemm_data_type, + // hidden_units_, // k + // &beta, + // logits_buf_ + vocab_size_units_offset, + // CUDA_R_32F, + // vocab_size_padded_, /* n */ + // CUDA_R_32F, + // cublasGemmAlgo_t(-1)); + // } + // else { + // FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + // const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; + // float alpha = 1.0f; + // float beta = 0.0f; + // cublas_wrapper_->Gemm(CUBLAS_OP_T, + // CUBLAS_OP_N, + // local_vocab_size, // n + // local_batch_size * beam_width, + // hidden_units_, // k + // &alpha, + // padded_embedding_kernel_ptr_ + // + tensor_para_.rank_ * local_vocab_size * hidden_units_, + // gemm_data_type, + // hidden_units_, // k + // normed_decoder_output_buf_ + 
hidden_units_offset, + // gemm_data_type, + // hidden_units_, // k + // &beta, + // nccl_logits_buf_ + vocab_size_units_offset + // + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + // CUDA_R_32F, + // local_vocab_size, /* n */ + // CUDA_R_32F, + // cublasGemmAlgo_t(-1)); + + + // ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + // nccl_logits_buf_ + vocab_size_units_offset, + // local_batch_size * beam_width * local_vocab_size, + // tensor_para_.rank_, + // tensor_para_, + // stream_); + // invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + // nccl_logits_buf_ + vocab_size_units_offset, + // tensor_para_.world_size_, + // local_batch_size * beam_width, + // local_vocab_size, + // stream_); + // } + + int tmp_local_batch_size = local_batch_size; bool is_initialize_random_table = step == max_input_length; std::unordered_map dynamic_decode_input_tensors{ @@ -1229,5 +1282,4 @@ template class Llama; #ifdef ENABLE_BF16 template class Llama<__nv_bfloat16>; #endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index f318e6947..a0958280e 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -41,6 +41,7 @@ class Llama: public BaseLayer { float layernorm_eps_; static constexpr bool neox_rotary_style_ = true; + float shared_contexts_ratio_; int start_id_; int end_id_; @@ -54,6 +55,7 @@ class Llama: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = @@ -120,6 +122,11 @@ class Llama: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -165,8 +172,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(size_t head_num, size_t size_per_head, @@ -195,8 +204,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(Llama const& Llama); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index e3afd1780..a1d78c852 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -40,10 +40,11 @@ void LlamaContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); +// TODO: 这里的SiluFfnLayer是不支持int8的dataType,再不做修改的情况下试一下会不会报错。 ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -69,7 +70,7 @@ void LlamaContextDecoder::allocateBuffer() } template -void 
LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -83,6 +84,20 @@ void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } + is_allocate_buffer_ = true; } @@ -97,6 +112,13 @@ void LlamaContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -147,6 +169,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -163,6 +186,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -185,6 +209,7 @@ LlamaContextDecoder::LlamaContextDecoder(LlamaContextDecoder const& decode pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -238,15 +263,23 @@ void LlamaContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
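As background for the shared-context changes that the forward() hunks below consume: when several requests in a batch carry an identical prompt, the context (prefill) pass is run only once per unique prompt. compact_idx holds one representative batch index per unique prompt, batch_to_compact_idx maps every original batch entry back to its compact row, and compact_size_ is the number of unique prompts; the compacted path is only taken when compact_size <= shared_contexts_ratio_ * batch_size, which is the check added in Llama::forward above. The GPU kernel invokeFindContextDups produces these maps; the host-side sketch below only illustrates the mapping semantics, and the helper names in it are hypothetical.

// Illustrative sketch only -- not part of the FasterTransformer patch.
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

struct CompactMaps {
    std::vector<int> compact_idx;          // compact row -> batch index of that unique prompt
    std::vector<int> batch_to_compact_idx; // batch index -> compact row holding its prompt
};

// Hypothetical host-side equivalent of what invokeFindContextDups computes on the GPU.
static CompactMaps findContextDups(const std::vector<std::vector<int>>& input_ids)
{
    CompactMaps maps;
    maps.batch_to_compact_idx.resize(input_ids.size());
    std::unordered_map<std::string, int> seen;  // serialized prompt -> compact row
    for (std::size_t b = 0; b < input_ids.size(); ++b) {
        std::string key(reinterpret_cast<const char*>(input_ids[b].data()),
                        input_ids[b].size() * sizeof(int));
        auto [it, inserted] = seen.emplace(key, static_cast<int>(maps.compact_idx.size()));
        if (inserted) {
            maps.compact_idx.push_back(static_cast<int>(b));
        }
        maps.batch_to_compact_idx[b] = it->second;
    }
    return maps;  // maps.compact_idx.size() plays the role of compact_size_
}
// End of sketch.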
- FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -254,6 +287,20 @@ void LlamaContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -271,6 +318,12 @@ void LlamaContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -279,7 +332,8 @@ void LlamaContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -295,8 +349,9 @@ void LlamaContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -308,11 +363,11 @@ void LlamaContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -328,7 +383,7 @@ void LlamaContextDecoder::forward(std::unordered_map* ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); } } - + // TODO: 这里用的LN跟neox不一样,不太清楚这里需不需要改成int8的LN invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -337,8 +392,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -346,7 +404,7 @@ void LlamaContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -381,17 +439,43 @@ void LlamaContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch. 
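Put differently: attention in the compacted pass writes K/V only for the unique-prompt rows (k_cache_layer_ / v_cache_layer_), so those rows have to be scattered back until every original batch slot owns a full copy of its prompt's cache before generation starts; that is what invokeUnCompactCaches below does on the GPU. The host-side sketch that follows is illustrative only -- it flattens the cache to a [batch][token][hidden] layout and ignores the per-head and per-layer strides the real kernel has to respect.

// Illustrative sketch only -- not part of the FasterTransformer patch.
#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T>
void unCompactCacheSketch(std::vector<T>&         full_cache,           // request_batch_size * max_seq_len * hidden
                          const std::vector<T>&   compact_cache,        // compact_size * seq_len * hidden
                          const std::vector<int>& batch_to_compact_idx, // size request_batch_size
                          std::size_t             seq_len,
                          std::size_t             max_seq_len,
                          std::size_t             hidden)
{
    for (std::size_t b = 0; b < batch_to_compact_idx.size(); ++b) {
        const std::size_t src = static_cast<std::size_t>(batch_to_compact_idx[b]) * seq_len * hidden;
        const std::size_t dst = b * max_seq_len * hidden;
        // Copy the prompt part; the remaining max_seq_len - seq_len positions are filled during generation.
        std::copy(compact_cache.begin() + src,
                  compact_cache.begin() + src + seq_len * hidden,
                  full_cache.begin() + dst);
    }
}
// End of sketch.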
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } + if (is_final == false) { if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, @@ -402,10 +486,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { + // TODO: modify or not ? invokeGeneralAddResidualT5PreLayerNorm( self_attn_output_, decoder_normed_input_, @@ -472,7 +557,8 @@ void LlamaContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -483,12 +569,22 @@ void LlamaContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.h b/src/fastertransformer/models/llama/LlamaContextDecoder.h index 788d1d45d..a2f91f7b8 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.h +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.h @@ -56,13 +56,15 @@ class LlamaContextDecoder: public BaseLayer { AttentionType attention_type_; + int int8_mode_ = 0; + bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -81,6 +83,12 @@ class LlamaContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: LlamaContextDecoder(size_t head_num, size_t size_per_head, @@ -98,6 +106,7 @@ class LlamaContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git 
a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index a5cffa731..4685217b0 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -35,10 +35,11 @@ void LlamaDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); + // TODO: SiLu ftn layer not support int8 ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 1, head_num_, @@ -133,6 +134,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -147,6 +149,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -167,6 +170,7 @@ LlamaDecoder::LlamaDecoder(LlamaDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -260,6 +264,7 @@ void LlamaDecoder::forward(std::unordered_map* } } + // TODO 使用的是T5 LN,这里是没有int8的参数支持 invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -301,7 +306,7 @@ void LlamaDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { diff --git a/src/fastertransformer/models/llama/LlamaDecoder.h b/src/fastertransformer/models/llama/LlamaDecoder.h index 6cdd7df27..dc44b0f32 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.h +++ b/src/fastertransformer/models/llama/LlamaDecoder.h @@ -70,6 +70,8 @@ class LlamaDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: LlamaDecoder(size_t head_num, size_t size_per_head, @@ -85,6 +87,7 @@ class LlamaDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 3e97b67d0..a8dadefea 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -24,15 +24,26 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, - const bool use_gptj_residual): + const bool use_gptj_residual, + const int int8_mode): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode), use_gptj_residual_(use_gptj_residual) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "Llama doesn't support int8_model == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && 
int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -60,38 +71,92 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() ffn_weights.intermediate_weight2.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + + // 作一下标记 intermediate_weight => gate_proj; intermediate_weight2 => up_proj; output_weight => down_proj. + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight2.int8_kernel = nullptr; + ffn_weights.intermediate_weight2.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) +void LlamaDecoderLayerWeight::copyFrom(const LlamaDecoderLayerWeight& other) { - mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); } - - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], 
other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { + cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[4], other.int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + + // TODO: 不太清楚这里存的缩放因子对应的是gate_pro_weight 还是给 up_proj/down_proj用的,后面做一下验证,回来再改一下 + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[4], other.weight_only_scale_ptr[4], hidden_units_); + } + } +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -102,26 +167,12 @@ LlamaDecoderLayerWeight& LlamaDecoderLayerWeight::operator=(const LlamaDec inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); - cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); - cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + copyFrom(other); setWeightPtr(); return *this; } @@ -137,42 +188,90 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + 
".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], + deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); + + if (!use_gptj_residual_) { + deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); + } + + // FIXME(sunpeng17): check if the weights are correct + // loadWeightFromBin(weights_ptr[6], + // {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + // dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + // model_file_type); + + deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + // loadWeightFromBin(weights_ptr[10], + // {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + // dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + // model_file_type); + + + deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + + deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); + loadWeightFromBin( + weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[4], + loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - if (!use_gptj_residual_) { - deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); - } - // FIXME(sunpeng17): check if the weights are correct - loadWeightFromBin(weights_ptr[6], + loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[8], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - - loadWeightFromBin(weights_ptr[10], + loadWeightFromBin(weights_ptr[10], {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); - deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); - loadWeightFromBin( - weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." 
+ rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[4], + weight_only_scale_ptr[4], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + model_file_type); + + } } template @@ -194,6 +293,23 @@ void LlamaDecoderLayerWeight::setWeightPtr() post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; + + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.intermediate_weight2.int8_kernel = int8_weights_ptr[3]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[4]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.intermediate_weight2.weight_only_quant_scale = weight_only_scale_ptr[3]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[4]; + } + } + is_maintain_buffer = true; } @@ -202,21 +318,48 @@ void LlamaDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); // pre layernorm beta deviceMalloc(&weights_ptr[1], hidden_units_); // pre layernorm gamma - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel + // deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); // qkv bias - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + // deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight if (!use_gptj_residual_) { deviceMalloc(&weights_ptr[5], hidden_units_); // attention output bias } - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + // deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); // intermediate_weight bias - deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + // deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel deviceMalloc(&weights_ptr[9], inter_size_ / tensor_para_size_); // intermediate_weight2 bias - deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + // deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel 
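A note on the weight-only path exercised by the loadWeightFromBinAndQuantizeForWeightOnly calls above: with int8_mode == 1 each GEMM weight is kept on the device as int8 plus one scale per output channel, which is why the weight_only_scale_ptr buffers are sized to each kernel's output dimension (3 * hidden / tp for QKV, hidden for the attention output and down_proj, inter / tp for gate_proj and up_proj), and setWeightPtr wires scales 2/3/4 to intermediate_weight (gate_proj), intermediate_weight2 (up_proj) and output_weight (down_proj) respectively. The sketch below shows only the symmetric per-channel quantization idea; it is not the FasterTransformer loader, which additionally prepares the device layout expected by the fused int8 GEMM kernels, and its function name is hypothetical.

// Illustrative sketch only -- not part of the FasterTransformer patch.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// w is row-major [input_dim x output_dim]; one scale is produced per output column.
static void quantizeWeightOnlySketch(const std::vector<float>& w,
                                     std::size_t               input_dim,
                                     std::size_t               output_dim,
                                     std::vector<int8_t>&      w_int8,
                                     std::vector<float>&       scales)
{
    w_int8.resize(w.size());
    scales.resize(output_dim);
    for (std::size_t col = 0; col < output_dim; ++col) {
        float max_abs = 0.f;
        for (std::size_t row = 0; row < input_dim; ++row) {
            max_abs = std::max(max_abs, std::fabs(w[row * output_dim + col]));
        }
        const float scale = max_abs > 0.f ? max_abs / 127.f : 1.f;
        scales[col] = scale;
        for (std::size_t row = 0; row < input_dim; ++row) {
            w_int8[row * output_dim + col] = static_cast<int8_t>(std::round(w[row * output_dim + col] / scale));
        }
    }
    // At GEMM time the weight is dequantized on the fly: w ~= w_int8 * scales[col].
}
// End of sketch.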
deviceMalloc(&weights_ptr[11], hidden_units_); // output_weight bias deviceMalloc(&weights_ptr[12], hidden_units_); // post attn layernorm beta deviceMalloc(&weights_ptr[13], hidden_units_); // post attn layernorm gamma + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[4], hidden_units_); + } + } + } template struct LlamaDecoderLayerWeight; diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h index 008e1a3b4..5a76ba4c5 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h @@ -29,11 +29,13 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = default; + LlamaDecoderLayerWeight(const int int8_mode); LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const bool use_gptj_residual = true, + const int int8_mode = 0); ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other); LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other); @@ -54,9 +56,14 @@ struct LlamaDecoderLayerWeight { const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[14]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(5, nullptr); + std::vector weight_only_scale_ptr = std::vector(5, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const LlamaDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 6105267ff..e9e11b6a1 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -29,6 +29,7 @@ LlamaWeight::LlamaWeight(const int hidden_un const int layer_para_size, const int layer_para_rank, const bool use_gptj_residual, + const int int8_mode, PromptLearningType 
prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -41,6 +42,7 @@ LlamaWeight::LlamaWeight(const int hidden_un layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), use_gptj_residual_(use_gptj_residual), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -62,7 +64,7 @@ LlamaWeight::LlamaWeight(const int hidden_un for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new LlamaDecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -103,6 +105,7 @@ LlamaWeight::LlamaWeight(const LlamaWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -149,6 +152,7 @@ LlamaWeight& LlamaWeight::operator=(const LlamaWeight& other) layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h index ec909ca49..eb90dc5bf 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.h +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -38,6 +38,7 @@ struct LlamaWeight { const int layer_para_size = 1, const int layer_para_rank = 0, const bool use_gptj_residual_ = false, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); @@ -88,6 +89,8 @@ struct LlamaWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // residual type bool use_gptj_residual_; From 6f0469f2c3840bbfe31aa39a276281f3710cbd7e Mon Sep 17 00:00:00 2001 From: shaoxin Date: Fri, 30 Jun 2023 14:08:17 +0800 Subject: [PATCH 22/27] update code with comments from reviewer --- .gitignore | 4 +- examples/cpp/llama/llama_example.cc | 1 - src/fastertransformer/.DS_Store | Bin 6148 -> 6148 bytes src/fastertransformer/layers/FfnLayer.cc | 3 +- src/fastertransformer/layers/FfnLayer.h | 1 + .../layers/TensorParallelSiluFfnLayer.cc | 4 +- .../layers/TensorParallelSiluFfnLayer.h | 3 +- .../adapter_layers/LinearAdapterLayer.cc | 3 +- src/fastertransformer/models/.DS_Store | Bin 8196 -> 8196 bytes src/fastertransformer/models/llama/Llama.cc | 120 +++++++++--------- .../models/llama/LlamaContextDecoder.cc | 5 +- .../models/llama/LlamaDecoder.cc | 3 +- 12 files changed, 78 insertions(+), 69 deletions(-) diff --git a/.gitignore b/.gitignore index 77849f435..8ecc0cca0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ __pycache__/ /models /notebooks **/.ipynb_checkpoints/ +.DS_Store /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ + diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 49d966772..14cff7bee 
100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -541,6 +541,5 @@ void llama_example(const INIReader reader) if (d_sequence_lengths != nullptr) { deviceFree(d_sequence_lengths); } - return; } diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store index d1845e2dff13bedf5e41cf42a41978e6bf67cb38..d4edc9af914042cba222ab33425eac19388ca6d5 100644 GIT binary patch literal 6148 zcmeHK%We}f6unOKm{1WS1X5Wbjl?!8Y0_4y*o3q!kYGa)EC3CeOlU`)j-$zhfTBoQ z1K+?eu!SGtUs%Dp9v96dQiC=d@-8 z-BJSzHO7b@kxvtfk&GDyi~@hV0vx+FI;6)mpi@f6?>3_N9mNj=IdlW*g2#u%eWdN5 z%1|P@PD7f48Nh#nkApGbg*-&fC&{=n#+Q{bDr)8k%n`;}VT;=t_v4?q z?rbvFEBWe<1Ah`Xo8LuYqgX0etcta1z3QLHSw9}c(@Ad-z2MqYDFgSsJaC`+qnT5` zaVW!h;D@6~Ir!cPl$X!_(37*CoQB>+`Sz0zt7=u9`qq5j+HW@OcI#l#u;&LYly}?v zi$&GCdFSrEC(dbb7RnF!{xE^lRM}OH=fLaBtp9C43IiD)qn&-qbOyz}TpfR)KmE&P zxG8PEUAi{6pv`WlVehm$O%|uqZa3_`R&zxhUOenVaXz6lv#1quuKnJY5?d!i&7m}QN!#eEwH3%xk})_frq4VmZ?JGb1Ed5d6LWIjh|-Lz>F|=tir@07eO>s!4Q&{ zokfK~+P9>_&d&)L)+xa{<`ZxJ(c&F!M)1$zU?0z@;ki3m={olK=n! delta 96 zcmZoMXfc=|#>B`mu~2NHo}wrd0|Nsi1A_nqLk>f6Qh9MfQcix-#KPs14MbQbv$B2M sEW#nova#U{<7Rdaeh#3%&4L`?nJ4p$SOT?xwSr6l8NsqSLSzjy0AfWI5&!@I diff --git a/src/fastertransformer/layers/FfnLayer.cc b/src/fastertransformer/layers/FfnLayer.cc index 7ac441198..14bb5e3f6 100644 --- a/src/fastertransformer/layers/FfnLayer.cc +++ b/src/fastertransformer/layers/FfnLayer.cc @@ -684,6 +684,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse, + int int8_mode, bool use_gated_activation): FfnLayer(max_batch_size, max_seq_len, @@ -696,7 +697,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, allocator, is_free_buffer_after_forward, sparse, - 0, + int8_mode, use_gated_activation) { } diff --git a/src/fastertransformer/layers/FfnLayer.h b/src/fastertransformer/layers/FfnLayer.h index af7ae7606..f84915d2f 100644 --- a/src/fastertransformer/layers/FfnLayer.h +++ b/src/fastertransformer/layers/FfnLayer.h @@ -210,6 +210,7 @@ class SiluFfnLayer: public FfnLayer { IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse = false, + int int8_mode = 0, bool use_gated_activation = false); SiluFfnLayer(SiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc index 25a2da86b..af4714d82 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc @@ -76,7 +76,8 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b bool is_sparse, bool use_gated_activation, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + int int8_mode): SiluFfnLayer(max_batch_size, max_seq_len, head_num, @@ -88,6 +89,7 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b allocator, is_free_buffer_after_forward, is_sparse, + int8_mode, use_gated_activation), tensor_para_(tensor_para), custom_all_reduce_comm_(custom_all_reduce_comm), diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h index ae481373a..5f0e6d625 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h @@ -47,7 +47,8 @@ class TensorParallelSiluFfnLayer: public SiluFfnLayer { bool is_sparse, bool use_gated_activation = false, std::shared_ptr 
custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + int int8_mode = 0); TensorParallelSiluFfnLayer(TensorParallelSiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc index c5ea150f4..ef2ac7b3a 100644 --- a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc +++ b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc @@ -88,7 +88,8 @@ LinearAdapterLayer::LinearAdapterLayer(LinearAdapterConfig const& co is_sparse, false, custom_all_reduce_comm, - enable_custom_all_reduce)}, + enable_custom_all_reduce, + 0)}, layer_norm_type_{config.layerNormType()}, layer_norm_eps_{layer_norm_eps}, max_token_size_{max_batch_size * max_seq_len}, diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store index cd5987b054b2d9ac9e7e3b48fcc55e66056ebd71..de13d48fdd4e9e824db21adcab91399ef3727fa6 100644 GIT binary patch delta 22 dcmZp1XmQx^ONi6R#6m~G(8$7IGqW%wF91_V1=;`r delta 22 dcmZp1XmQx^ONi6h&`?Lg(8%0$GqW%wF91^U1=Rom diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8f66e7def..1eac9fd20 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -927,68 +927,68 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); - // if (tensor_para_.world_size_ == 1) { - // float alpha = 1.0f; - // float beta = 0.0f; - // cublas_wrapper_->Gemm(CUBLAS_OP_T, - // CUBLAS_OP_N, - // vocab_size_padded_, // n - // local_batch_size * beam_width, - // hidden_units_, // k - // &alpha, - // padded_embedding_kernel_ptr_, - // gemm_data_type, - // hidden_units_, // k - // normed_decoder_output_buf_ + hidden_units_offset, - // gemm_data_type, - // hidden_units_, // k - // &beta, - // logits_buf_ + vocab_size_units_offset, - // CUDA_R_32F, - // vocab_size_padded_, /* n */ - // CUDA_R_32F, - // cublasGemmAlgo_t(-1)); - // } - // else { - // FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - // const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - // float alpha = 1.0f; - // float beta = 0.0f; - // cublas_wrapper_->Gemm(CUBLAS_OP_T, - // CUBLAS_OP_N, - // local_vocab_size, // n - // local_batch_size * beam_width, - // hidden_units_, // k - // &alpha, - // padded_embedding_kernel_ptr_ - // + tensor_para_.rank_ * local_vocab_size * hidden_units_, - // gemm_data_type, - // hidden_units_, // k - // normed_decoder_output_buf_ + hidden_units_offset, - // gemm_data_type, - // hidden_units_, // k - // &beta, - // nccl_logits_buf_ + vocab_size_units_offset - // + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - // CUDA_R_32F, - // local_vocab_size, /* n */ - // CUDA_R_32F, - // cublasGemmAlgo_t(-1)); + if (tensor_para_.world_size_ == 1) { + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_padded_, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_padded_, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + } + else { + FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + const int local_vocab_size = 
vocab_size_padded_ / tensor_para_.world_size_; + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + local_vocab_size, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_ + + tensor_para_.rank_ * local_vocab_size * hidden_units_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + nccl_logits_buf_ + vocab_size_units_offset + + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + CUDA_R_32F, + local_vocab_size, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); - // ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - // nccl_logits_buf_ + vocab_size_units_offset, - // local_batch_size * beam_width * local_vocab_size, - // tensor_para_.rank_, - // tensor_para_, - // stream_); - // invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - // nccl_logits_buf_ + vocab_size_units_offset, - // tensor_para_.world_size_, - // local_batch_size * beam_width, - // local_vocab_size, - // stream_); - // } + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + local_batch_size * beam_width * local_vocab_size, + tensor_para_.rank_, + tensor_para_, + stream_); + invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + tensor_para_.world_size_, + local_batch_size * beam_width, + local_vocab_size, + stream_); + } int tmp_local_batch_size = local_batch_size; diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index a1d78c852..0595358a0 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -44,7 +44,7 @@ void LlamaContextDecoder::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); -// TODO: 这里的SiluFfnLayer是不支持int8的dataType,再不做修改的情况下试一下会不会报错。 + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -60,7 +60,8 @@ void LlamaContextDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index 4685217b0..b68c82207 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -55,7 +55,8 @@ void LlamaDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template From 0e6ae5b1a48ad55a451788c39815518d9d6b5c3f Mon Sep 17 00:00:00 2001 From: shaoxin Date: Fri, 30 Jun 2023 20:38:06 +0800 Subject: [PATCH 23/27] rm unused code --- examples/cpp/llama/llama_example.cc | 18 ------------------ src/fastertransformer/.DS_Store | Bin 6148 -> 0 bytes src/fastertransformer/models/.DS_Store | Bin 8196 -> 0 bytes 3 files changed, 18 deletions(-) delete mode 100644 src/fastertransformer/.DS_Store delete mode 100644 src/fastertransformer/models/.DS_Store diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 14cff7bee..b42aea9bc 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -277,7 +277,6 @@ void llama_example(const 
INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - printf("******* Enter gpt_weights ********** \n"); const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, @@ -293,8 +292,6 @@ void llama_example(const INIReader reader) prompt_learning_type, prefix_prompt_table_pair); - printf("******* Enter loadModel ********* \n"); - gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { @@ -312,8 +309,6 @@ void llama_example(const INIReader reader) false, // with_relative_position_bias true); // causal_mask - printf("******* Inilize Llama ********* \n"); - Llama gpt = Llama(head_num, size_per_head, inter_size, @@ -349,13 +344,10 @@ void llama_example(const INIReader reader) int* d_output_ids; int* d_sequence_lengths; - printf("******* deviceMalloc start ********* \n"); deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); - printf("******* deviceMalloc end ********* \n"); - std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -421,8 +413,6 @@ void llama_example(const INIReader reader) print_mem_usage(); - printf("******* before cudaDeviceSynchronize ********* \n"); - int ite = 1; cudaDeviceSynchronize(); mpi::barrier(); @@ -433,19 +423,16 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") - printf("******* before gpt.forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - printf("******* end gpt.forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); - printf("******* end cudaDeviceSynchronize ********* \n"); if (rank == 0) { @@ -458,11 +445,8 @@ void llama_example(const INIReader reader) size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; - printf("******* before cudaD2Hcpy ********* \n"); - cudaD2Hcpy(hBuf, d_output_ids, outCount); - printf("******* end cudaD2Hcpy ********* \n"); { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; @@ -496,11 +480,9 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") - printf("******* before gpt forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - printf("******* after gpt forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store deleted file mode 100644 index d4edc9af914042cba222ab33425eac19388ca6d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%We}f6unOKm{1WS1X5Wbjl?!8Y0_4y*o3q!kYGa)EC3CeOlU`)j-$zhfTBoQ z1K+?eu!SGtUs%Dp9v96dQiC=d@-8 z-BJSzHO7b@kxvtfk&GDyi~@hV0vx+FI;6)mpi@f6?>3_N9mNj=IdlW*g2#u%eWdN5 z%1|P@PD7f48Nh#nkApGbg*-&fC&{=n#+Q{bDr)8k%n`;}VT;=t_v4?q z?rbvFEBWe<1Ah`Xo8LuYqgX0etcta1z3QLHSw9}c(@Ad-z2MqYDFgSsJaC`+qnT5` zaVW!h;D@6~Ir!cPl$X!_(37*CoQB>+`Sz0zt7=u9`qq5j+HW@OcI#l#u;&LYly}?v zi$&GCdFSrEC(dbb7RnF!{xE^lRM}OH=fLaBtp9C43IiD)qn&-qbOyz}TpfR)KmE&P zxG8PEUAi{6pv`WlVehm$O%|uqZa3_`R&zxhUOenVaXz6lv#1quuKnJY5?d!i&7m}QN!#eEwH3%xk})_frq4VmZ?JGb1Ed5d6LWIjh|-Lz>F|=tir@07eO>s!4Q&{ zokfK~+P9>_&d&)L)+xa{<`ZxJ(c&F!M)1$zU?0z@;ki3m={olK=n! 
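A note on the logits projection re-enabled in the Llama.cc hunk of [PATCH 22/27] above: it computes logits = hidden_state * W_emb^T over the padded vocabulary. With tensor parallelism each rank holds a vocabulary shard, computes its local [tokens, local_vocab] slice, the slices are all-gathered into a [world_size, tokens, local_vocab] buffer, and invokeTransposeAxis01 rearranges that into [tokens, world_size * local_vocab] = [tokens, vocab_size_padded]. The function below is a plain reference of the per-rank math only (the real code uses a cuBLAS GEMM with FP32 output); its name and flat layouts are assumptions for illustration.

// Illustrative sketch only -- not part of the FasterTransformer patch.
#include <cstddef>
#include <vector>

// hidden: [tokens x hidden_units], emb_shard: [local_vocab x hidden_units] (this rank's rows of the
// embedding matrix). Returns this rank's logits slice [tokens x local_vocab].
static std::vector<float> localLogitsSketch(const std::vector<float>& hidden,
                                            const std::vector<float>& emb_shard,
                                            std::size_t               tokens,
                                            std::size_t               hidden_units,
                                            std::size_t               local_vocab)
{
    std::vector<float> logits(tokens * local_vocab, 0.f);
    for (std::size_t t = 0; t < tokens; ++t) {
        for (std::size_t v = 0; v < local_vocab; ++v) {
            float acc = 0.f;
            for (std::size_t h = 0; h < hidden_units; ++h) {
                acc += hidden[t * hidden_units + h] * emb_shard[v * hidden_units + h];
            }
            logits[t * local_vocab + v] = acc;
        }
    }
    return logits;
}
// End of sketch.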
diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store deleted file mode 100644 index de13d48fdd4e9e824db21adcab91399ef3727fa6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHML2uJA6n^eHnld)^0Ma-hMdCV*ZevoVT}n3&NN}Nu9RQVN4O>_iS53N3s)~At z-@sqs%pc*uaDwmI?v%Le6(O)I`z7}Gz4(20n!c2XL}%3BA=)G&50!0W4b2UO$GI+) zis@mp0H2;E@+3@Ti1Wq*+I2VuoB~b(r+`zyDeyNafM+%rZ^?6CkGj?=;1u{T72xj& zAC+y}*qOHKs{@Tn0br}>Rt3lEr$4OE9k6X@esa(lW1$}TBm?hU{L|i-K!K*Ktsx{{QU{WwtuOjh!9ew50;?}4SmWhOd^4y5E`dZLRqfbaG*3d$;LPfNvP#VpVoR;xQ zsUp4x?FX3ka-E9kszHCWys|n)+}vx5I3-<`(MpdZdX*A<0=l?K3$anqqJRv?*}%LZ z`W_CFILn*OAEL5SUAp#CgJ=z2w%t1xYxLQDAPQM z(`cv&aX$j&?Qxv;<+vk9X+Klkt{(8}UcJ}2Kbh=4Zf*HHyRGJQ%b&FN_O|@}-R5*! z_wGG>bnvowo?N7ow;75sPa0a+=hy8Q92G*k><-gJro3I4aT}sYUP5ePiA{gOTfZzN zVfjk}tphazFUOYbsl^%@Yu%E(5lYM|w(l6Mw0s4=RZ6(}90$KkZf@&6$On-$6-}E4txEF bA&z}OnbS6QrY(9<+=~DugKL}ue^i0r53^ Date: Mon, 3 Jul 2023 11:00:05 +0800 Subject: [PATCH 24/27] upload code: llama int8&share_context triton backend --- .../triton_backend/llama/LlamaTritonModel.cc | 20 +++++++++++++------ .../triton_backend/llama/LlamaTritonModel.h | 5 ++++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc index ea34983e4..7cd3f5103 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.cc @@ -34,12 +34,13 @@ std::shared_ptr AbstractTransformerModel::createLlamaM int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir"); - if (data_type == "half") { + if (data_type == "half" || data_type == "fp16") { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } #ifdef ENABLE_BF16 else if (data_type == "bf16") { @@ -47,7 +48,8 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } #endif else { @@ -55,7 +57,8 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } } @@ -63,11 +66,13 @@ template LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir): + std::string model_dir, + int int8_mode): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), shared_weights_(std::vector>>(ft::getDeviceCount())), - enable_custom_all_reduce_(enable_custom_all_reduce) + enable_custom_all_reduce_(enable_custom_all_reduce), + int8_mode_(int8_mode) { model_dir_ = 
model_dir; const std::string inifile{model_dir + "/config.ini"}; @@ -184,6 +189,7 @@ std::unique_ptr LlamaTritonModel::createMod false, cuda_device_prop_ptr.get(), attention_type, + int8_mode_, custom_all_reduce_comm, enable_custom_all_reduce_)); @@ -213,6 +219,7 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) pipeline_para_size_, pipeline_para_rank, use_gptj_residual_, + int8_mode_, prompt_learning_type_, prompt_learning_table_pair_); shared_weights_[device_id]->loadModel(model_dir_); @@ -230,6 +237,7 @@ std::string LlamaTritonModel::toString() << "\nprompt_learning_type_: " << static_cast(prompt_learning_type_) << "\nprompt_learning_start_id_: " << prompt_learning_start_id_ << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ + << "\nint8_mode: " << int8_mode_ << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << std::endl; return ss.str(); } diff --git a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h index da5a277cd..3ac8bdff4 100644 --- a/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h +++ b/src/fastertransformer/triton_backend/llama/LlamaTritonModel.h @@ -30,7 +30,8 @@ struct LlamaTritonModel: public AbstractTransformerModel { LlamaTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir); + std::string model_dir, + int int8_mode); ~LlamaTritonModel() = default; @@ -69,6 +70,8 @@ struct LlamaTritonModel: public AbstractTransformerModel { // residual type bool use_gptj_residual_ = false; + int int8_mode_ = 0; + // number of tasks (for prefix-prompt, p/prompt-tuning) size_t num_tasks_ = 0; int prompt_learning_start_id_ = 0; From c137bb5cb816a29eb33f531afd02a28f6f4d0fd2 Mon Sep 17 00:00:00 2001 From: shaoxin Date: Fri, 7 Jul 2023 15:26:03 +0800 Subject: [PATCH 25/27] fix bug: ft-llama-int8 output is incorrect --- src/fastertransformer/layers/FfnLayer.cc | 2 +- src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fastertransformer/layers/FfnLayer.cc b/src/fastertransformer/layers/FfnLayer.cc index 14bb5e3f6..4b18b54ad 100644 --- a/src/fastertransformer/layers/FfnLayer.cc +++ b/src/fastertransformer/layers/FfnLayer.cc @@ -81,7 +81,7 @@ void FfnLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors, c } // TODO: INT8 and Sparsity are currently not implemented (geglu or reglu) - const bool use_gated_activation = use_gated_activation_ && ffn_weights->intermediate_weight2.kernel != nullptr; + const bool use_gated_activation = use_gated_activation_ && (ffn_weights->intermediate_weight2.kernel != nullptr || ffn_weights->intermediate_weight2.int8_kernel != nullptr); // moe can't be used with use_gated_activation currently FT_CHECK(!(use_gated_activation && use_moe)); diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index a8dadefea..b7e0fe002 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -50,7 +50,7 @@ template LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() { if (is_maintain_buffer == true) { - for (int i = 0; i < 12; i++) { + for (int i = 0; i < 14; i++) { if (!use_gptj_residual_ && i != 
attention_dense_bias_weight_id) { cudaFree(weights_ptr[i]); } From 99a844cc7b434e1175277c7e438cbf1c348dca89 Mon Sep 17 00:00:00 2001 From: yandai Date: Sun, 9 Jul 2023 00:47:58 +0800 Subject: [PATCH 26/27] fix llama --- src/fastertransformer/models/llama/Llama.cc | 17 ++++++++++++----- .../models/llama/LlamaWeight.cc | 2 ++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 1eac9fd20..3c1bfc9e3 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -820,11 +820,18 @@ void Llama::forward(std::unordered_map* output_ten sizeof(T) * vocab_size_ * hidden_units_, cudaMemcpyDeviceToDevice, stream_); - cudaMemcpyAsync(padded_embedding_bias_, - gpt_weights->post_decoder_embedding.bias, - sizeof(T) * vocab_size_, - cudaMemcpyDeviceToDevice, - stream_); + if (gpt_weights->post_decoder_embedding.bias) { + cudaMemcpyAsync(padded_embedding_bias_, + gpt_weights->post_decoder_embedding.bias, + sizeof(T) * vocab_size_, + cudaMemcpyDeviceToDevice, + stream_); + } else { + cudaMemsetAsync(padded_embedding_bias_, + 0, + sizeof(T) * vocab_size_, + stream_); + } sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index e9e11b6a1..84c4aaa08 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -89,6 +89,7 @@ LlamaWeight::~LlamaWeight() post_decoder_layernorm.beta = nullptr; post_decoder_layernorm.gamma = nullptr; post_decoder_embedding.kernel = nullptr; + post_decoder_embedding.bias = nullptr; is_maintain_buffer = false; } } @@ -196,6 +197,7 @@ void LlamaWeight::setWeightPtr() post_decoder_layernorm.beta = weights_ptr[1]; post_decoder_layernorm.gamma = weights_ptr[2]; post_decoder_embedding.kernel = weights_ptr[3]; + post_decoder_embedding.bias = nullptr; // prompt learning tables: set weight ptr if (malloc_load_prompt_weights_) { From f1dd8fb4bd4952eadef90585daf6d5d1705c84ac Mon Sep 17 00:00:00 2001 From: yandai Date: Sun, 16 Jul 2023 17:09:38 +0800 Subject: [PATCH 27/27] remove padded_embedding_bias_ --- src/fastertransformer/models/llama/Llama.cc | 16 ---------------- src/fastertransformer/models/llama/Llama.h | 1 - 2 files changed, 17 deletions(-) diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 3c1bfc9e3..4cbe593e6 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -108,9 +108,6 @@ void Llama::allocateBuffer( padded_embedding_kernel_ = (T*)(allocator_->reMalloc(padded_embedding_kernel_, sizeof(T) * hidden_units_ * vocab_size_padded_, true)); padded_embedding_kernel_ptr_ = padded_embedding_kernel_; - - padded_embedding_bias_ = - (T*)(allocator_->reMalloc(padded_embedding_bias_, sizeof(T) * vocab_size_padded_, true)); } input_attention_mask_ = (T*)(allocator_->reMalloc( @@ -184,7 +181,6 @@ void Llama::freeBuffer() if (vocab_size_ != vocab_size_padded_) { padded_embedding_kernel_ptr_ = nullptr; allocator_->free((void**)(&padded_embedding_kernel_)); - allocator_->free((void**)(&padded_embedding_bias_)); } allocator_->free((void**)(&input_attention_mask_)); @@ -820,18 +816,6 @@ void Llama::forward(std::unordered_map* output_ten sizeof(T) * vocab_size_ * hidden_units_, cudaMemcpyDeviceToDevice, stream_); - if (gpt_weights->post_decoder_embedding.bias) { - 
cudaMemcpyAsync(padded_embedding_bias_, - gpt_weights->post_decoder_embedding.bias, - sizeof(T) * vocab_size_, - cudaMemcpyDeviceToDevice, - stream_); - } else { - cudaMemsetAsync(padded_embedding_bias_, - 0, - sizeof(T) * vocab_size_, - stream_); - } sync_check_cuda_error(); } diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index a0958280e..df621f009 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -84,7 +84,6 @@ class Llama: public BaseLayer { protected: T* padded_embedding_kernel_; - T* padded_embedding_bias_; const T* padded_embedding_kernel_ptr_; T* input_attention_mask_;