Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[enhancement] support llama #575

Open
wants to merge 34 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
f3bd8e6
get llama coded
void-main Apr 23, 2023
a32fc1d
make the code work :yay:
void-main Apr 24, 2023
ce8700f
fix llama rms ln
void-main Apr 24, 2023
91989cb
add bf16 support
void-main Apr 25, 2023
4bc97c3
add triton model for streaming callback
void-main Apr 25, 2023
a6d51ec
register RMS for bf16
void-main Apr 25, 2023
7a72ca3
revert bf16
void-main Apr 25, 2023
9820565
revert bf16
void-main Apr 25, 2023
bfeebef
bugfix
void-main Apr 25, 2023
0379cc5
add megatron llama convert
void-main Apr 28, 2023
d65adf1
Update src/fastertransformer/triton_backend/llama/LlamaTritonModelIns…
void-main Apr 28, 2023
cf1b9b1
donot callback too frequnetly
void-main Apr 29, 2023
95afed4
add bf16
void-main Apr 29, 2023
9aee02e
make sure examples work for bf16
void-main Apr 29, 2023
8ddac81
support bf16 conversion with bfloat 16 numpy ext
void-main Apr 30, 2023
694faec
Merge branch 'main' of https://github.com/void-main/FasterTransformer…
void-main Apr 30, 2023
40fbe48
bugfix
void-main May 1, 2023
f6cf9da
load layernorm_eps from config; change cb default to 5
void-main May 1, 2023
d752088
Merge branch 'main' of https://github.com/void-main/FasterTransformer…
void-main May 1, 2023
da2ad14
update megatron convert script
void-main May 2, 2023
abd1e4d
fix callback issue
void-main May 6, 2023
b942806
Merge branch 'NVIDIA:main' into main
void-main May 6, 2023
50fdb0c
fix name
void-main Jun 11, 2023
3c03564
support int8 & share context
Jun 29, 2023
6f0469f
update code with comments from reviewer
Jun 30, 2023
0e6ae5b
rm unused code
Jun 30, 2023
98435f1
Merge pull request #1 from CN-COTER/dev_support_int8_share_ctx
void-main Jul 2, 2023
9e374ef
upload code: llama int8&share_context triton backend
Jul 3, 2023
69de159
Merge pull request #2 from CN-COTER/dev_support_int8_share_ctx
void-main Jul 3, 2023
c137bb5
fix bug: ft-llama-int8 output is incorrect
Jul 7, 2023
3469ff0
Merge pull request #3 from CN-COTER/dev_support_int8_share_ctx
void-main Jul 7, 2023
99a844c
fix llama
yandai Jul 8, 2023
f1dd8fb
remove padded_embedding_bias_
yandai Jul 16, 2023
e770ddf
Merge pull request #4 from yandai/llama
void-main Jul 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,12 @@ add_library(transformer-shared SHARED
$<TARGET_OBJECTS:ParallelGptDecoderLayerWeight>
$<TARGET_OBJECTS:ParallelGptTritonBackend>
$<TARGET_OBJECTS:ParallelGptWeight>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaContextDecoder>
$<TARGET_OBJECTS:LlamaDecoder>
$<TARGET_OBJECTS:LlamaDecoderLayerWeight>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:LlamaWeight>
$<TARGET_OBJECTS:T5Common>
$<TARGET_OBJECTS:T5Decoder>
$<TARGET_OBJECTS:T5Decoding>
Expand Down Expand Up @@ -428,9 +434,9 @@ target_link_libraries(transformer-shared PUBLIC
-lnvToolsExt
)
endif()

if (ENABLE_FP8)
target_link_libraries(transformer-shared PUBLIC
target_link_libraries(transformer-shared PUBLIC
$<TARGET_OBJECTS:BertFP8>
$<TARGET_OBJECTS:BertFP8Weight>
$<TARGET_OBJECTS:DecoderSelfAttentionFP8Layer>
Expand Down
2 changes: 2 additions & 0 deletions examples/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ add_subdirectory(gptj)
add_subdirectory(gptneox)
add_subdirectory(multi_gpu_gpt)

add_subdirectory(llama)

if(ENABLE_FP8)
add_subdirectory(gpt_fp8)
add_subdirectory(bert_fp8)
Expand Down
22 changes: 22 additions & 0 deletions examples/cpp/llama/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build the llama_example executable from llama_example.cc and link it against
# the cuBLAS / CUDA runtime libraries and the project targets listed below.
add_executable(llama_example llama_example.cc)
target_link_libraries(llama_example
  PUBLIC
    -lcublas
    -lcublasLt
    -lcudart
    Llama
    nvtx_utils
    gpt_example_utils
    word_list
    mpi_utils
    nccl_utils
)

# Build the llama_triton_example executable from llama_triton_example.cc and
# link it against cuBLAS / CUDA runtime / pthread and the project targets
# listed below (link order is kept as originally written).
add_executable(llama_triton_example llama_triton_example.cc)
target_link_libraries(llama_triton_example
  PUBLIC
    -lcublas
    -lcublasLt
    -lcudart
    -lpthread
    LlamaTritonBackend
    TransformerTritonBackend
    custom_ar_comm
    gpt_example_utils
    word_list
    mpi_utils
    nccl_utils
    nvtx_utils
)
2 changes: 2 additions & 0 deletions examples/cpp/llama/bad_words.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
7768,3908
1,2
16 changes: 16 additions & 0 deletions examples/cpp/llama/check_with_huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import transformers

from transformers import LlamaForCausalLM, LlamaTokenizer

# Path to the HuggingFace-format LLaMA checkpoint; used for both the tokenizer
# and the model.
model_dir = '/data/llama-7b-hf'

tokenizer = LlamaTokenizer.from_pretrained(model_dir)

# Tokenize a fixed prompt into PyTorch tensors.
prompt = "Hey, are you consciours? Can you talk to me?"
encoded_prompt = tokenizer(prompt, return_tensors='pt')

# Load the model and print its configuration as a plain dict.
model = LlamaForCausalLM.from_pretrained(model_dir)
print(vars(model.config))

# Run a single forward pass (not generation) and print the raw model outputs,
# including hidden states, for manual comparison.
forward_outputs = model.forward(encoded_prompt.input_ids, output_hidden_states=True)
print(forward_outputs)

# Decode a hard-coded list of token ids back to text and print it.
token_ids = [0,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366,1136,455,2470,29973,1815,366,5193,304,592,29973,18637,29892,526,366]
print(tokenizer.decode(token_ids))
187 changes: 187 additions & 0 deletions examples/cpp/llama/huggingface_llama_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import configparser
import numpy as np
from pathlib import Path

import os
from transformers import LlamaForCausalLM

def get_weight_data_type(data_type):
    """Map a weight data-type name to the matching NumPy dtype.

    Args:
        data_type: "fp32" or "fp16".

    Returns:
        np.float32 for "fp32", np.float16 for "fp16".

    Raises:
        AssertionError: if data_type is any other value.
    """
    if data_type == "fp32":
        return np.float32
    if data_type == "fp16":
        return np.float16
    # Raise explicitly rather than `assert False`: assert statements are removed
    # under `python -O`, which would make this function silently return None.
    raise AssertionError(f"Invalid weight data type {data_type}")


def split_and_convert_process(saved_dir, factor, key, val):
    """Write one weight array to disk, sharding it when its key calls for it.

    The rule is chosen by substring match on ``key``:
      * layernorm weights are written whole to ``<key>.bin``;
      * attention dense / MLP down-projection weights are split along axis 0;
      * MLP gate / up-projection and fused QKV weights are split along the
        last axis.
    Split weights are written as ``<key>.<shard index>.bin``. A key matching
    none of the rules only produces an error message on stdout.

    Args:
        saved_dir: output directory path (string) that file names are appended to.
        factor: number of equal shards to cut split weights into.
        key: weight name; selects the rule and forms the output file name.
        val: NumPy array holding the weight.
    """
    whole_markers = ("input_layernorm.weight", "post_attention_layernorm.weight")
    first_axis_markers = ("attention.dense.weight", "mlp.down_proj.weight")
    last_axis_markers = (
        "mlp.gate_proj.weight",
        "mlp.up_proj.weight",
        "attention.query_key_value.weight",
    )

    def key_has_any(markers):
        return any(marker in key for marker in markers)

    if key_has_any(whole_markers):
        # Written once, unsplit.
        val.tofile(saved_dir + "/" + key + ".bin")
        return

    if key_has_any(first_axis_markers):
        split_axis = 0
    elif key_has_any(last_axis_markers):
        split_axis = -1
    else:
        print("[ERROR] cannot find key '{}'".format(key))
        return

    for shard_idx, shard in enumerate(np.split(val, factor, axis=split_axis)):
        shard.tofile(saved_dir + "/" + key + ".%d.bin" % shard_idx)

def split_and_convert(args):
    """Convert a HuggingFace LLaMA checkpoint into raw per-weight binary files.

    Loads the model named by ``args.in_file`` with
    ``LlamaForCausalLM.from_pretrained``, writes a ``config.ini`` describing
    the model, and dumps the weights as ``.bin`` files under
    ``<args.saved_dir>/<args.infer_gpu_num>-gpu/``. Per-layer weights are
    written through ``split_and_convert_process`` using
    ``factor = infer_gpu_num // trained_gpu_num``.

    Args:
        args: parsed command-line namespace providing ``saved_dir``,
            ``in_file``, ``trained_gpu_num``, ``infer_gpu_num``,
            ``weight_data_type`` and ``model_name``.

    Raises:
        AssertionError: if ``infer_gpu_num`` is not a multiple of
            ``trained_gpu_num``.
    """
    saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(saved_dir, exist_ok=True)

    t_gpu_num = args.trained_gpu_num
    i_gpu_num = args.infer_gpu_num
    assert i_gpu_num % t_gpu_num == 0, \
        f"infer_gpu_num ({i_gpu_num}) must be a multiple of trained_gpu_num ({t_gpu_num})"

    # Integer floor division; exact because divisibility was checked above.
    factor = i_gpu_num // t_gpu_num

    model = LlamaForCausalLM.from_pretrained(args.in_file)
    hf_config = vars(model.config)
    print(f"hf_config: {hf_config}")

    print("named parameters:")
    for name, _ in model.named_parameters():
        print(f"- {name}")

    hidden_size = hf_config["hidden_size"]
    head_num = hf_config["num_attention_heads"]
    head_size = hidden_size // head_num
    num_layers = hf_config["num_hidden_layers"]

    np_weight_data_type = get_weight_data_type(args.weight_data_type)

    # Saving config.ini is best-effort: a failure is reported on stdout and the
    # weight conversion still proceeds.
    try:
        model_name = args.model_name
        config = configparser.ConfigParser()
        config['llama'] = {}
        config['llama']['model_name'] = model_name
        config['llama']["head_num"] = str(head_num)
        config['llama']["size_per_head"] = str(head_size)
        config['llama']["inter_size"] = str(hf_config["intermediate_size"])
        config['llama']["num_layer"] = str(num_layers)
        config['llama']["rotary_embedding"] = str(head_size)
        config['llama']["vocab_size"] = str(hf_config["vocab_size"])
        config['llama']["start_id"] = str(hf_config["bos_token_id"])
        config['llama']["end_id"] = str(hf_config["eos_token_id"])
        config['llama']["weight_data_type"] = args.weight_data_type

        with open((Path(saved_dir) / "config.ini").as_posix(), 'w') as configfile:
            config.write(configfile)
    except Exception as e:
        print("Fail to save the config in config.ini.")
        print(e)

    param_to_weights = lambda param: param.detach().cpu().numpy().astype(np_weight_data_type)

    # Build the state dict once; the original rebuilt it for every weight lookup.
    state_dict = model.state_dict()

    # Weights that are transposed before being written:
    # (suffix in the HuggingFace state dict, suffix used for the output file).
    transposed_weights = (
        ('self_attn.o_proj.weight', 'attention.dense.weight'),
        ('mlp.down_proj.weight', 'mlp.down_proj.weight'),
        ('mlp.gate_proj.weight', 'mlp.gate_proj.weight'),
        ('mlp.up_proj.weight', 'mlp.up_proj.weight'),
    )
    # Layernorm weights are written as-is under the same name.
    layernorm_weights = (
        'input_layernorm.weight',
        'post_attention_layernorm.weight',
    )

    # layer-wise weights, example:
    #   - model.layers.0.self_attn.q_proj.weight
    #   - model.layers.0.self_attn.k_proj.weight
    #   - model.layers.0.self_attn.v_proj.weight
    #   - model.layers.0.self_attn.o_proj.weight
    #   - model.layers.0.mlp.gate_proj.weight
    #   - model.layers.0.mlp.down_proj.weight
    #   - model.layers.0.mlp.up_proj.weight
    #   - model.layers.0.input_layernorm.weight
    #   - model.layers.0.post_attention_layernorm.weight
    for l in range(num_layers):
        print(f"converting layer {l}")
        layer_prefix = f'model.layers.{l}'

        # Merge Q, K and V into a single weight: stack them on a new leading
        # axis, then move that axis to the middle with the (2, 0, 1) transpose
        # (approach copied from huggingface_gptj_ckpt_convert.py).
        qkv_weights = np.stack([
            param_to_weights(state_dict[f'{layer_prefix}.self_attn.{proj}.weight'])
            for proj in ('q_proj', 'k_proj', 'v_proj')
        ])
        qkv_weights = np.transpose(qkv_weights, (2, 0, 1))
        split_and_convert_process(
            saved_dir, factor, f'{layer_prefix}.attention.query_key_value.weight', qkv_weights)

        # Attention dense and MLP weights.
        for hf_suffix, out_suffix in transposed_weights:
            weight = param_to_weights(state_dict[f'{layer_prefix}.{hf_suffix}']).T
            split_and_convert_process(saved_dir, factor, f'{layer_prefix}.{out_suffix}', weight)

        # LayerNorm weights.
        for ln_suffix in layernorm_weights:
            weight = param_to_weights(state_dict[f'{layer_prefix}.{ln_suffix}'])
            split_and_convert_process(saved_dir, factor, f'{layer_prefix}.{ln_suffix}', weight)

        print(f"done layer {l}")

    # final common weights (saved_dir already ends with "/")
    for name, param in model.named_parameters():
        if name == 'model.embed_tokens.weight':
            param_to_weights(param).tofile(saved_dir + "model.wte.weight.bin")
        elif name == 'model.norm.weight':
            param_to_weights(param).tofile(saved_dir + "model.final_layernorm.weight.bin")
        elif name == 'lm_head.weight':
            param_to_weights(param).tofile(saved_dir + "model.lm_head.weight.bin")


if __name__ == "__main__":
    # Command-line entry point: parse the options, echo them, run the conversion.
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    # saved_dir is used as a directory: a "<infer_gpu_num>-gpu/" subdirectory is created inside it.
    parser.add_argument('-saved_dir', '-o', type=str, help='directory to write the converted weights to', required=True)
    parser.add_argument('-in_file', '-i', type=str, help='file name of input checkpoint file', required=True)
    # Help text previously duplicated the inference option's description.
    parser.add_argument('-trained_gpu_num', '-t_g', type=int, help='How many gpus for training', default=1)
    parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True)
    parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"])
    parser.add_argument('-model_name', '-m_n', type=str, help='model name', required=True)

    args = parser.parse_args()
    print("\n=============== Argument ===============")
    for key, value in vars(args).items():
        print("{}: {}".format(key, value))
    print("========================================")

    split_and_convert(args)
32 changes: 32 additions & 0 deletions examples/cpp/llama/llama_config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[ft_instance_hyperparameter]
data_type=fp16
enable_custom_all_reduce=0

tensor_para_size=1
pipeline_para_size=1

model_name=llama_7b
model_dir=/data/llama-7b-hf-converted/1-gpu

[request]
beam_width=1 # beam width for beam search
top_k=1 ; k value for top k sampling
top_p=0.0 ; p value for top p sampling
temperature=1.0 ; Use for sampling
repetition_penalty=1.0 ; Use for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty is allowed.
len_penalty=0.0
beam_search_diversity_rate=0.0
request_batch_size=8 # determined by the request
request_output_len=32 # determined by the request

[llama_7b]
head_num = 32
size_per_head = 128
inter_size = 11008
num_layer = 32
rotary_embedding = 128
vocab_size = 32000
start_id = 0

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding layernorm_eps=1e-06 here would help beginners.

end_id = 1
weight_data_type = fp16
Loading