convert_llama_to_hf.py
import glob
import os

import torch
from transformers import AutoTokenizer

from convert_to_hf import convert_to_hf
# specify the directory where the checkpoint files are stored
ckpt_dir = './llama_2b/'
# find all checkpoint files in the directory
ckpt_files = glob.glob(os.path.join(ckpt_dir, 'model_*.pt'))
# extract the numbers from the file names and find the largest
latest_ckpt_num = max(int(os.path.splitext(os.path.basename(f))[0].split('_')[-1]) for f in ckpt_files)
# construct the path to the latest checkpoint file
# use zfill to add leading zeros to the checkpoint number
ckp_path = os.path.join(ckpt_dir, f'model_{str(latest_ckpt_num).zfill(5)}.pt')
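# build a Hugging Face model from the raw checkpoint via the local convert_to_hf helper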
model_name = 'llama'
hf_model = convert_to_hf(model_name, ckp_path)
# Target vocabulary size
new_vocab_size = 50257
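# note: the resize below simply truncates the weight matrices, so it assumes the
# original vocabulary has at least new_vocab_size rows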
# Update the embedding layer
old_embed_tokens = hf_model.model.embed_tokens.weight.data
hf_model.model.embed_tokens = torch.nn.Embedding(new_vocab_size, old_embed_tokens.size(1))
hf_model.model.embed_tokens.weight.data = old_embed_tokens[:new_vocab_size, :]
# Update the lm_head
old_lm_head = hf_model.lm_head.weight.data
hf_model.lm_head = torch.nn.Linear(old_lm_head.size(1), new_vocab_size, bias=False)
hf_model.lm_head.weight.data = old_lm_head[:new_vocab_size, :]
hf_model.config.vocab_size = new_vocab_size
# construct the save folder name based on the model name and checkpoint number
sav_fold = f'/src/dmas-{model_name}-{latest_ckpt_num}_steps'
# create the save folder if it doesn't exist
os.makedirs(sav_fold, exist_ok=True)
hf_model.save_pretrained(sav_fold)
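# pair the exported model with the GPT-2 tokenizer, whose 50257-token vocabulary matches new_vocab_size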
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.save_pretrained(sav_fold)
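# Optional sanity check (a minimal sketch, assuming the exported folder loads with the
# standard transformers auto classes): reload the model and tokenizer from sav_fold and
# run a short generation to confirm the resized embeddings and lm_head round-trip.
from transformers import AutoModelForCausalLM
reloaded_model = AutoModelForCausalLM.from_pretrained(sav_fold)
reloaded_tokenizer = AutoTokenizer.from_pretrained(sav_fold)
inputs = reloaded_tokenizer('Hello, world!', return_tensors='pt')
output_ids = reloaded_model.generate(**inputs, max_new_tokens=10)
print(reloaded_tokenizer.decode(output_ids[0], skip_special_tokens=True))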