forked from jaymody/picoGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils_nonp.py
109 lines (92 loc) · 3.59 KB
/
utils_nonp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import os
import re
import requests
import tensorflow as tf
from tqdm import tqdm
from encoder import get_encoder
def download_gpt2_files(model_size, model_dir):
    """Download the GPT-2 checkpoint files for `model_size` into `model_dir`.

    Args:
        model_size: one of "124M", "355M", "774M", "1558M".
        model_dir: existing directory the files are written into.

    Raises:
        requests.HTTPError: if any file download returns a non-2xx status.
    """
    assert model_size in ["124M", "355M", "774M", "1558M"]
    for filename in [
        "checkpoint",
        "encoder.json",
        "hparams.json",
        "model.ckpt.data-00000-of-00001",
        "model.ckpt.index",
        "model.ckpt.meta",
        "vocab.bpe",
    ]:
        url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
        # BUG FIX: the request URL must end with the file being fetched;
        # the previous code had a broken literal in place of {filename}.
        r = requests.get(f"{url}/{model_size}/{filename}", stream=True)
        r.raise_for_status()
        with open(os.path.join(model_dir, filename), "wb") as f:
            file_size = int(r.headers["content-length"])
            # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
            chunk_size = 1000
            with tqdm(
                ncols=100,
                desc="Fetching " + filename,
                total=file_size,
                unit_scale=True,
                unit="b",
            ) as pbar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    # BUG FIX: the final chunk may be shorter than chunk_size,
                    # so advance by the actual number of bytes received.
                    pbar.update(len(chunk))
def squeeze(array):
    """
    Custom function to mimic np.squeeze() for GPT model parameters.
    It removes dimensions (nested lists) of size 1 from the shape of an array.

    Args:
        array: a (possibly nested) list, or a scalar.

    Returns:
        The same data with every length-1 list level collapsed; scalars are
        returned unchanged.
    """
    if isinstance(array, list):
        # A list of length 1 is a singleton dimension: collapse it and
        # keep squeezing whatever it wrapped.
        if len(array) == 1:
            return squeeze(array[0])
        # BUG FIX: squeeze every element, keeping scalars. The previous
        # version filtered on `isinstance(sub_array, list)`, which silently
        # dropped all scalar entries (e.g. squeeze([1, 2, 3]) returned []).
        return [squeeze(sub_array) for sub_array in array]
    # Not a list, return the element as is
    return array
def set_in_nested_dict(d, keys, val):
    """
    Recursively set `val` at the path `keys` inside the nested dict `d`.

    An empty key path means `val` itself is the result; otherwise missing
    intermediate dictionaries are created on the way down. Returns `d`
    (mutated in place) or `val` when `keys` is empty.
    """
    if not keys:
        return val
    head, rest = keys[0], keys[1:]
    # setdefault creates the intermediate dict only when the key is absent.
    d[head] = set_in_nested_dict(d.setdefault(head, {}), rest, val)
    return d
def load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams):
    """
    Loads GPT-2 parameters from a TensorFlow checkpoint without using NumPy.

    Variables named "model/h<N>/..." go into params["blocks"][N]; everything
    else (e.g. embeddings, final layer norm) goes into the top level of the
    returned dict. Values are nested Python lists with singleton dimensions
    squeezed out.
    """
    params = {"blocks": [{} for _ in range(hparams["n_layer"])]}
    prefix = "model/"
    for var_name, _ in tf.train.list_variables(tf_ckpt_path):
        raw = tf.train.load_variable(tf_ckpt_path, var_name)
        # Convert the checkpoint tensor to plain nested lists, then drop
        # any size-1 dimensions.
        value = squeeze(raw.tolist())
        short_name = var_name[len(prefix):]
        if short_name.startswith("h"):
            # Transformer block variable: route to the matching block index.
            block_match = re.match(r"h([0-9]+)/(.*)", short_name)
            block_idx = int(block_match[1])
            path = block_match[2].split("/")
            set_in_nested_dict(params["blocks"][block_idx], path, value)
        else:
            set_in_nested_dict(params, short_name.split("/"), value)
    return params
def load_encoder_hparams_and_params(model_size, models_dir):
    """Load the tokenizer, hyperparameters, and weights for a GPT-2 model.

    Downloads the model files first if no checkpoint is found under
    `models_dir/model_size`.

    Args:
        model_size: one of "124M", "355M", "774M", "1558M".
        models_dir: root directory holding one subdirectory per model size.

    Returns:
        A (encoder, hparams, params) tuple: the BPE encoder, the parsed
        hparams.json dict, and the nested-dict model parameters.
    """
    assert model_size in ["124M", "355M", "774M", "1558M"]
    model_dir = os.path.join(models_dir, model_size)
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if not tf_ckpt_path:  # download files if necessary
        os.makedirs(model_dir, exist_ok=True)
        download_gpt2_files(model_size, model_dir)
        tf_ckpt_path = tf.train.latest_checkpoint(model_dir)

    encoder = get_encoder(model_size, models_dir)
    # BUG FIX: use a context manager so the hparams file handle is closed;
    # the previous json.load(open(...)) leaked it.
    with open(os.path.join(model_dir, "hparams.json")) as f:
        hparams = json.load(f)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)
    return encoder, hparams, params