forked from jaymody/picoGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils_nonp.py
109 lines (92 loc) · 3.59 KB
/
utils_nonp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import os
import re
import requests
import tensorflow as tf
from tqdm import tqdm
from encoder import get_encoder
def download_gpt2_files(model_size, model_dir):
    """Download the GPT-2 checkpoint files for `model_size` into `model_dir`.

    Args:
        model_size: one of "124M", "355M", "774M", "1558M".
        model_dir: existing directory the files are written into.

    Raises:
        requests.HTTPError: if any file download returns a non-2xx status.
    """
    assert model_size in ["124M", "355M", "774M", "1558M"]
    for filename in [
        "checkpoint",
        "encoder.json",
        "hparams.json",
        "model.ckpt.data-00000-of-00001",
        "model.ckpt.index",
        "model.ckpt.meta",
        "vocab.bpe",
    ]:
        url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
        # BUG FIX: the request URL must end with the file being fetched;
        # the previous code had a broken literal in place of {filename}.
        r = requests.get(f"{url}/{model_size}/{filename}", stream=True)
        r.raise_for_status()
        with open(os.path.join(model_dir, filename), "wb") as f:
            file_size = int(r.headers["content-length"])
            # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
            chunk_size = 1000
            with tqdm(
                ncols=100,
                desc="Fetching " + filename,
                total=file_size,
                unit_scale=True,
                unit="b",
            ) as pbar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    # BUG FIX: the final chunk may be shorter than chunk_size,
                    # so advance by the actual number of bytes received.
                    pbar.update(len(chunk))
def squeeze(array):
    """
    Custom function to mimic np.squeeze() for GPT model parameters.
    It removes dimensions (nested lists) of size 1 from the shape of an array.

    Args:
        array: a (possibly nested) list, or a scalar.

    Returns:
        The same data with every length-1 list level collapsed; scalars are
        returned unchanged.
    """
    if isinstance(array, list):
        # A list of length 1 is a singleton dimension: collapse it and
        # keep squeezing whatever it wrapped.
        if len(array) == 1:
            return squeeze(array[0])
        # BUG FIX: squeeze every element, keeping scalars. The previous
        # version filtered on `isinstance(sub_array, list)`, which silently
        # dropped all scalar entries (e.g. squeeze([1, 2, 3]) returned []).
        return [squeeze(sub_array) for sub_array in array]
    # Not a list, return the element as is
    return array
def set_in_nested_dict(d, keys, val):
    """
    Recursively set `val` at the path `keys` inside the nested dict `d`.

    An empty key path means `val` itself is the result; otherwise missing
    intermediate dictionaries are created on the way down. Returns `d`
    (mutated in place) or `val` when `keys` is empty.
    """
    if not keys:
        return val
    head, rest = keys[0], keys[1:]
    # setdefault creates the intermediate dict only when the key is absent.
    d[head] = set_in_nested_dict(d.setdefault(head, {}), rest, val)
    return d
def load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams):
    """
    Loads GPT-2 parameters from a TensorFlow checkpoint without using NumPy.

    Variables named "model/h<N>/..." go into params["blocks"][N]; everything
    else (e.g. embeddings, final layer norm) goes into the top level of the
    returned dict. Values are nested Python lists with singleton dimensions
    squeezed out.
    """
    params = {"blocks": [{} for _ in range(hparams["n_layer"])]}
    prefix = "model/"
    for var_name, _ in tf.train.list_variables(tf_ckpt_path):
        raw = tf.train.load_variable(tf_ckpt_path, var_name)
        # Convert the checkpoint tensor to plain nested lists, then drop
        # any size-1 dimensions.
        value = squeeze(raw.tolist())
        short_name = var_name[len(prefix):]
        if short_name.startswith("h"):
            # Transformer block variable: route to the matching block index.
            block_match = re.match(r"h([0-9]+)/(.*)", short_name)
            block_idx = int(block_match[1])
            path = block_match[2].split("/")
            set_in_nested_dict(params["blocks"][block_idx], path, value)
        else:
            set_in_nested_dict(params, short_name.split("/"), value)
    return params
def load_encoder_hparams_and_params(model_size, models_dir):
    """Load the tokenizer, hyperparameters, and weights for a GPT-2 model.

    Downloads the model files first if no checkpoint is found under
    `models_dir/model_size`.

    Args:
        model_size: one of "124M", "355M", "774M", "1558M".
        models_dir: root directory holding one subdirectory per model size.

    Returns:
        A (encoder, hparams, params) tuple: the BPE encoder, the parsed
        hparams.json dict, and the nested-dict model parameters.
    """
    assert model_size in ["124M", "355M", "774M", "1558M"]
    model_dir = os.path.join(models_dir, model_size)
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if not tf_ckpt_path:  # download files if necessary
        os.makedirs(model_dir, exist_ok=True)
        download_gpt2_files(model_size, model_dir)
        tf_ckpt_path = tf.train.latest_checkpoint(model_dir)

    encoder = get_encoder(model_size, models_dir)
    # BUG FIX: use a context manager so the hparams file handle is closed;
    # the previous json.load(open(...)) leaked it.
    with open(os.path.join(model_dir, "hparams.json")) as f:
        hparams = json.load(f)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)
    return encoder, hparams, params