eagle_llama but Transformers does not recognize this architecture #8

flehn opened this issue Sep 2, 2024 · 1 comment

flehn commented Sep 2, 2024

Hello,
I want to run:

from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("NVEagle/Eagle-X5-13B-Chat")

But I get:
ValueError: The checkpoint you are trying to load has model type eagle_llama but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

Transformers version: 4.44.2


Nikitala0014 commented Sep 2, 2024

Hello! You probably won't be able to load it that way, because the eagle_llama architecture isn't included in the Transformers library, and I didn't find it in the model card on Hugging Face either.
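(As a quick sanity check, just a sketch: you can read the checkpoint's config.json from the Hub and see that it declares model_type "eagle_llama", which stock Transformers doesn't register, hence the ValueError.)

import json
from huggingface_hub import hf_hub_download

# Optional check: print the architecture the checkpoint declares
config_path = hf_hub_download("NVEagle/Eagle-X5-13B-Chat", "config.json")
with open(config_path) as f:
    print(json.load(f)["model_type"])  # prints "eagle_llama"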

What you might want to do instead is the following:

from eagle.model import EagleLlamaForCausalLM
from transformers import AutoTokenizer

model_path = "NVEagle/Eagle-X5-13B-Chat"  # the checkpoint from your question

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = EagleLlamaForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True
)

However, this alone isn't sufficient to get started: Eagle is a family of multimodal large language models, and its weights alone are just fine-tuned versions of LLaVA's. Additionally, the repository downloads other models, such as the vision experts and the CLIP encoder. Here's a simplified version of load_pretrained_model:

from transformers import AutoTokenizer, BitsAndBytesConfig
import torch
from eagle.model import *
from eagle.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN


def load_pretrained_model(model_path, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    kwargs = {"device_map": device_map, **kwargs}

    # If a non-CUDA device is requested, pin the whole model to it
    if device != "cuda":
        kwargs['device_map'] = {"": device}

    # Precision: 8-bit / 4-bit quantization via bitsandbytes, otherwise fp16
    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    model = EagleLlamaForCausalLM.from_pretrained(
        model_path,
        low_cpu_mem_usage=True,
        **kwargs
    )

    # Add the special image tokens the multimodal prompt format expects,
    # then resize the embedding matrix to match the new vocabulary size
    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

    # Load the vision tower (image encoder) and grab its image processor
    vision_tower = model.get_vision_tower()
    if not vision_tower.is_loaded:
        vision_tower.load_model(device_map=device_map)
    if device_map != 'auto':
        vision_tower.to(device=device_map, dtype=torch.float16)
    image_processor = vision_tower.image_processor

    # Fall back to 2048 tokens if the config does not specify a max sequence length
    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len
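If you save that simplified builder somewhere importable, for example as eagle/model/custom_builder.py (that's the path the next snippet imports from; the file name is just a suggestion), loading everything becomes a single call:

# Example usage of the simplified builder above
from eagle.model.custom_builder import load_pretrained_model

tokenizer, model, image_processor, context_len = load_pretrained_model(
    "NVEagle/Eagle-X5-13B-Chat"
)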

Given that, if you intend to use it without Gradio, for terminal testing or to create an endpoint, you'll need to decide how you'll receive your images. Once you have the images and their corresponding prompts, you can proceed with the following steps:

import argparse
import torch
from PIL import Image

from eagle.model.custom_builder import load_pretrained_model
from eagle.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from eagle.mm_utils import tokenizer_image_token, process_images
from eagle.conversation import conv_templates


argparser = argparse.ArgumentParser()
argparser.add_argument("--model-path", default="NVEagle/Eagle-X5-7B", type=str)
argparser.add_argument("--conv-mode", type=str, default="default")
argparser.add_argument("--temperature", type=float, default=0.2)
argparser.add_argument("--max-new-tokens", type=int, default=512)
argparser.add_argument("--num_frames", type=int, default=16)
argparser.add_argument("--load-8bit", action="store_true")
argparser.add_argument("--load-4bit", action="store_true")

args = argparser.parse_args()
model_path = args.model_path
conv_mode = args.conv_mode

tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.load_8bit, args.load_4bit)


def generate(images, prompt, temperature=0.2, top_p=0.7):
    num_image_tokens = 0

    if images is not None and len(images) > 0:
        # Every image must correspond to an <image> token in the prompt
        if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
            raise ValueError("Number of images does not match number of <image> tokens in prompt")

        image_sizes = [image.size for image in images]
        images = process_images(images, image_processor, model.config)

        if type(images) is list:
            images = [image.to(model.device, dtype=torch.float16) for image in images]
        else:
            images = images.to(model.device, dtype=torch.float16)

        image_args = {"images": images, "image_sizes": image_sizes}
    else:
        images = None
        image_args = {}

    max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
    max_new_tokens = 512
    do_sample = True if temperature > 0.001 else False

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)

    max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

    if max_new_tokens < 1:
        # The prompt already fills the context window; nothing left to generate
        return

    output_tensor = model.generate(
        inputs=input_ids,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        **image_args
    )

    decoded_text = tokenizer.decode(output_tensor[0], skip_special_tokens=True)
    print("decoded_text", decoded_text)

    torch.cuda.empty_cache()

    return decoded_text



if __name__ == '__main__':
    message = "Explain this photo"
    image = Image.open("image_path.jpg").convert('RGB')  # replace with the path to your image
    prompt = DEFAULT_IMAGE_TOKEN + '\n' + message
    state = conv_templates[conv_mode].copy()
    image_process_mode = "Default"
    box = (prompt, image, image_process_mode)

    state.append_message(state.roles[0], box)
    state.append_message(state.roles[1], None)

    prompt = state.get_prompt()
    images = state.get_images(return_pil=True)

    generate(images, prompt)
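If you save the script above as, say, run_eagle.py (the name is just for illustration) and point Image.open at a real file instead of image_path.jpg, you can run it with python run_eagle.py --model-path NVEagle/Eagle-X5-13B-Chat and it will print the decoded answer.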

One more thing: use an older version of Transformers, such as:
transformers==4.37.2

With more recent versions, you'll likely encounter the following exception:
TypeError: LlavaLlamaForCausalLM.forward() got an unexpected keyword argument 'cache_position'
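
If you're unsure which version is installed, a quick check like this (just a sketch) surfaces the mismatch before you hit that error deep inside generate():

# Sketch: warn if the installed Transformers is newer than the version this
# LLaVA-style code path was written against (4.37.2)
from packaging import version
import transformers

if version.parse(transformers.__version__) > version.parse("4.37.2"):
    print(f"transformers {transformers.__version__} is newer than 4.37.2; "
          "model.generate() may fail with an unexpected 'cache_position' argument.")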
