How to use multiple GPUs for inference? #12

Below is an example of running Ovis1.5-Llama3-8B on two GPUs, splitting the LLM layers across devices:

import torch
from PIL import Image
from transformers import AutoModelForCausalLM
device_map = {
"visual_tokenizer": 0,
"vte": 0,
"llm.model.embed_tokens": 0,
"llm.model.norm": 0,
"llm.lm_head": 0,
"llm.model.layers.0": 0,
"llm.model.layers.1": 0,
"llm.model.layers.2": 0,
"llm.model.layers.3": 0,
"llm.model.layers.4": 0,
"llm.model.layers.5": 0,
"llm.model.layers.6": 0,
"llm.model.layers.7": 0,
"llm.model.layers.8": 0,
"llm.model.layers.9": 0,
"llm.model.layers.10": 0,
"llm.model.layers.11": 0,
"llm.model.layers.12": 0,
"llm.model.layers.13": 0,
"llm.model.layers.14": 1,
"llm.model.layers.15": 1,
"llm.model.layers.16": 1,
"llm.model.layers.17": 1,
"llm.model.layers.18": 1,
"llm.model.layers.19": 1,
"llm.model.layers.20": 1,
"llm.model.layers.21": 1,
"llm.model.layers.22": 1,
"llm.model.layers.23": 1,
"llm.model.layers.24": 1,
"llm.model.layers.25": 1,
"llm.model.layers.26": 1,
"llm.model.layers.27": 1,
"llm.model.layers.28": 1,
"llm.model.layers.29": 1,
"llm.model.layers.30": 1,
"llm.model.layers.31": 1
}
# load model
model = AutoModelForCausalLM.from_pretrained("AIDC-AI/Ovis1.5-Llama3-8B",
torch_dtype=torch.bfloat16,
multimodal_max_length=8192,
device_map=device_map,
trust_remote_code=True)
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
conversation_formatter = model.get_conversation_formatter()
# enter image path and prompt
image_path = input("Enter image path: ")
image = Image.open(image_path)
text = input("Enter prompt: ")
query = f'<image>\n{text}'
prompt, input_ids = conversation_formatter.format_query(query)
input_ids = torch.unsqueeze(input_ids, dim=0).cuda()
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id).cuda()
pixel_values = [visual_tokenizer.preprocess_image(image).to(
dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]
# generate output
with torch.inference_mode():
    gen_kwargs = dict(
        max_new_tokens=1024,
        do_sample=False,
        top_p=None,
        top_k=None,
        temperature=None,
        repetition_penalty=None,
        eos_token_id=model.generation_config.eos_token_id,
        pad_token_id=text_tokenizer.pad_token_id,
        use_cache=True
    )
    output_ids = model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0]
    output = text_tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Output: {output}')
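
After loading with an explicit map, it can be worth verifying where the weights actually landed. Below is a minimal sanity check, assuming transformers populates hf_device_map as it normally does when a device_map is passed to from_pretrained:

# Print the final module-to-device placement recorded by transformers.
for module_name, device in model.hf_device_map.items():
    print(f'{module_name} -> {device}')
# The visual tokenizer and the LLM embeddings should end up on the same GPU;
# mismatched placement is a common cause of "tensors on different devices" errors.
print('visual_tokenizer device:', visual_tokenizer.device)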

What's the script for Ovis1.5-Gemma2-9B?

Did you solve it?

Below is an example of running Ovis1.5-Gemma2-9B on two GPUs:

import torch
from PIL import Image
from transformers import AutoModelForCausalLM
device_map = {
"visual_tokenizer": 0,
"vte": 0,
"llm.model.embed_tokens": 0,
"llm.model.norm": 0,
"llm.lm_head": 0,
"llm.model.layers.0": 0,
"llm.model.layers.1": 0,
"llm.model.layers.2": 0,
"llm.model.layers.3": 0,
"llm.model.layers.4": 0,
"llm.model.layers.5": 0,
"llm.model.layers.6": 0,
"llm.model.layers.7": 0,
"llm.model.layers.8": 0,
"llm.model.layers.9": 0,
"llm.model.layers.10": 0,
"llm.model.layers.11": 0,
"llm.model.layers.12": 0,
"llm.model.layers.13": 0,
"llm.model.layers.14": 0,
"llm.model.layers.15": 0,
"llm.model.layers.16": 0,
"llm.model.layers.17": 0,
"llm.model.layers.18": 0,
"llm.model.layers.19": 0,
"llm.model.layers.20": 1,
"llm.model.layers.21": 1,
"llm.model.layers.22": 1,
"llm.model.layers.23": 1,
"llm.model.layers.24": 1,
"llm.model.layers.25": 1,
"llm.model.layers.26": 1,
"llm.model.layers.27": 1,
"llm.model.layers.28": 1,
"llm.model.layers.29": 1,
"llm.model.layers.30": 1,
"llm.model.layers.31": 1,
"llm.model.layers.32": 1,
"llm.model.layers.33": 1,
"llm.model.layers.34": 1,
"llm.model.layers.35": 1,
"llm.model.layers.36": 1,
"llm.model.layers.37": 1,
"llm.model.layers.38": 1,
"llm.model.layers.39": 1,
"llm.model.layers.40": 1,
"llm.model.layers.41": 1
}
# load model
model = AutoModelForCausalLM.from_pretrained("AIDC-AI/Ovis1.5-Gemma2-9B",
torch_dtype=torch.bfloat16,
multimodal_max_length=8192,
device_map=device_map,
trust_remote_code=True)
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
conversation_formatter = model.get_conversation_formatter()
# enter image path and prompt
image_path = input("Enter image path: ")
image = Image.open(image_path)
text = input("Enter prompt: ")
query = f'<image>\n{text}'
prompt, input_ids = conversation_formatter.format_query(query)
input_ids = torch.unsqueeze(input_ids, dim=0).cuda()
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id).cuda()
pixel_values = [visual_tokenizer.preprocess_image(image).to(
dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]
# generate output
with torch.inference_mode():
    gen_kwargs = dict(
        max_new_tokens=1024,
        do_sample=False,
        top_p=None,
        top_k=None,
        temperature=None,
        repetition_penalty=None,
        eos_token_id=model.generation_config.eos_token_id,
        pad_token_id=text_tokenizer.pad_token_id,
        use_cache=True
    )
    output_ids = model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0]
    output = text_tokenizer.decode(output_ids, skip_special_tokens=True)
    print(f'Output: {output}')
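
Once the model is loaded, a quick way to check that the split is roughly balanced is to look at how much memory each GPU holds. A small sketch; the exact numbers depend on dtype and layer sizes:

# Report allocated memory per GPU to confirm the layer split is balanced.
for gpu_id in range(torch.cuda.device_count()):
    allocated_gib = torch.cuda.memory_allocated(gpu_id) / 1024**3
    print(f'cuda:{gpu_id} allocated: {allocated_gib:.1f} GiB')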

@cyj95, @AntyRia, here's the code; try running this:

from dataclasses import field, dataclass
from typing import Optional, Union, List
import torch
from PIL import Image
from ovis.model.modeling_ovis import Ovis
from ovis.util.constants import IMAGE_TOKEN
device_map = {
"visual_tokenizer": 0,
"vte": 0,
"llm.model.embed_tokens": 0,
"llm.model.norm": 0,
"llm.lm_head": 0,
"llm.model.layers.0": 0,
"llm.model.layers.1": 0,
"llm.model.layers.2": 0,
"llm.model.layers.3": 0,
"llm.model.layers.4": 0,
"llm.model.layers.5": 0,
"llm.model.layers.6": 0,
"llm.model.layers.7": 0,
"llm.model.layers.8": 0,
"llm.model.layers.9": 0,
"llm.model.layers.10": 0,
"llm.model.layers.11": 0,
"llm.model.layers.12": 0,
"llm.model.layers.13": 0,
"llm.model.layers.14": 0,
"llm.model.layers.15": 0,
"llm.model.layers.16": 0,
"llm.model.layers.17": 0,
"llm.model.layers.18": 0,
"llm.model.layers.19": 0,
"llm.model.layers.20": 1,
"llm.model.layers.21": 1,
"llm.model.layers.22": 1,
"llm.model.layers.23": 1,
"llm.model.layers.24": 1,
"llm.model.layers.25": 1,
"llm.model.layers.26": 1,
"llm.model.layers.27": 1,
"llm.model.layers.28": 1,
"llm.model.layers.29": 1,
"llm.model.layers.30": 1,
"llm.model.layers.31": 1,
"llm.model.layers.32": 1,
"llm.model.layers.33": 1,
"llm.model.layers.34": 1,
"llm.model.layers.35": 1,
"llm.model.layers.36": 1,
"llm.model.layers.37": 1,
"llm.model.layers.38": 1,
"llm.model.layers.39": 1,
"llm.model.layers.40": 1,
"llm.model.layers.41": 1
}
@dataclass
class RunnerArguments:
    model_path: str
    max_new_tokens: int = field(default=512)
    do_sample: bool = field(default=False)
    top_p: Optional[float] = field(default=None)
    top_k: Optional[int] = field(default=None)
    temperature: Optional[float] = field(default=None)
    max_partition: int = field(default=9)
class OvisRunner:
    def __init__(self, args: RunnerArguments):
        self.model_path = args.model_path
        self.dtype = torch.bfloat16
        self.device = torch.cuda.current_device()
        self.model = Ovis.from_pretrained(self.model_path, torch_dtype=self.dtype, multimodal_max_length=8192, device_map=device_map)
        self.model = self.model.eval()
        self.eos_token_id = self.model.generation_config.eos_token_id
        self.text_tokenizer = self.model.get_text_tokenizer()
        self.pad_token_id = self.text_tokenizer.pad_token_id
        self.visual_tokenizer = self.model.get_visual_tokenizer()
        self.conversation_formatter = self.model.get_conversation_formatter()
        self.image_placeholder = IMAGE_TOKEN
        self.max_partition = args.max_partition
        self.gen_kwargs = dict(
            max_new_tokens=args.max_new_tokens,
            do_sample=args.do_sample,
            top_p=args.top_p,
            top_k=args.top_k,
            temperature=args.temperature,
            repetition_penalty=None,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            use_cache=True
        )

    def preprocess(self, inputs: List[Union[Image.Image, str]]):
        # for single image and single text inputs, ensure the image comes first
        if len(inputs) == 2 and isinstance(inputs[0], str) and isinstance(inputs[1], Image.Image):
            inputs = reversed(inputs)
        # build query
        query = ''
        images = []
        for data in inputs:
            if isinstance(data, Image.Image):
                query += self.image_placeholder + '\n'
                images.append(data)
            elif isinstance(data, str):
                query += data.replace(self.image_placeholder, '')
            elif data is not None:
                raise RuntimeError(f'Invalid input type, expected `PIL.Image.Image` or `str`, but got {type(data)}')
        # format conversation
        prompt, input_ids, pixel_values = self.model.preprocess_inputs(
            query, images, max_partition=self.max_partition)
        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
        input_ids = input_ids.unsqueeze(0).to(device=self.device)
        attention_mask = attention_mask.unsqueeze(0).to(device=self.device)
        if pixel_values is not None:
            pixel_values = [pixel_values.to(device=self.device, dtype=self.dtype)]
        else:
            pixel_values = [None]
        return prompt, input_ids, attention_mask, pixel_values

    def run(self, inputs: List[Union[Image.Image, str]]):
        prompt, input_ids, attention_mask, pixel_values = self.preprocess(inputs)
        output_ids = self.model.generate(
            input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            **self.gen_kwargs
        )
        output = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        input_token_len = input_ids.shape[1]
        output_token_len = output_ids.shape[1]
        response = dict(
            prompt=prompt,
            output=output,
            prompt_tokens=input_token_len,
            total_tokens=input_token_len + output_token_len
        )
        return response
if __name__ == '__main__':
    text = '<prompt>'            # replace with your prompt
    image_path = '<image_path>'  # replace with the path to your image
    model_path = '<model_path>'  # replace with the local or hub path of the Ovis model
    runner_args = RunnerArguments(model_path=model_path)
    runner = OvisRunner(runner_args)
    image = Image.open(image_path)
    response = runner.run([image, text])
    print(response['output'])
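
Since preprocess accepts any mix of PIL images and strings, the runner can in principle be called with more than one image per query. A hedged usage sketch with hypothetical file names; whether a given checkpoint handles multi-image prompts well is a separate question:

# Hypothetical multi-input call: two images followed by a question.
image_a = Image.open('image_a.jpg')  # placeholder path
image_b = Image.open('image_b.jpg')  # placeholder path
response = runner.run([image_a, image_b, 'What differs between these two images?'])
print(response['output'])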

Hi, this follows the device map specified in the previous posts; for other setups with more GPUs, edit the layer_device_map accordingly.
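
As one way of editing the layer_device_map, here is a sketch (not taken from the linked code) that spreads the decoder layers evenly over an arbitrary number of GPUs while pinning the vision and embedding modules to GPU 0; the module names and layer counts are taken from the maps earlier in this thread, so adapt them to your checkpoint:

# Sketch: spread num_layers decoder layers evenly over num_gpus devices,
# keeping the multimodal/embedding/head modules on GPU 0 as in the maps above.
def build_device_map(num_layers: int, num_gpus: int) -> dict:
    device_map = {
        "visual_tokenizer": 0,
        "vte": 0,
        "llm.model.embed_tokens": 0,
        "llm.model.norm": 0,
        "llm.lm_head": 0,
    }
    per_gpu = (num_layers + num_gpus - 1) // num_gpus  # ceiling division
    for i in range(num_layers):
        device_map[f"llm.model.layers.{i}"] = min(i // per_gpu, num_gpus - 1)
    return device_map

# Example: 42 decoder layers (as in the Gemma2-9B maps above) over 2 GPUs.
device_map = build_device_map(num_layers=42, num_gpus=2)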

What's the script for Ovis1.6-Gemma2-9B? How do I deploy it on two cards?

The code is available at: https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B/discussions/6

I want to use multiple GPUs for inference, and I use device_map='auto' to load the model. However, I always run into this error: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
Can you help me with that? Thanks a lot!