running on 4bit model #29

Open
haiderasad opened this issue Oct 4, 2024 · 2 comments

haiderasad commented Oct 4, 2024

Hey, while running the 4-bit quantized model from https://huggingface.co/ThetaCursed/Ovis1.6-Gemma2-9B-bnb-4bit, I am getting the following error:

{
	"name": "RuntimeError",
	"message": "self and mat2 must have the same dtype, but got BFloat16 and Byte",
	"stack": "---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[3], line 35
     23 with torch.inference_mode():
     24     gen_kwargs = dict(
     25         max_new_tokens=1024,
     26         do_sample=False,
   (...)
     33         use_cache=True
     34     )
---> 35     output_ids = model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0]
     36     output = text_tokenizer.decode(output_ids, skip_special_tokens=True)
     37     print(f'Output:\n{output}')

File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:588, in Ovis.generate(self, inputs, **kwargs)
    583 def generate(
    584     self,
    585     inputs: Optional[torch.Tensor] = None,
    586     **kwargs
    587 ) -> Union[GenerateOutput, torch.LongTensor]:
--> 588     _, inputs_embeds, labels, attention_mask = self.merge_multimodal(
    589         text_input_ids=inputs,
    590         text_attention_masks=kwargs.pop('attention_mask'),
    591         text_labels=None,
    592         pixel_values=kwargs.pop('pixel_values'),
    593         left_padding=True
    594     )
    595     if getattr(self.generation_config, 'cache_implementation') == 'hybrid':  # mainly for Gemma2
    596         kwargs['past_key_values'] = self._get_hybrid_cache_for_llm(
    597             getattr(kwargs, "num_beams", inputs_embeds.shape[0]), kwargs['max_new_tokens'] + inputs_embeds.shape[-2])

File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:385, in Ovis.merge_multimodal(self, text_input_ids, text_attention_masks, text_labels, pixel_values, left_padding)
    383 num_images = [x.shape[0] if x is not None else 0 for x in pixel_values]
    384 if sum(num_images) > 0:
--> 385     visual_tokens = self.visual_tokenizer(torch.cat([x for x in pixel_values if x is not None], dim=0))
    386     visual_embeds = torch.split(self.get_vte()(visual_tokens).to(dtype=self.dtype, device=input_device),
    387                                 split_size_or_sections=num_images, dim=0)
    388     visual_input_ids = torch.split(torch.argmax(visual_tokens, dim=-1).to(device=input_device),
    389                                    split_size_or_sections=num_images, dim=0)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    168         output = module._old_forward(*args, **kwargs)
    169 else:
--> 170     output = module._old_forward(*args, **kwargs)
    171 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:221, in BaseVisualTokenizer.forward(self, pixel_values)
    220 def forward(self, pixel_values) -> torch.Tensor:  # [BatchSize, ImageShape] -> [BatchSize, #Token, VocabSize]
--> 221     features = self.encode(pixel_values)
    222     logits = self.head(features)
    223     tokens = self.tokenize(logits)

File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:196, in BaseVisualTokenizer.encode(self, pixel_values)
    195 def encode(self, pixel_values):
--> 196     output = self.backbone(pixel_values, output_hidden_states=True, return_dict=True)
    197     features = output.hidden_states[-1]
    198     if self.config.drop_cls_token:

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    168         output = module._old_forward(*args, **kwargs)
    169 else:
--> 170     output = module._old_forward(*args, **kwargs)
    171 return module._hf_hook.post_forward(module, output)

File ~/.local/lib/python3.8/site-packages/transformers/models/siglip/modeling_siglip.py:1189, in SiglipVisionModel.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
   1165 r"""
   1166 Returns:
   1167 
   (...)
   1185 >>> pooled_output = outputs.pooler_output  # pooled features
   1186 ```"""
   1187 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1189 return self.vision_model(
   1190     pixel_values=pixel_values,
   1191     output_attentions=output_attentions,
   1192     output_hidden_states=output_hidden_states,
   1193     return_dict=return_dict,
   1194     interpolate_pos_encoding=interpolate_pos_encoding,
   1195 )

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    168         output = module._old_forward(*args, **kwargs)
    169 else:
--> 170     output = module._old_forward(*args, **kwargs)
    171 return module._hf_hook.post_forward(module, output)

File ~/.local/lib/python3.8/site-packages/transformers/models/siglip/modeling_siglip.py:1100, in SiglipVisionTransformer.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
   1097 last_hidden_state = encoder_outputs[0]
   1098 last_hidden_state = self.post_layernorm(last_hidden_state)
-> 1100 pooler_output = self.head(last_hidden_state) if self.use_head else None
   1101 if not return_dict:
   1102     return (last_hidden_state, pooler_output) + encoder_outputs[1:]

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    168         output = module._old_forward(*args, **kwargs)
    169 else:
--> 170     output = module._old_forward(*args, **kwargs)
    171 return module._hf_hook.post_forward(module, output)

File ~/.local/lib/python3.8/site-packages/transformers/models/siglip/modeling_siglip.py:1127, in SiglipMultiheadAttentionPoolingHead.forward(self, hidden_state)
   1124 batch_size = hidden_state.shape[0]
   1125 probe = self.probe.repeat(batch_size, 1, 1)
-> 1127 hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
   1129 residual = hidden_state
   1130 hidden_state = self.layernorm(hidden_state)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    168         output = module._old_forward(*args, **kwargs)
    169 else:
--> 170     output = module._old_forward(*args, **kwargs)
    171 return module._hf_hook.post_forward(module, output)

File ~/.local/lib/python3.8/site-packages/torch/nn/modules/activation.py:1241, in MultiheadAttention.forward(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal)
   1227     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1228         query, key, value, self.embed_dim, self.num_heads,
   1229         self.in_proj_weight, self.in_proj_bias,
   (...)
   1238         average_attn_weights=average_attn_weights,
   1239         is_causal=is_causal)
   1240 else:
-> 1241     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1242         query, key, value, self.embed_dim, self.num_heads,
   1243         self.in_proj_weight, self.in_proj_bias,
   1244         self.bias_k, self.bias_v, self.add_zero_attn,
   1245         self.dropout, self.out_proj.weight, self.out_proj.bias,
   1246         training=self.training,
   1247         key_padding_mask=key_padding_mask,
   1248         need_weights=need_weights,
   1249         attn_mask=attn_mask,
   1250         average_attn_weights=average_attn_weights,
   1251         is_causal=is_causal)
   1252 if self.batch_first and is_batched:
   1253     return attn_output.transpose(1, 0), attn_output_weights

File ~/.local/lib/python3.8/site-packages/torch/nn/functional.py:5449, in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5446 attn_output = torch.bmm(attn_output_weights, v)
   5448 attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
-> 5449 attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
   5450 attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
   5452 # optionally average attention weights over heads

RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Byte"
}
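
From the stack, the bnb-4bit export appears to have quantized the weights inside the visual tokenizer's SigLIP pooling head as well, so F.linear ends up multiplying bfloat16 activations against a raw uint8 (Byte) weight. A possible workaround (untested, not the official path) is to quantize the full-precision checkpoint on the fly and keep the vision modules out of the 4-bit conversion. Minimal sketch, assuming the attribute names `visual_tokenizer` and `vte` seen in the traceback:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Workaround sketch: quantize the LLM weights to 4-bit on the fly, but keep the
# visual tokenizer (SigLIP backbone + pooling head) and the visual token
# embedding table in bf16, so F.linear never sees raw uint8 weights.
# "visual_tokenizer" and "vte" are the attribute names suggested by the
# traceback above; adjust them if the real module names differ.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_skip_modules=["visual_tokenizer", "vte"],
)

model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis1.6-Gemma2-9B",  # full-precision checkpoint, quantized at load time
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
```
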
@wangyang581

I have encountered the same problem. Is there a solution now?

@runninglsy
Collaborator

We've released quantized versions of Ovis1.6: Ovis1.6-Gemma2-9B-GPTQ-Int4 and Ovis1.6-Llama3.2-3B-GPTQ-Int4. Feel free to try them out and share your feedback!
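
For convenience, a minimal loading sketch (hedged: the repo id under AIDC-AI and the get_text_tokenizer()/get_visual_tokenizer() helpers follow the usual Ovis usage; the model cards are the authoritative reference, and a CUDA GPU with the GPTQ kernels installed is assumed):

```python
import torch
from transformers import AutoModelForCausalLM

# Hedged sketch: load the GPTQ-Int4 release through transformers' built-in
# GPTQ support; see the model card for exact requirements and versions.
model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="cuda:0",
)
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
```
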
