Hey, while running the 4-bit quantized model from https://huggingface.co/ThetaCursed/Ovis1.6-Gemma2-9B-bnb-4bit I am getting the following error:
{ "name": "RuntimeError", "message": "self and mat2 must have the same dtype, but got BFloat16 and Byte", "stack": "--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) Cell In[3], line 35 23 with torch.inference_mode(): 24 gen_kwargs = dict( 25 max_new_tokens=1024, 26 do_sample=False, (...) 33 use_cache=True 34 ) ---> 35 output_ids = model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0] 36 output = text_tokenizer.decode(output_ids, skip_special_tokens=True) 37 print(f'Output:\ {output}') File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:588, in Ovis.generate(self, inputs, **kwargs) 583 def generate( 584 self, 585 inputs: Optional[torch.Tensor] = None, 586 **kwargs 587 ) -> Union[GenerateOutput, torch.LongTensor]: --> 588 _, inputs_embeds, labels, attention_mask = self.merge_multimodal( 589 text_input_ids=inputs, 590 text_attention_masks=kwargs.pop('attention_mask'), 591 text_labels=None, 592 pixel_values=kwargs.pop('pixel_values'), 593 left_padding=True 594 ) 595 if getattr(self.generation_config, 'cache_implementation') == 'hybrid': # mainly for Gemma2 596 kwargs['past_key_values'] = self._get_hybrid_cache_for_llm( 597 getattr(kwargs, \"num_beams\", inputs_embeds.shape[0]), kwargs['max_new_tokens'] + inputs_embeds.shape[-2]) File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:385, in Ovis.merge_multimodal(self, text_input_ids, text_attention_masks, text_labels, pixel_values, left_padding) 383 num_images = [x.shape[0] if x is not None else 0 for x in pixel_values] 384 if sum(num_images) > 0: --> 385 visual_tokens = self.visual_tokenizer(torch.cat([x for x in pixel_values if x is not None], dim=0)) 386 visual_embeds = torch.split(self.get_vte()(visual_tokens).to(dtype=self.dtype, device=input_device), 387 split_size_or_sections=num_images, dim=0) 388 visual_input_ids = torch.split(torch.argmax(visual_tokens, dim=-1).to(device=input_device), 389 split_size_or_sections=num_images, dim=0) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs) 1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(*args, **kwargs) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs) 1515 # If we don't have any hooks, we want to skip the rest of the logic in 1516 # this function, and just call forward. 
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(*args, **kwargs) 1522 try: 1523 result = None File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs) 168 output = module._old_forward(*args, **kwargs) 169 else: --> 170 output = module._old_forward(*args, **kwargs) 171 return module._hf_hook.post_forward(module, output) File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:221, in BaseVisualTokenizer.forward(self, pixel_values) 220 def forward(self, pixel_values) -> torch.Tensor: # [BatchSize, ImageShape] -> [BatchSize, #Token, VocabSize] --> 221 features = self.encode(pixel_values) 222 logits = self.head(features) 223 tokens = self.tokenize(logits) File ~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis1.6-Gemma2-9B/15954d60650e5d6f3cfebcb9407e809b4c5019e1/modeling_ovis.py:196, in BaseVisualTokenizer.encode(self, pixel_values) 195 def encode(self, pixel_values): --> 196 output = self.backbone(pixel_values, output_hidden_states=True, return_dict=True) 197 features = output.hidden_states[-1] 198 if self.config.drop_cls_token: File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs) 1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(*args, **kwargs) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs) 1515 # If we don't have any hooks, we want to skip the rest of the logic in 1516 # this function, and just call forward. 1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(*args, **kwargs) 1522 try: 1523 result = None File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs) 168 output = module._old_forward(*args, **kwargs) 169 else: --> 170 output = module._old_forward(*args, **kwargs) 171 return module._hf_hook.post_forward(module, output) File ~/.local/lib/python3.8/site-packages/transformers/models/siglip/modeling_siglip.py:1189, in SiglipVisionModel.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding) 1165 r\"\"\" 1166 Returns: 1167 (...) 
1185 >>> pooled_output = outputs.pooler_output # pooled features 1186 ```\"\"\" 1187 return_dict = return_dict if return_dict is not None else self.config.use_return_dict -> 1189 return self.vision_model( 1190 pixel_values=pixel_values, 1191 output_attentions=output_attentions, 1192 output_hidden_states=output_hidden_states, 1193 return_dict=return_dict, 1194 interpolate_pos_encoding=interpolate_pos_encoding, 1195 ) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs) 1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(*args, **kwargs) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs) 1515 # If we don't have any hooks, we want to skip the rest of the logic in 1516 # this function, and just call forward. 1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(*args, **kwargs) 1522 try: 1523 result = None File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs) 168 output = module._old_forward(*args, **kwargs) 169 else: --> 170 output = module._old_forward(*args, **kwargs) 171 return module._hf_hook.post_forward(module, output) File ~/.local/lib/python3.8/site-packages/transformers/models/siglip/modeling_siglip.py:1100, in SiglipVisionTransformer.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding) 1097 last_hidden_state = encoder_outputs[0] 1098 last_hidden_state = self.post_layernorm(last_hidden_state) -> 1100 pooler_output = self.head(last_hidden_state) if self.use_head else None 1101 if not return_dict: 1102 return (last_hidden_state, pooler_output) + encoder_outputs[1:] File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs) 1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(*args, **kwargs) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs) 1515 # If we don't have any hooks, we want to skip the rest of the logic in 1516 # this function, and just call forward. 
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(*args, **kwargs) 1522 try: 1523 result = None File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs) 168 output = module._old_forward(*args, **kwargs) 169 else: --> 170 output = module._old_forward(*args, **kwargs) 171 return module._hf_hook.post_forward(module, output) File ~/.local/lib/python3.8/site-packages/transformers/models/siglip/modeling_siglip.py:1127, in SiglipMultiheadAttentionPoolingHead.forward(self, hidden_state) 1124 batch_size = hidden_state.shape[0] 1125 probe = self.probe.repeat(batch_size, 1, 1) -> 1127 hidden_state = self.attention(probe, hidden_state, hidden_state)[0] 1129 residual = hidden_state 1130 hidden_state = self.layernorm(hidden_state) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs) 1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1510 else: -> 1511 return self._call_impl(*args, **kwargs) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs) 1515 # If we don't have any hooks, we want to skip the rest of the logic in 1516 # this function, and just call forward. 1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1518 or _global_backward_pre_hooks or _global_backward_hooks 1519 or _global_forward_hooks or _global_forward_pre_hooks): -> 1520 return forward_call(*args, **kwargs) 1522 try: 1523 result = None File ~/.local/lib/python3.8/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs) 168 output = module._old_forward(*args, **kwargs) 169 else: --> 170 output = module._old_forward(*args, **kwargs) 171 return module._hf_hook.post_forward(module, output) File ~/.local/lib/python3.8/site-packages/torch/nn/modules/activation.py:1241, in MultiheadAttention.forward(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal) 1227 attn_output, attn_output_weights = F.multi_head_attention_forward( 1228 query, key, value, self.embed_dim, self.num_heads, 1229 self.in_proj_weight, self.in_proj_bias, (...) 
1238 average_attn_weights=average_attn_weights, 1239 is_causal=is_causal) 1240 else: -> 1241 attn_output, attn_output_weights = F.multi_head_attention_forward( 1242 query, key, value, self.embed_dim, self.num_heads, 1243 self.in_proj_weight, self.in_proj_bias, 1244 self.bias_k, self.bias_v, self.add_zero_attn, 1245 self.dropout, self.out_proj.weight, self.out_proj.bias, 1246 training=self.training, 1247 key_padding_mask=key_padding_mask, 1248 need_weights=need_weights, 1249 attn_mask=attn_mask, 1250 average_attn_weights=average_attn_weights, 1251 is_causal=is_causal) 1252 if self.batch_first and is_batched: 1253 return attn_output.transpose(1, 0), attn_output_weights File ~/.local/lib/python3.8/site-packages/torch/nn/functional.py:5449, in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal) 5446 attn_output = torch.bmm(attn_output_weights, v) 5448 attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) -> 5449 attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 5450 attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) 5452 # optionally average attention weights over heads RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Byte" }
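For context on what the error means: `Byte` (uint8) is the packed storage dtype bitsandbytes uses for 4-bit weights. The SigLIP pooling head inside the visual tokenizer is a plain `nn.MultiheadAttention`, and `F.multi_head_attention_forward` passes `out_proj.weight` straight into `linear()` as a raw tensor, so the packed uint8 weight never goes through the bitsandbytes `Linear4bit` forward that would dequantize it, hence the BFloat16 vs Byte matmul. One possible workaround, shown below as an untested sketch (the repo ID and the skipped module names are my guesses based on this traceback, not an official fix), is to quantize the original checkpoint on the fly while excluding the visual tokenizer from quantization:

```python
# Untested sketch: load the original checkpoint with on-the-fly bnb 4-bit
# quantization, but keep the visual tokenizer (and visual embedding table)
# in bf16 so nn.MultiheadAttention's out_proj weight is never packed to uint8.
# "visual_tokenizer" / "vte" are module-name guesses taken from this traceback.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # Despite the name, this skip list is also honored for 4-bit loading
    # in recent transformers releases.
    llm_int8_skip_modules=["visual_tokenizer", "vte"],
)

model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis1.6-Gemma2-9B",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
```

The prepacked ThetaCursed bnb-4bit checkpoint may already have the vision weights quantized, in which case re-quantizing from the original repo as above (or using the GPTQ releases mentioned below) is probably the simpler route.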
I have encountered the same problem. Is there a solution now?
We've released quantized versions of Ovis1.6: Ovis1.6-Gemma2-9B-GPTQ-Int4 and Ovis1.6-Llama3.2-3B-GPTQ-Int4. Feel free to try them out and share your feedback!
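For anyone landing here, a minimal loading sketch for the GPTQ release, assuming it exposes the same `trust_remote_code` API as the original Ovis1.6 model card (the `get_text_tokenizer()` / `get_visual_tokenizer()` helpers come from that card; the repo ID, dtype, and device map below are assumptions, so check the GPTQ repo's README for the exact dependencies such as auto-gptq):

```python
# Sketch based on the original Ovis1.6 model card; the GPTQ repo ID and
# float16 dtype are assumptions, not verified against the released README.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
```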