[feat] support video evaluation for qwen2-vl and add mix-evals-video2text #275

Merged (16 commits) on Sep 24, 2024
12 changes: 0 additions & 12 deletions lmms_eval/models/llava_onevision.py
@@ -126,18 +126,6 @@ def __init__(
overwrite_config["mm_spatial_pool_mode"] = self.mm_spatial_pool_mode
cfg_pretrained = AutoConfig.from_pretrained(self.pretrained)

if cfg_pretrained.architectures[0] == "LlavaLlamaForCausalLM": # Ugly code, only used in vicuna that needs ROPE
if "224" in cfg_pretrained.mm_vision_tower:
least_token_number = self.max_frames_num * (16 // self.mm_spatial_pool_stride) ** 2 + 1000
else:
least_token_number = self.max_frames_num * (24 // self.mm_spatial_pool_stride) ** 2 + 1000

scaling_factor = math.ceil(least_token_number / 4096)
if scaling_factor >= 2:
overwrite_config["rope_scaling"] = {"factor": float(scaling_factor), "type": "linear"}
overwrite_config["max_sequence_length"] = 4096 * scaling_factor
overwrite_config["tokenizer_model_max_length"] = 4096 * scaling_factor

llava_model_args["overwrite_config"] = overwrite_config
try:
# Try to load the model with the multimodal argument
66 changes: 49 additions & 17 deletions lmms_eval/models/qwen2_vl.py
@@ -1,8 +1,12 @@
import base64
from io import BytesIO
from typing import List, Optional, Tuple, Union

import decord
import torch
from accelerate import Accelerator, DistributedType
from loguru import logger as eval_logger
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration

@@ -11,6 +15,11 @@
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model

try:
    from qwen_vl_utils import process_vision_info
except ImportError:
    eval_logger.warning("Failed to import qwen_vl_utils; Please install it via `pip install qwen-vl-utils`")


@register_model("qwen2_vl")
class Qwen2_VL(lmms):
@@ -176,30 +185,54 @@ def _collate(x):
                    contexts[i] = contexts[i].replace("<image>", "")

            messages = []

            if len(visuals) == 0:
                for context in contexts:
                    message = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": [{"type": "text", "text": context}]}]
                    messages.append(message)
            else:
                for _, context in zip(visuals, contexts):
                    message = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": context}]}]
                    messages.append(message)
            processed_visuals = []
            for i, context in enumerate(contexts):
                if "<image>" in context:
                    context = context.replace("<image>", "")

                message = [{"role": "system", "content": "You are a helpful assistant."}]

                if len(visuals) > 0:
                    visual = visuals[i] if i < len(visuals) else None
                    if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")):  # Video file
                        vr = decord.VideoReader(visual)
                        first_frame = vr[0].asnumpy()
                        height, width = first_frame.shape[:2]
                        max_pixels = height * width
                        message.append({"role": "user", "content": [{"type": "video", "video": visual, "fps": 1.0, "max_pixels": max_pixels}, {"type": "text", "text": context}]})
                    elif isinstance(visual, Image.Image):  # Single image
                        base64_image = visual.convert("RGB")
                        buffer = BytesIO()
                        base64_image.save(buffer, format="JPEG")
                        base64_bytes = base64.b64encode(buffer.getvalue())
                        base64_string = base64_bytes.decode("utf-8")
                        message.append({"role": "user", "content": [{"type": "image", "image": f"data:image/jpeg;base64,{base64_string}"}, {"type": "text", "text": context}]})
                    elif isinstance(visual, (list, tuple)) and all(isinstance(v, Image.Image) for v in visual):  # Multiple images
                        image_content = []
                        for v in visual:
                            base64_image = v.convert("RGB")
                            buffer = BytesIO()
                            base64_image.save(buffer, format="JPEG")
                            base64_bytes = base64.b64encode(buffer.getvalue())
                            base64_string = base64_bytes.decode("utf-8")
                            image_content.append({"type": "image", "image": f"data:image/jpeg;base64,{base64_string}"})
                        message.append({"role": "user", "content": image_content + [{"type": "text", "text": context}]})
                    else:
                        message.append({"role": "user", "content": [{"type": "text", "text": context}]})
                else:
                    message.append({"role": "user", "content": [{"type": "text", "text": context}]})

                messages.append(message)

            texts = [self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
            inputs = self.processor(text=texts, images=[visuals], padding=True, return_tensors="pt")
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")

if self.device_map == "auto":
inputs = inputs.to("cuda")
else:
inputs = inputs.to(self.device)

            # preconfigure gen_kwargs with defaults
            if "image_sizes" not in gen_kwargs:
                try:
                    gen_kwargs["image_sizes"] = [visuals[0].size]
                except:
                    gen_kwargs["image_sizes"] = None
            if "max_new_tokens" not in gen_kwargs:
                gen_kwargs["max_new_tokens"] = 128
            if "temperature" not in gen_kwargs:
@@ -221,7 +254,6 @@ def _collate(x):
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
                use_cache=self.use_cache,
                # kwargs=gen_kwargs
            )

            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, cont)]
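For context on the change above: the per-request message list is consumed by qwen_vl_utils.process_vision_info and then by the Hugging Face processor. The following is a minimal standalone sketch of that flow outside the evaluation harness, following the public Qwen2-VL usage pattern; the checkpoint name, video path, and generation settings are illustrative assumptions, not part of this PR.

# Standalone sketch (not part of the diff): run Qwen2-VL on one video prompt.
# "Qwen/Qwen2-VL-7B-Instruct" and "sample.mp4" are placeholder values.
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

model_name = "Qwen/Qwen2-VL-7B-Instruct"
processor = AutoProcessor.from_pretrained(model_name)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "sample.mp4", "fps": 1.0, "max_pixels": 360 * 640},
            {"type": "text", "text": "Describe what happens in this video."},
        ],
    },
]

# apply_chat_template renders the prompt text; process_vision_info loads the frames.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(model.device)

# Trim the echoed prompt tokens before decoding, as the harness code above does.
output_ids = model.generate(**inputs, max_new_tokens=128)
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])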
16 changes: 16 additions & 0 deletions lmms_eval/tasks/mix_evals/_default_template_yaml
@@ -0,0 +1,16 @@
dataset_path: lmms-lab/MixEvals_Video2Text
dataset_kwargs:
  token: True
  video: True
  cache_dir: mix_evals_video2text
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
metadata:
  modality: video
  version: 0.0
  gpt_eval_model_name: "gpt-4o-mini"
5 changes: 5 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml
@@ -0,0 +1,5 @@
group: mix_evals_video2text
task:
# - mix_evals_video2text_openconv
- mix_evals_video2text_mc
- mix_evals_video2text_freeform
22 changes: 22 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml
@@ -0,0 +1,22 @@
dataset_name: "video2text_closeended_free-form"
task: "mix_evals_video2text_freeform"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text
doc_to_target: "{{target}}"
process_results: !function utils.mix_evals_video2text_process_results_freeform
metric_list:
  - metric: gpt_eval
    aggregation: !function utils.mix_evals_video2text_gpt_eval
    higher_is_better: true

include: _default_template_yaml

lmms_eval_specific_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer the question using a single word or phrase."
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video with a short phrase."
    post_prompt: ""
31 changes: 31 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml
@@ -0,0 +1,31 @@
include: _default_template_yaml
dataset_name: "video2text_closeended_multiple-choice"
task: "mix_evals_video2text_mc"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text
doc_to_target: "{{target}}"

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true

lmms_eval_specific_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer with the option's letter from the given choices directly."
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer with the option's letter from the given choices directly."
22 changes: 22 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml
@@ -0,0 +1,22 @@
include: _default_template_yaml
dataset_name: "video2text_openended"
task: "mix_evals_video2text_openconv"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text_open_convs
doc_to_target: ""
process_results: !function utils.mix_evals_video2text_process_results_open_convs

metric_list:
  - metric: submission
    aggregation: !function utils.mix_evals_video2text_aggregate_gen
    higher_is_better: true

lmms_eval_specific_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""