diff --git a/README.md b/README.md index b116e3a5..9f6eaa67 100755 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ cd lmms-eval pip install -e . ``` -If you wanted to test llava, you will have to clone their repo from [LLaVA](https://github.com/haotian-liu/LLaVA) and +If you want to test LLaVA, you will have to clone their repo from [LLaVA](https://github.com/haotian-liu/LLaVA) and ```bash # for llava 1.5 # git clone https://github.com/haotian-liu/LLaVA @@ -69,7 +69,7 @@ You can check the [environment install script](miscs/repr_scripts.sh) and [torch -If you want to test on caption dataset such as `coco`, `refcoco`, and `nocaps`, you will need to have `java==1.8.0 ` to let pycocoeval api to work. If you don't have it, you can install by using conda +If you want to test on caption dataset such as `coco`, `refcoco`, and `nocaps`, you will need to have `java==1.8.0` to let pycocoeval api to work. If you don't have it, you can install by using conda ``` conda install openjdk=8 ``` @@ -93,6 +93,11 @@ We also provide the raw data exported from Weights & Biases for the detailed res
+If you want to test [VILA](https://github.com/NVlabs/VILA), you should install the following dependencies: + +```bash +pip install s2wrapper@git+https://github.com/bfshi/scaling_on_scales +``` Our Development will be continuing on the main branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub. diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py index 2cecfe22..f77065e8 100755 --- a/lmms_eval/api/samplers.py +++ b/lmms_eval/api/samplers.py @@ -37,9 +37,7 @@ def get_context(self, doc, num_fewshot): + ( str(self.doc_to_target(doc)[0]) if type(self.doc_to_target(doc)) is list - else self.doc_to_target(doc) - if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) - else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) + else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) ) for doc in selected_docs ] diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index 4acc38a9..de1b533b 100755 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -41,7 +41,7 @@ "video_llava": "VideoLLaVA", "vila": "VILA", "xcomposer2_4KHD": "XComposer2_4KHD", - "xcomposer2d5": "XComposer2D5" + "xcomposer2d5": "XComposer2D5", } for model_name, model_class in AVAILABLE_MODELS.items(): diff --git a/lmms_eval/models/llava_hf.py b/lmms_eval/models/llava_hf.py index af0b554e..3c5791fe 100644 --- a/lmms_eval/models/llava_hf.py +++ b/lmms_eval/models/llava_hf.py @@ -320,7 +320,7 @@ def _collate(x): pad_token_id=self.tokenizer.eos_token_id, eos_token_id=self.specified_eot_token_id, ) - cont = cont[:, inputs["input_ids"].shape[-1]:] + cont = cont[:, inputs["input_ids"].shape[-1] :] except Exception as e: eval_logger.error(f"Error {e} in generating") cont = "" diff --git a/lmms_eval/models/llava_vid.py b/lmms_eval/models/llava_vid.py index bdf7fb37..c1e1fd38 100755 --- a/lmms_eval/models/llava_vid.py +++ b/lmms_eval/models/llava_vid.py @@ -28,11 +28,13 @@ try: from llavavid.model.language_model.llava_qwen import LlavaQwenConfig + AutoConfig.register("llava_qwen", LlavaQwenConfig) except ImportError: eval_logger.debug("No Qwen for llava vid") from llavavid.model.language_model.llava_llama import LlavaConfig + AutoConfig.register("llava_llama", LlavaConfig) diff --git a/lmms_eval/models/mantis.py b/lmms_eval/models/mantis.py index fe1bb0b2..7b0a1569 100644 --- a/lmms_eval/models/mantis.py +++ b/lmms_eval/models/mantis.py @@ -27,10 +27,10 @@ from mantis.models.mllava import LlavaForConditionalGeneration, MLlavaProcessor from mantis.models.mfuyu import MFuyuForCausalLM, MFuyuProcessor from mantis.models.conversation import conv_mllava_v1 as default_conv, conv_templates - + except Exception as e: eval_logger.debug("Mantis is not installed. 
Please install Mantis to use this model.\nError: %s" % e) - + try: from transformers import AutoModelForVision2Seq, AutoProcessor except Exception as e: @@ -42,13 +42,14 @@ try: import flash_attn - + best_fit_attn_implementation = "flash_attention_2" except ImportError: best_fit_attn_implementation = "eager" DEFAULT_IMAGE_TOKEN = "" + @register_model("mantis") class Mantis(lmms): """ @@ -84,27 +85,27 @@ def __init__( else: self._device = torch.device(f"cuda:{accelerator.local_process_index}") self.device_map = f"cuda:{accelerator.local_process_index}" - + self._is_idefics = "idefics" in pretrained.lower() if isinstance(dtype, str) and dtype != "auto": dtype = getattr(torch, dtype) - + # Here we load the "non-idefics" Mantis model. if not self._is_idefics: - if 'fuyu' in pretrained.lower(): + if "fuyu" in pretrained.lower(): self._processor = MFuyuProcessor.from_pretrained(pretrained) self._model = MFuyuForCausalLM.from_pretrained(pretrained, device_map=self.device_map, attn_implementation=attn_implementation, torch_dtype=dtype) else: self._processor = MLlavaProcessor.from_pretrained(pretrained) self._model = LlavaForConditionalGeneration.from_pretrained(pretrained, device_map=self.device_map, attn_implementation=attn_implementation, torch_dtype=dtype) - + else: self._processor = AutoProcessor.from_pretrained(pretrained) self._model = AutoModelForVision2Seq.from_pretrained(pretrained, device_map=self.device_map, torch_dtype=dtype) eval_logger.info(f"Using {type(self._model)} to instantiate the Mantis model.") - + self._tokenizer = self._processor.tokenizer - + self._config = self._model.config self.model.eval() self.model.tie_weights() @@ -112,7 +113,7 @@ def __init__( self.batch_size_per_gpu = int(batch_size) self.use_cache = use_cache self.truncate_context = truncate_context - + if accelerator.num_processes > 1: assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model @@ -222,7 +223,7 @@ def flatten(self, input): def generate_until(self, requests: List[Instance]) -> List[str]: res = [] - + def _collate(x): # the negative sign on len(toks) sorts descending - this has a few advantages: # - time estimates will always be over not underestimates, which is more useful for planning @@ -243,11 +244,11 @@ def _collate(x): for chunk in chunks: contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) visuals = [doc_to_visual(self.task_dict[task][split][ids]) for ids, task, split, doc_to_visual in zip(doc_id, tasks, splits, doc_to_visuals)] - + # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. 
gen_kwargs = all_gen_kwargs[0] - + until = gen_kwargs.pop("until", None) image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio", None) @@ -261,7 +262,7 @@ def _collate(x): prompts = [] for visual, context in zip(visuals, contexts): if self._is_idefics: - # Follow the idefics implementation: + # Follow the idefics implementation: content = [] if DEFAULT_IMAGE_TOKEN not in context: for _ in visual: @@ -274,17 +275,14 @@ def _collate(x): # We follow the Mantis code base: https://github.com/TIGER-AI-Lab/Mantis/blob/main/mantis/models/mllava/utils.py#L33 to make sure they are consistent # Users don't need to define chat template as it is done here if "llama-3" in self._model.language_model.name_or_path.lower(): - conv = conv_templates['llama_3'] - terminators = [ - self._processor.tokenizer.eos_token_id, - self._processor.tokenizer.convert_tokens_to_ids("<|eot_id|>") - ] + conv = conv_templates["llama_3"] + terminators = [self._processor.tokenizer.eos_token_id, self._processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")] else: conv = default_conv terminators = None - + gen_kwargs["eos_token_id"] = terminators - + conv = conv.copy() conv.append_message(conv.roles[0], context) conv.append_message(conv.roles[1], "") @@ -292,9 +290,9 @@ def _collate(x): prompts.append(prompt) inputs = self._processor(images=visuals, text=prompts, return_tensors="pt", truncation=True) if "image_patches" in inputs.keys(): - inputs["image_patches"] = inputs["image_patches"][0] # FIXME: Fuyu model would return a list instead of a pytorch tensor. This weird behavior needs fixing. - inputs = {k: v.to(self.device) for k, v in inputs.items()} - + inputs["image_patches"] = inputs["image_patches"][0] # FIXME: Fuyu model would return a list instead of a pytorch tensor. This weird behavior needs fixing. + inputs = {k: v.to(self.device) for k, v in inputs.items()} + output_ids = self.model.generate(**inputs, **gen_kwargs) for output_id, input_id in zip(output_ids, inputs["input_ids"]): generated_id = output_id[len(input_id) :] diff --git a/lmms_eval/models/xcomposer2d5.py b/lmms_eval/models/xcomposer2d5.py index 17ed8fa2..8bfe6c65 100644 --- a/lmms_eval/models/xcomposer2d5.py +++ b/lmms_eval/models/xcomposer2d5.py @@ -40,7 +40,6 @@ def __init__( if not os.path.exists(self.tmp_folder): os.makedirs(self.tmp_folder) eval_logger.info(f"Using temporary folder: {self.tmp_folder}") - batch_size = int(batch_size) assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}." 
@@ -57,7 +56,7 @@ def __init__( else: self._device = torch.device(f"cuda:{accelerator.local_process_index}") self.device_map = f"cuda:{accelerator.local_process_index}" - + self.path = pretrained self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map=self.device_map).half().eval() self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True) @@ -130,7 +129,6 @@ def rank(self): def world_size(self): return self._world_size - def flatten(self, input): new_list = [] for i in input: @@ -168,7 +166,7 @@ def generate_until(self, requests) -> List[str]: gen_kwargs["num_beams"] = 1 try: - with torch.autocast(device_type='cuda', dtype=torch.float16): + with torch.autocast(device_type="cuda", dtype=torch.float16): response, his = self.model.chat(self.tokenizer, contexts, image, do_sample=False, num_beams=1, use_meta=True, max_new_tokens=gen_kwargs["max_new_tokens"]) except Exception as e: eval_logger.error(f"Error : {e}") diff --git a/lmms_eval/tasks/mirb/utils.py b/lmms_eval/tasks/mirb/utils.py index 2ed447d0..174659dc 100644 --- a/lmms_eval/tasks/mirb/utils.py +++ b/lmms_eval/tasks/mirb/utils.py @@ -1,4 +1,3 @@ - from lmms_eval.filters.extraction import ExtendedRegexFilter from lmms_eval.filters.transformation import MapFilter import re @@ -6,32 +5,37 @@ import logging + eval_logger = logging.getLogger("lmms-eval") + def get_task_instruction(dataset): - if dataset in ['analogy', 'attribute', 'plot_code', 'visual_chain', 'sightseeing']: - instr = 'Answer with a single word.' - elif dataset in ['codeu', 'food', 'image_jigsaw']: - instr = 'Answer with the option symbol.' - elif dataset in ['arxiv']: - instr = 'Answer with the paper title.' - elif dataset in ['count']: - instr = 'Answer with a single number.' - elif dataset in ['3d_scene']: - instr = 'The following images are different views of the same 3D scene. Answer with a single number.' - + if dataset in ["analogy", "attribute", "plot_code", "visual_chain", "sightseeing"]: + instr = "Answer with a single word." + elif dataset in ["codeu", "food", "image_jigsaw"]: + instr = "Answer with the option symbol." + elif dataset in ["arxiv"]: + instr = "Answer with the paper title." + elif dataset in ["count"]: + instr = "Answer with a single number." + elif dataset in ["3d_scene"]: + instr = "The following images are different views of the same 3D scene. Answer with a single number." + return instr + def mirb_doc_to_text(doc, model_specific_prompt_kwargs=None): - subset, question = doc['subset'], doc["questions"] + subset, question = doc["subset"], doc["questions"] task_instruction = get_task_instruction(subset) post_prompt = model_specific_prompt_kwargs["post_prompt"] pre_prompt = model_specific_prompt_kwargs["pre_prompt"] return f"{pre_prompt}{task_instruction}{question}{post_prompt}" + def mirb_doc_to_visual(doc): image_list = [image.convert("RGB") for image in doc["image_list"]] - return image_list + return image_list + def mirb_doc_to_target(doc): return doc["answers"] @@ -60,6 +64,7 @@ def extract_numbers(string): all_numbers = numbers_with_commas + numbers_scientific + numbers_simple return all_numbers + def check_is_number(string): """ Check if the given string a number. @@ -72,6 +77,7 @@ def check_is_number(string): # check if there's comma inside return False + def normalize_str(string): """ Normalize the str to lower case and make them float numbers if possible. 
@@ -97,6 +103,7 @@ def normalize_str(string): return [" " + string, string + " "] # avoid trivial matches return [string] + def parse_multi_choice_response(response): # here, we assume we have a list, in which each element is # a list of model responses for some particular input/target pair. @@ -185,10 +192,11 @@ def get_key_subresponses(response): return pred_list + def mirb_process_results(doc, results): pred = results[0] subset, answer = doc["subset"], doc["answers"] - if answer in ['A', 'B', 'C', 'D', 'E']: # MCQ tasks + if answer in ["A", "B", "C", "D", "E"]: # MCQ tasks parsed_pred = parse_multi_choice_response(pred) else: parsed_pred = parse_open_response(pred) @@ -196,6 +204,7 @@ def mirb_process_results(doc, results): data_dict = {"question_id": doc["question_id"], "subset": task_type, "pred_answer": parsed_pred, "answers": doc["answers"]} return {f"mirb_score": data_dict} + def eval_multi_choice(gold_i, pred_i): """ Evaluate a multiple choice instance. @@ -242,6 +251,7 @@ def eval_open(gold_i, pred_i): break return correct + def mirb_aggregation(results): task_num = {} score = 0 @@ -251,7 +261,7 @@ def mirb_aggregation(results): task_score[result["subset"]] = 0 task_num[result["subset"]] = 0 - if result['answers'] in ['A', 'B', 'C', 'D', 'E']: # MCQ tasks + if result["answers"] in ["A", "B", "C", "D", "E"]: # MCQ tasks correct = eval_multi_choice(result["answers"], result["pred_answer"]) task_score[result["subset"]] += correct score += correct @@ -262,7 +272,7 @@ def mirb_aggregation(results): task_num[result["subset"]] += 1 avg_score = score / len(results) - task_score = {k : v / task_num[k] for k,v in task_score.items()} + task_score = {k: v / task_num[k] for k, v in task_score.items()} print("Performances for different subsets:") print("=" * 50) @@ -271,12 +281,7 @@ def mirb_aggregation(results): print("=" * 50) # print across evaluation dimension - groups = { - "Knowledge": ["food", "sightseeing"], - "Reasoning": ["codeu", "plot_code", "analogy", "3d_scene"], - "Perception": ["image_jigsaw", "count", "attribute"], - "Multi-Hop": ["visual_chain", "arxiv"] - } + groups = {"Knowledge": ["food", "sightseeing"], "Reasoning": ["codeu", "plot_code", "analogy", "3d_scene"], "Perception": ["image_jigsaw", "count", "attribute"], "Multi-Hop": ["visual_chain", "arxiv"]} # Compute the averages for each group averages_dict = compute_averages_from_task_scores(task_score, groups) @@ -287,7 +292,7 @@ def mirb_aggregation(results): for group, score in averages_dict.items(): print(f"{group} : {score:.2f}") print("=" * 50) - + return avg_score @@ -297,4 +302,4 @@ def compute_averages_from_task_scores(task_score, groups): for group, features in groups.items(): values = [task_score[feature] for feature in features] averages[group] = np.mean(values) - return averages \ No newline at end of file + return averages diff --git a/lmms_eval/tasks/mmstar/utils.py b/lmms_eval/tasks/mmstar/utils.py index aba6dd6a..5d3a8f6e 100644 --- a/lmms_eval/tasks/mmstar/utils.py +++ b/lmms_eval/tasks/mmstar/utils.py @@ -92,4 +92,4 @@ def mmstar_aggregate_results(results): eval_logger.info(f"{l2_category}: {avg_score:.2f}") avg_score = sum(l2_category_avg_score.values()) / len(l2_category_avg_score) - return avg_score \ No newline at end of file + return avg_score diff --git a/lmms_eval/tasks/seedbench_2_plus/utils.py b/lmms_eval/tasks/seedbench_2_plus/utils.py index 3182b4bc..b722d250 100755 --- a/lmms_eval/tasks/seedbench_2_plus/utils.py +++ b/lmms_eval/tasks/seedbench_2_plus/utils.py @@ -1,8 +1,10 @@ import json + 
def seed_doc_to_visual(doc): return [doc["image"].convert("RGB")] + def parse_choice_img(choice: str, img_token: str): if "jpg" in choice or "png" in choice: return img_token @@ -16,7 +18,7 @@ def seed_doc_to_text(doc, model_specific_kwargs=None): question += f"B. {parse_choice_img(doc['choice_B'], model_specific_kwargs['img_token'])}\n" question += f"C. {parse_choice_img(doc['choice_C'], model_specific_kwargs['img_token'])}\n" question += f"D. {parse_choice_img(doc['choice_D'], model_specific_kwargs['img_token'])}" - + return f"{question}\n{model_specific_kwargs['post_prompt']}" diff --git a/pyproject.toml b/pyproject.toml index cb9f04e6..b5512035 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,9 +79,6 @@ dependencies = [ ] [project.optional-dependencies] -vila = [ - "s2wrapper@git+https://github.com/bfshi/scaling_on_scales" -] gemini = [ "google-generativeai", ]
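
---

With the `vila` extra removed from `pyproject.toml`, the `s2wrapper` dependency for [VILA](https://github.com/NVlabs/VILA) is installed directly, as the new README section above describes. Below is a minimal sketch of the resulting workflow; the checkpoint name, task, and launcher flags are illustrative assumptions based on the usual lmms-eval CLI and are not part of this patch.

```bash
# Install the VILA-specific dependency manually (per the new README section);
# it is no longer available through the `[vila]` optional-dependency extra.
pip install s2wrapper@git+https://github.com/bfshi/scaling_on_scales

# Then launch an evaluation with the registered "vila" model.
# The checkpoint, task, and process count are illustrative placeholders,
# assuming the standard lmms-eval accelerate entry point.
accelerate launch --num_processes=8 -m lmms_eval \
    --model vila \
    --model_args pretrained="Efficient-Large-Model/VILA1.5-3b" \
    --tasks mme \
    --batch_size 1 \
    --log_samples \
    --output_path ./logs/
```

One consequence of this change is that the base `pip install -e .` no longer pulls in a git-hosted dependency; users who do not evaluate VILA are unaffected, while VILA users follow the one-line install documented in the README.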