Merge pull request #184 from EvolvingLMMs-Lab/pufanyi/pypi/0.2.2
New pypi
Luodian authored Aug 8, 2024
2 parents 2af043e + a365bf7 commit 3f89773
Showing 11 changed files with 70 additions and 65 deletions.
9 changes: 7 additions & 2 deletions README.md
@@ -49,7 +49,7 @@ cd lmms-eval
pip install -e .
```

If you wanted to test llava, you will have to clone their repo from [LLaVA](https://github.com/haotian-liu/LLaVA) and
If you want to test LLaVA, you will have to clone their repo from [LLaVA](https://github.com/haotian-liu/LLaVA) and
```bash
# for llava 1.5
# git clone https://github.com/haotian-liu/LLaVA
@@ -69,7 +69,7 @@ You can check the [environment install script](miscs/repr_scripts.sh) and [torch

</details>

If you want to test on caption dataset such as `coco`, `refcoco`, and `nocaps`, you will need to have `java==1.8.0 ` to let pycocoeval api to work. If you don't have it, you can install by using conda
If you want to test on caption dataset such as `coco`, `refcoco`, and `nocaps`, you will need to have `java==1.8.0` to let pycocoeval api to work. If you don't have it, you can install by using conda
```
conda install openjdk=8
```
@@ -93,6 +93,11 @@ We also provide the raw data exported from Weights & Biases for the detailed res
</details>
<br>

If you want to test [VILA](https://github.com/NVlabs/VILA), you should install the following dependencies:

```bash
pip install s2wrapper@git+https://github.com/bfshi/scaling_on_scales
```
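
A quick sanity check before launching a VILA run (not from the repo; it assumes the package installed above exposes the import name `s2wrapper`, which the diff does not state):

```python
# Hedged check for the optional VILA dependency installed above.
# Assumption: bfshi/scaling_on_scales installs under the import name `s2wrapper`.
try:
    import s2wrapper  # noqa: F401
    print("s2wrapper is available")
except ImportError:
    print("s2wrapper is missing; install it with the pip command above")
```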

Our Development will be continuing on the main branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub.

4 changes: 1 addition & 3 deletions lmms_eval/api/samplers.py
@@ -37,9 +37,7 @@ def get_context(self, doc, num_fewshot):
+ (
str(self.doc_to_target(doc)[0])
if type(self.doc_to_target(doc)) is list
else self.doc_to_target(doc)
if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str)
else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
)
for doc in selected_docs
]
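The single remaining line above folds three cases into one conditional expression. A standalone sketch of the same selection logic, using a hypothetical helper rather than the library's sampler internals:

```python
# Sketch of the target-selection logic in the hunk above (hypothetical helper, not part of
# lmms_eval): prefer the first element when the target is a list, return the target directly
# when there is no choice list or it is already a string, otherwise index into the choices.
def resolve_target(target, choices=None):
    if isinstance(target, list):
        return str(target[0])
    if choices is None or isinstance(target, str):
        return target
    return str(choices[target])

print(resolve_target(["A", "B"]))          # "A"   -> list target, take the first element
print(resolve_target("yes"))               # "yes" -> plain string target
print(resolve_target(1, ["cat", "dog"]))   # "dog" -> integer target indexes the choices
```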
2 changes: 1 addition & 1 deletion lmms_eval/models/__init__.py
@@ -41,7 +41,7 @@
"video_llava": "VideoLLaVA",
"vila": "VILA",
"xcomposer2_4KHD": "XComposer2_4KHD",
"xcomposer2d5": "XComposer2D5"
"xcomposer2d5": "XComposer2D5",
}

for model_name, model_class in AVAILABLE_MODELS.items():
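The change here is only a trailing comma after the last registry entry, which keeps future model additions to one-line diffs. For context, a sketch of the pattern this dict feeds (hypothetical package path; the actual import mechanics in lmms_eval may differ): each name is resolved lazily, so a missing optional dependency only disables that one model.

```python
# Sketch of a name-to-class model registry with lazy imports (hypothetical package path).
import importlib

AVAILABLE_MODELS = {
    "video_llava": "VideoLLaVA",
    "vila": "VILA",
}

loaded = {}
for model_name, model_class in AVAILABLE_MODELS.items():
    try:
        module = importlib.import_module(f"my_models.{model_name}")  # hypothetical package
        loaded[model_class] = getattr(module, model_class)
    except ImportError as e:
        print(f"Skipping {model_name}: {e}")  # one missing dependency does not break the rest
```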
2 changes: 1 addition & 1 deletion lmms_eval/models/llava_hf.py
@@ -320,7 +320,7 @@ def _collate(x):
pad_token_id=self.tokenizer.eos_token_id,
eos_token_id=self.specified_eot_token_id,
)
cont = cont[:, inputs["input_ids"].shape[-1]:]
cont = cont[:, inputs["input_ids"].shape[-1] :]
except Exception as e:
eval_logger.error(f"Error {e} in generating")
cont = ""
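The llava_hf.py change is formatting only (spacing around the slice bound), but the line itself implements a common post-generation step: `generate` returns the prompt followed by the continuation, and slicing at `input_ids.shape[-1]` drops the echoed prompt. A toy sketch, not taken from the model code:

```python
# Toy tensors illustrating the prompt-stripping slice used in the hunk above.
import torch

input_ids = torch.tensor([[11, 12, 13, 14]])                       # stand-in prompt ids
generated = torch.cat([input_ids, torch.tensor([[21, 22, 23]])], dim=-1)  # prompt + new tokens

continuation = generated[:, input_ids.shape[-1]:]                  # drop the echoed prompt
print(continuation)  # tensor([[21, 22, 23]])
```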
2 changes: 2 additions & 0 deletions lmms_eval/models/llava_vid.py
@@ -28,11 +28,13 @@

try:
from llavavid.model.language_model.llava_qwen import LlavaQwenConfig

AutoConfig.register("llava_qwen", LlavaQwenConfig)
except ImportError:
eval_logger.debug("No Qwen for llava vid")

from llavavid.model.language_model.llava_llama import LlavaConfig

AutoConfig.register("llava_llama", LlavaConfig)


46 changes: 22 additions & 24 deletions lmms_eval/models/mantis.py
@@ -27,10 +27,10 @@
from mantis.models.mllava import LlavaForConditionalGeneration, MLlavaProcessor
from mantis.models.mfuyu import MFuyuForCausalLM, MFuyuProcessor
from mantis.models.conversation import conv_mllava_v1 as default_conv, conv_templates

except Exception as e:
eval_logger.debug("Mantis is not installed. Please install Mantis to use this model.\nError: %s" % e)

try:
from transformers import AutoModelForVision2Seq, AutoProcessor
except Exception as e:
@@ -42,13 +42,14 @@

try:
import flash_attn

best_fit_attn_implementation = "flash_attention_2"
except ImportError:
best_fit_attn_implementation = "eager"

DEFAULT_IMAGE_TOKEN = "<image>"


@register_model("mantis")
class Mantis(lmms):
"""
@@ -84,35 +84,35 @@ def __init__(
else:
self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.device_map = f"cuda:{accelerator.local_process_index}"

self._is_idefics = "idefics" in pretrained.lower()
if isinstance(dtype, str) and dtype != "auto":
dtype = getattr(torch, dtype)

# Here we load the "non-idefics" Mantis model.
if not self._is_idefics:
if 'fuyu' in pretrained.lower():
if "fuyu" in pretrained.lower():
self._processor = MFuyuProcessor.from_pretrained(pretrained)
self._model = MFuyuForCausalLM.from_pretrained(pretrained, device_map=self.device_map, attn_implementation=attn_implementation, torch_dtype=dtype)
else:
self._processor = MLlavaProcessor.from_pretrained(pretrained)
self._model = LlavaForConditionalGeneration.from_pretrained(pretrained, device_map=self.device_map, attn_implementation=attn_implementation, torch_dtype=dtype)

else:
self._processor = AutoProcessor.from_pretrained(pretrained)
self._model = AutoModelForVision2Seq.from_pretrained(pretrained, device_map=self.device_map, torch_dtype=dtype)
eval_logger.info(f"Using {type(self._model)} to instantiate the Mantis model.")

self._tokenizer = self._processor.tokenizer

self._config = self._model.config
self.model.eval()
self.model.tie_weights()
self.truncation = truncation
self.batch_size_per_gpu = int(batch_size)
self.use_cache = use_cache
self.truncate_context = truncate_context

if accelerator.num_processes > 1:
assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
# If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
@@ -222,7 +223,7 @@ def flatten(self, input):

def generate_until(self, requests: List[Instance]) -> List[str]:
res = []

def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
@@ -243,11 +244,11 @@ def _collate(x):
for chunk in chunks:
contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk)
visuals = [doc_to_visual(self.task_dict[task][split][ids]) for ids, task, split, doc_to_visual in zip(doc_id, tasks, splits, doc_to_visuals)]

# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]

until = gen_kwargs.pop("until", None)
image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio", None)

@@ -261,7 +262,7 @@ def _collate(x):
prompts = []
for visual, context in zip(visuals, contexts):
if self._is_idefics:
# Follow the idefics implementation:
# Follow the idefics implementation:
content = []
if DEFAULT_IMAGE_TOKEN not in context:
for _ in visual:
@@ -274,27 +275,24 @@ def _collate(x):
# We follow the Mantis code base: https://github.com/TIGER-AI-Lab/Mantis/blob/main/mantis/models/mllava/utils.py#L33 to make sure they are consistent
# Users don't need to define chat template as it is done here
if "llama-3" in self._model.language_model.name_or_path.lower():
conv = conv_templates['llama_3']
terminators = [
self._processor.tokenizer.eos_token_id,
self._processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
conv = conv_templates["llama_3"]
terminators = [self._processor.tokenizer.eos_token_id, self._processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
else:
conv = default_conv
terminators = None

gen_kwargs["eos_token_id"] = terminators

conv = conv.copy()
conv.append_message(conv.roles[0], context)
conv.append_message(conv.roles[1], "")
prompt = conv.get_prompt()
prompts.append(prompt)
inputs = self._processor(images=visuals, text=prompts, return_tensors="pt", truncation=True)
if "image_patches" in inputs.keys():
inputs["image_patches"] = inputs["image_patches"][0] # FIXME: Fuyu model would return a list instead of a pytorch tensor. This weird behavior needs fixing.
inputs = {k: v.to(self.device) for k, v in inputs.items()}
inputs["image_patches"] = inputs["image_patches"][0] # FIXME: Fuyu model would return a list instead of a pytorch tensor. This weird behavior needs fixing.
inputs = {k: v.to(self.device) for k, v in inputs.items()}

output_ids = self.model.generate(**inputs, **gen_kwargs)
for output_id, input_id in zip(output_ids, inputs["input_ids"]):
generated_id = output_id[len(input_id) :]
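Beyond formatting, the mantis.py hunks keep the `_collate` comment about sorting requests by negative token length. A toy illustration of why descending order helps, separate from the lmms_eval implementation:

```python
# Toy data showing the descending-length ordering described in the _collate comment above:
# the longest requests are scheduled first, so batch-size problems surface early and
# running-time estimates err on the high side rather than the low side.
requests = [
    "short prompt",
    "a considerably longer prompt with many more tokens in it than the others",
    "medium length prompt here",
]

ordered = sorted(requests, key=lambda text: (-len(text.split()), text))
for r in ordered:
    print(len(r.split()), r)  # longest first
```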
6 changes: 2 additions & 4 deletions lmms_eval/models/xcomposer2d5.py
@@ -40,7 +40,6 @@ def __init__(
if not os.path.exists(self.tmp_folder):
os.makedirs(self.tmp_folder)
eval_logger.info(f"Using temporary folder: {self.tmp_folder}")


batch_size = int(batch_size)
assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}."
@@ -57,7 +56,7 @@ def __init__(
else:
self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.device_map = f"cuda:{accelerator.local_process_index}"

self.path = pretrained
self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map=self.device_map).half().eval()
self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True)
@@ -130,7 +129,6 @@ def rank(self):
def world_size(self):
return self._world_size


def flatten(self, input):
new_list = []
for i in input:
@@ -168,7 +166,7 @@ def generate_until(self, requests) -> List[str]:
gen_kwargs["num_beams"] = 1

try:
with torch.autocast(device_type='cuda', dtype=torch.float16):
with torch.autocast(device_type="cuda", dtype=torch.float16):
response, his = self.model.chat(self.tokenizer, contexts, image, do_sample=False, num_beams=1, use_meta=True, max_new_tokens=gen_kwargs["max_new_tokens"])
except Exception as e:
eval_logger.error(f"Error : {e}")
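The reformatted `torch.autocast(device_type="cuda", dtype=torch.float16)` line above wraps the chat call so that eligible ops run in half precision without converting the model in place. A minimal, standalone sketch of that pattern (toy tensors, not the model's chat call):

```python
# Toy sketch of autocast: ops inside the context run in float16 on CUDA where safe.
import torch

if torch.cuda.is_available():
    x = torch.randn(4, 4, device="cuda")
    w = torch.randn(4, 4, device="cuda")
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        y = x @ w
    print(y.dtype)  # torch.float16
else:
    print("CUDA not available; this sketch targets GPU execution")
```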