From 9d632a28c5f521712b4ced30160b7871279ad924 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 17:32:00 -0800
Subject: [PATCH 01/21] First draft of the VLM Huggingface client

---
 .../vision_language/huggingface_vlm_client.py | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 src/helm/proxy/clients/vision_language/huggingface_vlm_client.py

diff --git a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
new file mode 100644
index 00000000000..4b30b724619
--- /dev/null
+++ b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
@@ -0,0 +1,101 @@
+from threading import Lock
+from typing import Dict, List, Optional
+
+from transformers import pipeline
+from transformers.pipelines import ImageToTextPipeline
+
+from helm.common.cache import CacheConfig
+from helm.common.images_utils import open_image
+from helm.common.media_object import TEXT_TYPE
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.request import Request, RequestResult, Sequence, Token
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+)
+from helm.common.request import wrap_request_time
+from helm.proxy.clients.client import CachingClient, generate_uid_for_multimodal_prompt
+from helm.proxy.tokenizers.tokenizer import Tokenizer
+
+try:
+    from PIL import Image
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["images"])
+
+
+class HuggingFaceVLMClient(CachingClient):
+    """
+    General client for VLM models from HuggingFace.
+    """
+
+    _models_lock: Lock = Lock()
+    _models: Dict[str, ImageToTextPipeline] = {}
+    _models_aliases: Dict[str, str] = {}
+
+    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
+        super().__init__(cache_config=cache_config)
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+
+    def _get_model(self, model_name: str) -> ImageToTextPipeline:
+        with self._models_lock:
+            model_id: str = self._models_aliases.get(model_name, model_name)
+            if model_id not in self._models:
+                self._models[model_id] = pipeline("image-to-text", model=model_id)
+            return self._models[model_id]
+
+    def make_request(self, request: Request) -> RequestResult:
+        assert request.multimodal_prompt is not None, "Multimodal prompt is required"
+
+        # Build the prompt
+        prompt: str = ""
+        image: Optional[Image.Image] = None
+        for media_object in request.multimodal_prompt.media_objects:
+            if media_object.is_type("image") and media_object.location:
+                if image is not None:
+                    raise ValueError("Only one image is supported in the multimodal prompt")
+                image = open_image(media_object.location)
+            elif media_object.is_type(TEXT_TYPE):
+                if media_object.text is None:
+                    raise ValueError("MediaObject of text type has missing text field value")
+                prompt += f"\n{media_object.text}"
+            else:
+                raise ValueError(f"Unsupported media object type: {media_object.type}")
+
+        # Generate
+        try:
+            generation_args = {
+                "max_new_tokens": request.max_tokens,
+            }
+
+            def do_it():
+                model: ImageToTextPipeline = self._get_model(request.model_deployment)
+                outputs = model(image, prompt=prompt, generate_kwargs=generation_args)
+                return outputs[0]
+
+            cache_key = CachingClient.make_cache_key(
+                raw_request={
+                    "model": request.model,
+                    "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
+                    **generation_args,
+                },
+                request=request,
+            )
+            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        except RuntimeError as e:
+            return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
+
+        tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
+            TokenizationRequest(result["generated_text"], tokenizer=self.tokenizer_name)
+        )
+        tokens: List[Token] = [
+            Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
+        ]
+        completions: List[Sequence] = [Sequence(text=result["generated_text"], logprob=0, tokens=tokens)]
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=result["request_time"],
+            completions=completions,
+            embedding=[],
+        )
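Before the config wiring lands in the next patches, here is a minimal sketch of how this client could be driven directly. BlackHoleCacheConfig, MediaObject, and MultimediaObject are existing HELM classes, but the tokenizer construction and the exact Request fields below are assumptions for illustration, not something this patch defines:

    from helm.common.cache import BlackHoleCacheConfig
    from helm.common.media_object import MediaObject, MultimediaObject
    from helm.common.request import Request
    from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
    from helm.proxy.clients.vision_language.huggingface_vlm_client import HuggingFaceVLMClient

    # No-op cache for a quick local experiment; a real run would use SQLite or Mongo.
    cache_config = BlackHoleCacheConfig()
    client = HuggingFaceVLMClient(
        tokenizer=HuggingFaceTokenizer(cache_config=cache_config),  # assumed constructor
        tokenizer_name="hf-internal-testing/llama-tokenizer",
        cache_config=cache_config,
    )
    request = Request(
        model="llava-hf/llava-1.5-7b-hf",
        model_deployment="llava-hf/llava-1.5-7b-hf",
        max_tokens=100,
        multimodal_prompt=MultimediaObject(media_objects=[
            MediaObject(content_type="image/jpeg", location="view.jpg"),  # any local image
            MediaObject(content_type="text/plain", text="What is shown in this image?"),
        ]),
    )
    print(client.make_request(request).completions[0].text)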
From 1f58a5af99ecfe7674cf89654a3961c859260ff2 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 17:41:45 -0800
Subject: [PATCH 02/21] Added Llava 1.5 7B

---
 src/helm/config/model_deployments.yaml | 7 +++++++
 src/helm/config/model_metadata.yaml    | 9 +++++++++
 2 files changed, 16 insertions(+)

diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
index 1c0b45211e5..70df6fde037 100644
--- a/src/helm/config/model_deployments.yaml
+++ b/src/helm/config/model_deployments.yaml
@@ -691,6 +691,13 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient"
 
+  - name: llava-hf/llava-1.5-7b-hf
+    model_name: llava-hf/llava-1.5-7b-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client"
+
   # Neurips
   - name: neurips/local
     model_name: neurips/local
diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index cec71fb00c6..f21f3868471 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -1067,6 +1067,15 @@ models:
     release_date: 2022-01-28
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
 
+  - name: microsoft/llava-1.5-7b-hf
+    display_name: LLaVA 1.5 (7B)
+    description: LLaVa is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data.
+    creator_organization_name: Microsoft
+    access: open
+    num_parameters: 7000000000
+    release_date: 2023-10-05
+    tags: [VISION_LANGUAGE_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
 
 
   # 01.AI
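With the deployment and metadata entries registered, the model becomes addressable from the benchmark CLI. A hedged example invocation (the scenario name and flag spellings vary across HELM versions, so treat this as illustrative only):

    helm-run --run-specs "vqa:model=llava-hf/llava-1.5-7b-hf" --suite vlm-test --max-eval-instances 10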
From 3a995e2df659a2506e75bf639cd8fe3827e89ca2 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 17:44:49 -0800
Subject: [PATCH 03/21] update requirements

---
 requirements.txt | 3 ++-
 setup.cfg        | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 16c53e0ca08..23f40c77cf0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 2captcha-python==1.1.3
 absl-py==1.2.0
+accelerate=0.25.0
 aiodns==3.0.0
 aiohttp==3.8.5
 aiohttp-retry==2.8.3
@@ -167,7 +168,7 @@ torchvision==0.13.1 ; sys_platform == "darwin"
 torch==1.12.1+cu113 ; sys_platform == "linux"
 torchvision==0.13.1+cu113 ; sys_platform == "linux"
 tqdm==4.64.1
-transformers==4.33.1
+transformers==4.36.0
 trio==0.22.0
 trio-websocket==0.9.2
 typer==0.4.2
diff --git a/setup.cfg b/setup.cfg
index 843619d6dba..c3703582c68 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,7 +52,7 @@ install_requires=
     scikit-learn~=1.1.2
 
     # Models and Metrics Extras
-    transformers~=4.33.1  # For anthropic_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
+    transformers~=4.36.0  # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
     # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
     torch>=1.12.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
     torchvision>=0.13.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
@@ -98,7 +98,7 @@ cleva =
     langdetect==1.0.9
 
 images =
-    accelerate~=0.23.0  # For the newer versions of Transformers
+    accelerate~=0.25.0  # For the newer versions of Transformers
    pillow~=9.4.0
 
 mongo =

From 774e3d06ab9b2e3054cd59680ef82a0d2ccf9ed4 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 19:11:38 -0800
Subject: [PATCH 04/21] Add test scripts

---
 test_llava.py           | 21 +++++++++++++++++++
 test_llava_quantized.py | 24 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 test_llava.py
 create mode 100644 test_llava_quantized.py

diff --git a/test_llava.py b/test_llava.py
new file mode 100644
index 00000000000..4abdc46efc8
--- /dev/null
+++ b/test_llava.py
@@ -0,0 +1,21 @@
+import requests
+from PIL import Image
+
+image_url = "https://llava-vl.github.io/static/images/view.jpg"
+image = Image.open(requests.get(image_url, stream=True).raw)
+
+
+import torch
+
+from transformers import pipeline
+
+model_id = "llava-hf/llava-1.5-7b-hf"
+
+pipe = pipeline("image-to-text", model=model_id)
+
+max_new_tokens = 200
+prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
+
+outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+
+print(outputs[0]["generated_text"])
diff --git a/test_llava_quantized.py b/test_llava_quantized.py
new file mode 100644
index 00000000000..8c34c304c04
--- /dev/null
+++ b/test_llava_quantized.py
@@ -0,0 +1,24 @@
+import requests
+from PIL import Image
+
+image_url = "https://llava-vl.github.io/static/images/view.jpg"
+image = Image.open(requests.get(image_url, stream=True).raw)
+
+
+import torch
+from transformers import BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+
+from transformers import pipeline
+
+model_id = "llava-hf/llava-1.5-7b-hf"
+
+pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
+
+max_new_tokens = 200
+prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
+
+outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+
+print(outputs[0]["generated_text"])
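The quantized variant of the test script exists mainly to fit the model onto smaller GPUs. Rough weights-only arithmetic (ignoring activations and the vision tower) shows why 4-bit loading matters for a 7B checkpoint:

    params = 7e9                                   # llava-1.5-7b parameter count
    print(f"fp16:  {params * 2 / 1e9:.1f} GB")     # 2 bytes per param   -> ~14.0 GB
    print(f"4-bit: {params * 0.5 / 1e9:.1f} GB")   # 0.5 bytes per param -> ~3.5 GB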
From b629fabb444ae1792a1316e920c376f793e7dc24 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 19:13:23 -0800
Subject: [PATCH 05/21] Fix typo in requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 23f40c77cf0..9ae700e47ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 2captcha-python==1.1.3
 absl-py==1.2.0
-accelerate=0.25.0
+accelerate==0.25.0
 aiodns==3.0.0
 aiohttp==3.8.5
 aiohttp-retry==2.8.3

From 0f1abe2ba7d606813383685baa62d80baddcfba5 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 19:16:03 -0800
Subject: [PATCH 06/21] Loosening huggingface-hub version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 9ae700e47ea..0d60b8aed0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -57,7 +57,7 @@ greenlet==1.1.3
 gunicorn==20.1.0
 h11==0.14.0
 httplib2==0.20.4
-huggingface-hub==0.15.1
+huggingface-hub>=0.15.1
 icetk==0.0.4
 identify==2.5.6
 idna==3.4

From e7e92636599a6b903f010b147635636169f241db Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 19:17:20 -0800
Subject: [PATCH 07/21] Loosening tokenizers version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0d60b8aed0c..e6f2f482e91 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -160,7 +160,7 @@ thinc==8.1.12
 threadpoolctl==3.1.0
 tiktoken==0.3.3
 tls-client==0.1.8
-tokenizers==0.13.3
+tokenizers>=0.13.3
 toml==0.10.2
 tomli==2.0.1
 torch==1.12.1 ; sys_platform == "darwin"

From 80b5422548cb5a2c58972f7c3c6319f84e9e3688 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Fri, 12 Jan 2024 19:18:32 -0800
Subject: [PATCH 08/21] Loosening fsspec version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e6f2f482e91..54b87e47e1a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -43,7 +43,7 @@ filelock==3.8.0
 flake8==5.0.4
 fonttools==4.37.4
 frozenlist==1.3.1
-fsspec==2022.8.2
+fsspec>=2022.8.2
 gdown==4.4.0
 gevent==21.12.0
 gin-config==0.5.0

From 046037618b1e6d9caedd627a295fe0afa6b81d54 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 07:58:15 -0800
Subject: [PATCH 09/21] Loosening HEIM's accelerate version

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index c3703582c68..028547d2e55 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -140,7 +140,7 @@ heim =
     gdown~=4.4.0
 
     # HEIM models
-    accelerate~=0.23.0
+    accelerate~=0.25.0
     diffusers~=0.24.0
     jax~=0.4.13
     jaxlib~=0.4.13

From 2a39e4b04f9c685aeaa431ada5f0a94cc06c6c5c Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 08:03:53 -0800
Subject: [PATCH 10/21] Loosening AlephAlpha's tokenizers version

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 028547d2e55..bd98e3f6570 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -107,7 +107,7 @@ mongo =
 # Model extras
 aleph-alpha =
     aleph-alpha-client~=2.14.0
-    tokenizers~=0.13.3
+    tokenizers>=0.13.3
 
 anthropic =
     anthropic~=0.2.5
From 90b858c80d713fda5e5b287fa59630d6d86bc7c9 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 08:56:52 -0800
Subject: [PATCH 11/21] Update model deployment name and alias for Llava

---
 src/helm/config/model_deployments.yaml        | 15 ++++++++-------
 .../vision_language/huggingface_vlm_client.py |  4 +++-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
index 70df6fde037..fd1261be848 100644
--- a/src/helm/config/model_deployments.yaml
+++ b/src/helm/config/model_deployments.yaml
@@ -441,6 +441,14 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
 
+  ## Microsoft
+  - name: huggingface/llava-1.5-7b-hf
+    model_name: microsoft/llava-1.5-7b-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client"
+
   ## OpenAI
   - name: huggingface/gpt2
     model_name: openai/gpt2
@@ -691,13 +699,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient"
 
-  - name: llava-hf/llava-1.5-7b-hf
-    model_name: llava-hf/llava-1.5-7b-hf
-    tokenizer_name: hf-internal-testing/llama-tokenizer
-    max_sequence_length: 2048
-    client_spec:
-      class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client"
-
   # Neurips
   - name: neurips/local
     model_name: neurips/local
diff --git a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
index 4b30b724619..b9ce2da562a 100644
--- a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
+++ b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
@@ -30,7 +30,9 @@ class HuggingFaceVLMClient(CachingClient):
 
     _models_lock: Lock = Lock()
     _models: Dict[str, ImageToTextPipeline] = {}
-    _models_aliases: Dict[str, str] = {}
+    _models_aliases: Dict[str, str] = {
+        "huggingface/llava-1.5-7b-hf": "llava-hf/llava-1.5-7b-hf",
+    }
 
     def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
         super().__init__(cache_config=cache_config)

From 387c83d1a4e9a898a117b391c37ddba06e28cd91 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 08:59:22 -0800
Subject: [PATCH 12/21] Update Llava's tag

---
 src/helm/config/model_metadata.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index f21f3868471..37250cdae7e 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -1074,7 +1074,7 @@ models:
     access: open
     num_parameters: 7000000000
     release_date: 2023-10-05
-    tags: [VISION_LANGUAGE_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [VISION_LANGUAGE_MODEL_TAG]
 
 
 

From 7f64441b0010be701d90a7ba097942aec4dfe894 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 09:59:01 -0800
Subject: [PATCH 13/21] Fix typo in model deployment for Llava

---
 src/helm/config/model_deployments.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
index fd1261be848..3f68c7e5e1b 100644
--- a/src/helm/config/model_deployments.yaml
+++ b/src/helm/config/model_deployments.yaml
@@ -447,7 +447,7 @@ model_deployments:
     tokenizer_name: hf-internal-testing/llama-tokenizer
     max_sequence_length: 2048
     client_spec:
-      class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client"
+      class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
 
   ## OpenAI
   - name: huggingface/gpt2
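Patches 11-13 together separate the HELM deployment name (huggingface/llava-1.5-7b-hf) from the Hugging Face repo id (llava-hf/llava-1.5-7b-hf), with _models_aliases bridging the two inside _get_model. The resolution is a plain dict.get with the incoming name as fallback; a small standalone illustration:

    aliases = {"huggingface/llava-1.5-7b-hf": "llava-hf/llava-1.5-7b-hf"}

    def resolve(model_name: str) -> str:
        # Unknown names fall through unchanged, so raw HF repo ids keep working.
        return aliases.get(model_name, model_name)

    assert resolve("huggingface/llava-1.5-7b-hf") == "llava-hf/llava-1.5-7b-hf"
    assert resolve("llava-hf/llava-1.5-7b-hf") == "llava-hf/llava-1.5-7b-hf"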
From 37d97a07b6d764a2207426921092a8bfd7f6f083 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 10:46:20 -0800
Subject: [PATCH 14/21] Add Llava Run Expander

---
 src/helm/benchmark/model_metadata_registry.py |  2 ++
 src/helm/benchmark/run_expander.py            | 22 +++++++++++++++++++
 src/helm/benchmark/run_specs.py               |  6 +++++
 src/helm/config/model_metadata.yaml           |  2 +-
 4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/helm/benchmark/model_metadata_registry.py b/src/helm/benchmark/model_metadata_registry.py
index 1a8e8bf9a77..a4b85edc813 100644
--- a/src/helm/benchmark/model_metadata_registry.py
+++ b/src/helm/benchmark/model_metadata_registry.py
@@ -54,6 +54,8 @@
 VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
 # IDEFICS require a special prompt format (see `IDEFICSInstructRunExpander`)
 IDEFICS_INSTRUCT_MODEL_TAG: str = "IDEFICS_INSTRUCT_MODEL_TAG"
+# Llava should use a special prompt format (see `LlavaRunExpander`)
+LLAVA_MODEL_TAG: str = "LLAVA_MODEL_TAG"
 
 
 # Frozen is set to false as the model_deployment_registry.py file
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index 122b89decb7..2a0ca440d7a 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -396,6 +396,28 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
     ]
 
 
+class LlavaRunExpander(RunExpander):
+    """
+    Custom prompt for Llava 1.5 models which should use a specific format.
+    See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
+    """
+
+    name = "llava"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="USER: ",
+                    output_prefix="\nASSISTANT: ",
+                ),
+            ),
+        ]
+
+
 class FormatPromptRunExpander(RunExpander):
     """Adds a prefix and suffix to the prompt."""
 
diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py
index 775491e66c0..b2d798007f4 100644
--- a/src/helm/benchmark/run_specs.py
+++ b/src/helm/benchmark/run_specs.py
@@ -31,6 +31,7 @@
     OpenAIRunExpander,
     GoogleRunExpander,
     IDEFICSInstructRunExpander,
+    LlavaRunExpander,
     StopRunExpander,
     ChatMLRunExpander,
     IncreaseTemperatureRunExpander,
@@ -64,6 +65,7 @@
     GOOGLE_PALM_2_MODEL_TAG,
     GOOGLE_GEMINI_MODEL_TAG,
     IDEFICS_INSTRUCT_MODEL_TAG,
+    LLAVA_MODEL_TAG,
     NO_NEWLINES_TAG,
     NLG_PREFIX_TAG,
     CHATML_MODEL_TAG,
@@ -3087,6 +3089,10 @@ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
     if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
         run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
 
+    # Llava
+    if LLAVA_MODEL_TAG in model.tags:
+        run_spec = singleton(LlavaRunExpander().expand(run_spec))
+
     # For multiple choice
     if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
         increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index 641fd33329d..b860defc29a 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -1082,7 +1082,7 @@ models:
     access: open
     num_parameters: 7000000000
     release_date: 2023-10-05
-    tags: [VISION_LANGUAGE_MODEL_TAG]
+    tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_TAG]
 
 
 
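For reference, LLaVA 1.5 expects a chat-style prompt of the form "USER: <image>\n{question}\nASSISTANT:" (the same format used by the test scripts in patch 04); this expander contributes the USER:/ASSISTANT: framing, while the <image> token is supplied on the client side later in the series. A worked example of the text the adapter produces with these prefixes (the question is a placeholder):

    input_prefix = "USER: "
    output_prefix = "\nASSISTANT: "
    question = "What are the things I should be cautious about when I visit this place?"
    print(input_prefix + question + output_prefix)
    # USER: What are the things I should be cautious about when I visit this place?
    # ASSISTANT: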
From 44de313059fa3bc454b53401ce8cfd3c7f786fa0 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 11:01:01 -0800
Subject: [PATCH 15/21] Add Llava 13b and bakllava

---
 src/helm/config/model_deployments.yaml | 15 +++++++++++++++
 src/helm/config/model_metadata.yaml    | 20 +++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
index 65e0230737d..ceb9478ea13 100644
--- a/src/helm/config/model_deployments.yaml
+++ b/src/helm/config/model_deployments.yaml
@@ -457,6 +457,21 @@ model_deployments:
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
+  - name: huggingface/llava-1.5-13b-hf
+    model_name: microsoft/llava-1.5-13b-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
+  ## Mistral AI
+  - name: huggingface/bakLlava-v1-hf
+    model_name: mistralai/bakLlava-v1-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
 
   ## OpenAI
   - name: huggingface/gpt2
diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index b860defc29a..242702906a6 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -1082,7 +1082,16 @@ models:
     access: open
     num_parameters: 7000000000
     release_date: 2023-10-05
-    tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_TAG]
+    tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG]
+
+  - name: microsoft/llava-1.5-13b-hf
+    display_name: LLaVA 1.5 (13B)
+    description: LLaVa is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data.
+    creator_organization_name: Microsoft
+    access: open
+    num_parameters: 13000000000
+    release_date: 2023-10-05
+    tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG]
 
 
 
@@ -1124,6 +1133,15 @@ models:
     release_date: 2023-12-08
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  - name: microsoft/bakLlava-v1-hf
+    display_name: BakLLaVA v1 (7B)
+    description: BakLLaVA v1 is a Mistral 7B base augmented with the LLaVA 1.5 architecture. ([blog](https://huggingface.co/llava-hf/bakLlava-v1-hf))
+    creator_organization_name: Mistral AI
+    access: open
+    num_parameters: 7000000000
+    release_date: 2023-10-16
+    tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG]
+
 
 
   # MosaicML
From bb16b16a6d6fe3684047a5eddbc46600ef99267d Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 11:02:16 -0800
Subject: [PATCH 16/21] Fix typo for bakllava

---
 src/helm/config/model_metadata.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index 242702906a6..e5b5f6b3cfe 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -1133,7 +1133,7 @@ models:
     release_date: 2023-12-08
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
-  - name: microsoft/bakLlava-v1-hf
+  - name: mistralai/bakLlava-v1-hf
     display_name: BakLLaVA v1 (7B)
     description: BakLLaVA v1 is a Mistral 7B base augmented with the LLaVA 1.5 architecture. ([blog](https://huggingface.co/llava-hf/bakLlava-v1-hf))
     creator_organization_name: Mistral AI

From 1ce0f479645781bf7526ef9071c1845de4fa8704 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 11:03:53 -0800
Subject: [PATCH 17/21] Add aliases for new models

---
 .../proxy/clients/vision_language/huggingface_vlm_client.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
index b9ce2da562a..a6986f19610 100644
--- a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
+++ b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
@@ -32,6 +32,8 @@ class HuggingFaceVLMClient(CachingClient):
     _models: Dict[str, ImageToTextPipeline] = {}
     _models_aliases: Dict[str, str] = {
         "huggingface/llava-1.5-7b-hf": "llava-hf/llava-1.5-7b-hf",
+        "huggingface/llava-1.5-13b-hf": "llava-hf/llava-1.5-13b-hf",
+        "huggingface/bakLlava-v1-hf": "llava-hf/bakLlava-v1-hf",
     }
 
     def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):

From 2e79b69337b66e520b33c9cf8224a408db6e4200 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 11:08:31 -0800
Subject: [PATCH 18/21] Debugging Run expander

---
 src/helm/benchmark/run_expander.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index 2a0ca440d7a..efce7059f56 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -405,6 +405,7 @@ class LlavaRunExpander(RunExpander):
     name = "llava"
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        print("=============\n===========\n================\nLLAVA\n===============\n===============\n===============")
         return [
             replace(
                 run_spec,
@@ -412,7 +413,9 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
                 adapter_spec=replace(
                     run_spec.adapter_spec,
                     input_prefix="USER: ",
+                    input_suffix="",
                     output_prefix="\nASSISTANT: ",
+                    output_suffix="",
                 ),
             ),
         ]

From d46b5296ba0f26d6b762f2293d775291c44aaf12 Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 11:15:58 -0800
Subject: [PATCH 19/21] Fix prefix

---
 src/helm/benchmark/run_expander.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index efce7059f56..afb68dd1e8b 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -405,7 +405,6 @@ class LlavaRunExpander(RunExpander):
     name = "llava"
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        print("=============\n===========\n================\nLLAVA\n===============\n===============\n===============")
         return [
             replace(
                 run_spec,

From 00decc67a8d9b79109dc1d3ba503af30cc3cf68e Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 11:16:49 -0800
Subject: [PATCH 20/21] Fix prefix 2

---
 .../proxy/clients/vision_language/huggingface_vlm_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
index a6986f19610..db7cb8aec7c 100644
--- a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
+++ b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
@@ -52,7 +52,7 @@ def make_request(self, request: Request) -> RequestResult:
         assert request.multimodal_prompt is not None, "Multimodal prompt is required"
 
         # Build the prompt
-        prompt: str = ""
+        prompt: str = "<image>"
         image: Optional[Image.Image] = None
         for media_object in request.multimodal_prompt.media_objects:
             if media_object.is_type("image") and media_object.location:

From a8ffc01d33bb0b3fc82467a6fc94b29838f4e97c Mon Sep 17 00:00:00 2001
From: JosselinSomervilleRoberts
Date: Sat, 13 Jan 2024 13:31:22 -0800
Subject: [PATCH 21/21] Changes requested

---
 src/helm/config/model_metadata.yaml           |  4 ++--
 .../vision_language/huggingface_vlm_client.py |  1 +
 test_llava.py                                 | 21 ---------------------
 test_llava_quantized.py                       | 24 ------------------------
 4 files changed, 3 insertions(+), 47 deletions(-)
 delete mode 100644 test_llava.py
 delete mode 100644 test_llava_quantized.py

diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index e5b5f6b3cfe..87f321e8acd 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -1077,7 +1077,7 @@ models:
 
   - name: microsoft/llava-1.5-7b-hf
     display_name: LLaVA 1.5 (7B)
-    description: LLaVa is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data.
+    description: LLaVa is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
     creator_organization_name: Microsoft
     access: open
     num_parameters: 7000000000
@@ -1086,7 +1086,7 @@ models:
 
   - name: microsoft/llava-1.5-13b-hf
     display_name: LLaVA 1.5 (13B)
-    description: LLaVa is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data.
+    description: LLaVa is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
     creator_organization_name: Microsoft
     access: open
     num_parameters: 13000000000
diff --git a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
index db7cb8aec7c..8cedd5399f0 100644
--- a/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
+++ b/src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
@@ -56,6 +56,7 @@ def make_request(self, request: Request) -> RequestResult:
         image: Optional[Image.Image] = None
         for media_object in request.multimodal_prompt.media_objects:
             if media_object.is_type("image") and media_object.location:
+                # TODO #2235: Figure out if some HuggingFace models support multiple images
                 if image is not None:
                     raise ValueError("Only one image is supported in the multimodal prompt")
                 image = open_image(media_object.location)
diff --git a/test_llava.py b/test_llava.py
deleted file mode 100644
index 4abdc46efc8..00000000000
--- a/test_llava.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import requests
-from PIL import Image
-
-image_url = "https://llava-vl.github.io/static/images/view.jpg"
-image = Image.open(requests.get(image_url, stream=True).raw)
-
-
-import torch
-
-from transformers import pipeline
-
-model_id = "llava-hf/llava-1.5-7b-hf"
-
-pipe = pipeline("image-to-text", model=model_id)
-
-max_new_tokens = 200
-prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
-
-outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
-
-print(outputs[0]["generated_text"])
diff --git a/test_llava_quantized.py b/test_llava_quantized.py
deleted file mode 100644
index 8c34c304c04..00000000000
--- a/test_llava_quantized.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import requests
-from PIL import Image
-
-image_url = "https://llava-vl.github.io/static/images/view.jpg"
-image = Image.open(requests.get(image_url, stream=True).raw)
-
-
-import torch
-from transformers import BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
-
-from transformers import pipeline
-
-model_id = "llava-hf/llava-1.5-7b-hf"
-
-pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
-
-max_new_tokens = 200
-prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
-
-outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
-
-print(outputs[0]["generated_text"])
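With the standalone test scripts removed from the repository root, an equivalent smoke test can still be run directly against transformers; this sketch mirrors the deleted test_llava.py:

    import requests
    from PIL import Image
    from transformers import pipeline

    # Download the demo image used by the LLaVA project.
    image = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)

    pipe = pipeline("image-to-text", model="llava-hf/llava-1.5-7b-hf")
    prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    print(outputs[0]["generated_text"])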