Added Llava and BakLlava #2234

Merged

merged 22 commits into main from joss-llava on Jan 13, 2024

Commits (22)
9d632a2
First draft of the VLM Huggingface client
JosselinSomervilleRoberts Jan 13, 2024
1f58a5a
Added Llava 1.5 7B
JosselinSomervilleRoberts Jan 13, 2024
3a995e2
update requirements
JosselinSomervilleRoberts Jan 13, 2024
774e3d0
Add test scripts
JosselinSomervilleRoberts Jan 13, 2024
b629fab
Fix typo in requirements
JosselinSomervilleRoberts Jan 13, 2024
0f1abe2
Loosening huggingface-hub version
JosselinSomervilleRoberts Jan 13, 2024
e7e9263
Loosening tokenizers version
JosselinSomervilleRoberts Jan 13, 2024
80b5422
Loosening fsspec version
JosselinSomervilleRoberts Jan 13, 2024
0460376
Loosening HEIM's accelerate version
JosselinSomervilleRoberts Jan 13, 2024
2a39e4b
Loosening AlephAlpha's tokenizers version
JosselinSomervilleRoberts Jan 13, 2024
90b858c
Update model deployment name and alias for Llava
JosselinSomervilleRoberts Jan 13, 2024
387c83d
Update Llava's tag
JosselinSomervilleRoberts Jan 13, 2024
7f64441
Fix typo in model deployment for Llava
JosselinSomervilleRoberts Jan 13, 2024
e42bedd
Merge branch 'main' into joss-llava
JosselinSomervilleRoberts Jan 13, 2024
37d97a0
Add Llava Run Expander
JosselinSomervilleRoberts Jan 13, 2024
44de313
Add Llava 13b and bakllava
JosselinSomervilleRoberts Jan 13, 2024
bb16b16
Fix typo for bakllava
JosselinSomervilleRoberts Jan 13, 2024
1ce0f47
Add aliases for new models
JosselinSomervilleRoberts Jan 13, 2024
2e79b69
Debugging Run expander
JosselinSomervilleRoberts Jan 13, 2024
d46b529
Fix prefix
JosselinSomervilleRoberts Jan 13, 2024
00decc6
Fix prefix 2
JosselinSomervilleRoberts Jan 13, 2024
a8ffc01
Changes requested
JosselinSomervilleRoberts Jan 13, 2024
9 changes: 5 additions & 4 deletions requirements.txt
@@ -1,5 +1,6 @@
2captcha-python==1.1.3
absl-py==1.2.0
+accelerate==0.25.0
aiodns==3.0.0
aiohttp==3.8.5
aiohttp-retry==2.8.3
@@ -42,7 +43,7 @@ filelock==3.8.0
flake8==5.0.4
fonttools==4.37.4
frozenlist==1.3.1
-fsspec==2022.8.2
+fsspec>=2022.8.2
gdown==4.4.0
gevent==21.12.0
gin-config==0.5.0
@@ -56,7 +57,7 @@ greenlet==1.1.3
gunicorn==20.1.0
h11==0.14.0
httplib2==0.20.4
-huggingface-hub==0.15.1
+huggingface-hub>=0.15.1
icetk==0.0.4
identify==2.5.6
idna==3.4
@@ -159,15 +160,15 @@ thinc==8.1.12
threadpoolctl==3.1.0
tiktoken==0.3.3
tls-client==0.1.8
-tokenizers==0.13.3
+tokenizers>=0.13.3
toml==0.10.2
tomli==2.0.1
torch==1.12.1 ; sys_platform == "darwin"
torchvision==0.13.1 ; sys_platform == "darwin"
torch==1.12.1+cu113 ; sys_platform == "linux"
torchvision==0.13.1+cu113 ; sys_platform == "linux"
tqdm==4.64.1
-transformers==4.33.1
+transformers==4.36.0
trio==0.22.0
trio-websocket==0.9.2
typer==0.4.2
8 changes: 4 additions & 4 deletions setup.cfg
@@ -52,7 +52,7 @@ install_requires=
scikit-learn~=1.1.2

# Models and Metrics Extras
-transformers~=4.33.1 # For anthropic_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
+transformers~=4.36.0 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
# TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
torch>=1.12.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
torchvision>=0.13.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
@@ -98,7 +98,7 @@ cleva =
langdetect==1.0.9

images =
-accelerate~=0.23.0 # For the newer versions of Transformers
+accelerate~=0.25.0 # For the newer versions of Transformers
pillow~=9.4.0

mongo =
@@ -107,7 +107,7 @@ mongo =
# Model extras
aleph-alpha =
aleph-alpha-client~=2.14.0
-tokenizers~=0.13.3
+tokenizers>=0.13.3

anthropic =
anthropic~=0.2.5
@@ -142,7 +142,7 @@ heim =
gdown~=4.4.0

# HEIM models
-accelerate~=0.23.0
+accelerate~=0.25.0
diffusers~=0.24.0
jax~=0.4.13
jaxlib~=0.4.13
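The switch from compatible-release pins (`~=`, `==`) to floor pins (`>=`) above is what lets the newer transformers stack resolve. A minimal sketch of the difference, using the `packaging` library (assumed available; pip vendors it):

```python
# "~=X.Y.Z" (compatible release) allows only X.Y.* at or above X.Y.Z,
# while ">=X.Y.Z" allows any later version.
from packaging.specifiers import SpecifierSet

print("4.36.0" in SpecifierSet("~=4.33.1"))  # False: ~=4.33.1 means >=4.33.1,<4.34.0
print("4.36.0" in SpecifierSet(">=4.33.1"))  # True
print("0.15.0" in SpecifierSet("~=0.13.3"))  # False: capped at <0.14.0
print("0.15.0" in SpecifierSet(">=0.13.3"))  # True: the loosened pin admits it
```

In other words, `tokenizers~=0.13.3` would conflict with the newer tokenizers release that transformers 4.36.0 pulls in, while `tokenizers>=0.13.3` does not.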
2 changes: 2 additions & 0 deletions src/helm/benchmark/model_metadata_registry.py
@@ -54,6 +54,8 @@
VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
# IDEFICS require a special prompt format (see `IDEFICSInstructRunExpander`)
IDEFICS_INSTRUCT_MODEL_TAG: str = "IDEFICS_INSTRUCT_MODEL_TAG"
# Llava should use a special prompt format (see `LlavaRunExpander`)
LLAVA_MODEL_TAG: str = "LLAVA_MODEL_TAG"


# Frozen is set to false as the model_deployment_registry.py file
24 changes: 24 additions & 0 deletions src/helm/benchmark/run_expander.py
@@ -396,6 +396,30 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
]


class LlavaRunExpander(RunExpander):
"""
Custom prompt for Llava 1.5 models, which require a specific format.
See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
"""

name = "llava"

def expand(self, run_spec: RunSpec) -> List[RunSpec]:
return [
replace(
run_spec,
name=run_spec.name,
adapter_spec=replace(
run_spec.adapter_spec,
input_prefix="USER: <image>",
input_suffix="",
output_prefix="\nASSISTANT: ",
output_suffix="",
),
),
]


class FormatPromptRunExpander(RunExpander):
"""Adds a prefix and suffix to the prompt."""

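For context, here is a sketch (illustrative only, not HELM's actual plumbing) of the prompt shape the expander above produces: the adapter's prefixes and suffixes wrap the instance input so the final prompt matches the `USER: <image>...\nASSISTANT:` format Llava 1.5 expects.

```python
# Illustrative only: how the adapter prefixes/suffixes set by LlavaRunExpander
# combine around an instance input. Variable names here are hypothetical.
input_prefix = "USER: <image>"
input_suffix = ""
output_prefix = "\nASSISTANT: "

question = "What is shown in this image?"
prompt = input_prefix + question + input_suffix + output_prefix
print(repr(prompt))
# 'USER: <image>What is shown in this image?\nASSISTANT: '
```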
6 changes: 6 additions & 0 deletions src/helm/benchmark/run_specs.py
@@ -31,6 +31,7 @@
OpenAIRunExpander,
GoogleRunExpander,
IDEFICSInstructRunExpander,
LlavaRunExpander,
StopRunExpander,
ChatMLRunExpander,
IncreaseTemperatureRunExpander,
@@ -64,6 +65,7 @@
GOOGLE_PALM_2_MODEL_TAG,
GOOGLE_GEMINI_MODEL_TAG,
IDEFICS_INSTRUCT_MODEL_TAG,
LLAVA_MODEL_TAG,
NO_NEWLINES_TAG,
NLG_PREFIX_TAG,
CHATML_MODEL_TAG,
@@ -3087,6 +3089,10 @@ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))

# Llava
if LLAVA_MODEL_TAG in model.tags:
run_spec = singleton(LlavaRunExpander().expand(run_spec))

# For multiple choice
if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
23 changes: 23 additions & 0 deletions src/helm/config/model_deployments.yaml
@@ -450,6 +450,29 @@ model_deployments:
client_spec:
class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"

## Microsoft
- name: huggingface/llava-1.5-7b-hf
model_name: microsoft/llava-1.5-7b-hf
tokenizer_name: hf-internal-testing/llama-tokenizer
max_sequence_length: 2048
client_spec:
class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"

- name: huggingface/llava-1.5-13b-hf
model_name: microsoft/llava-1.5-13b-hf
tokenizer_name: hf-internal-testing/llama-tokenizer
max_sequence_length: 2048
client_spec:
class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"

## Mistral AI
- name: huggingface/bakLlava-v1-hf
model_name: mistralai/bakLlava-v1-hf
tokenizer_name: hf-internal-testing/llama-tokenizer
max_sequence_length: 2048
client_spec:
class_name: "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"

## OpenAI
- name: huggingface/gpt2
model_name: openai/gpt2
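Each deployment's `client_spec.class_name` is a dotted path that gets resolved to a client class at runtime. A minimal sketch of that kind of resolution (importlib-based; HELM's real object-spec machinery may differ):

```python
# Sketch of resolving a dotted class_name string to a class, as a
# client_spec implies. This is the general idea, not HELM's exact factory.
import importlib


def resolve_class(class_name: str) -> type:
    module_name, _, attr = class_name.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)


client_cls = resolve_class(
    "helm.proxy.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
)
```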
27 changes: 27 additions & 0 deletions src/helm/config/model_metadata.yaml
@@ -1075,6 +1075,24 @@ models:
release_date: 2022-01-28
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]

- name: microsoft/llava-1.5-7b-hf
display_name: LLaVA 1.5 (7B)
description: LLaVA is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
creator_organization_name: Microsoft
access: open
num_parameters: 7000000000
release_date: 2023-10-05
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG]

- name: microsoft/llava-1.5-13b-hf
display_name: LLaVA 1.5 (13B)
description: LLaVA is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
creator_organization_name: Microsoft
access: open
num_parameters: 13000000000
release_date: 2023-10-05
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG]



# 01.AI
@@ -1115,6 +1133,15 @@ models:
release_date: 2023-12-08
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

- name: mistralai/bakLlava-v1-hf
display_name: BakLLaVA v1 (7B)
description: BakLLaVA v1 is a Mistral 7B base augmented with the LLaVA 1.5 architecture. ([blog](https://huggingface.co/llava-hf/bakLlava-v1-hf))
creator_organization_name: Mistral AI
access: open
num_parameters: 7000000000
release_date: 2023-10-16
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG]



# MosaicML
106 changes: 106 additions & 0 deletions src/helm/proxy/clients/vision_language/huggingface_vlm_client.py
@@ -0,0 +1,106 @@
from threading import Lock
from typing import Dict, List, Optional

from transformers import pipeline
from transformers.pipelines import ImageToTextPipeline

from helm.common.cache import CacheConfig
from helm.common.images_utils import open_image
from helm.common.media_object import TEXT_TYPE
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.request import Request, RequestResult, Sequence, Token
from helm.common.tokenization_request import (
TokenizationRequest,
TokenizationRequestResult,
)
from helm.common.request import wrap_request_time
from helm.proxy.clients.client import CachingClient, generate_uid_for_multimodal_prompt
from helm.proxy.tokenizers.tokenizer import Tokenizer

try:
from PIL import Image
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["images"])


class HuggingFaceVLMClient(CachingClient):
"""
General client for VLM models from HuggingFace.
"""

_models_lock: Lock = Lock()
_models: Dict[str, ImageToTextPipeline] = {}
_models_aliases: Dict[str, str] = {
"huggingface/llava-1.5-7b-hf": "llava-hf/llava-1.5-7b-hf",
"huggingface/llava-1.5-13b-hf": "llava-hf/llava-1.5-13b-hf",
"huggingface/bakLlava-v1-hf": "llava-hf/bakLlava-v1-hf",
}

def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
super().__init__(cache_config=cache_config)
self.tokenizer = tokenizer
self.tokenizer_name = tokenizer_name

def _get_model(self, model_name: str) -> ImageToTextPipeline:
with self._models_lock:
model_id: str = self._models_aliases.get(model_name, model_name)
if model_id not in self._models:
self._models[model_id] = pipeline("image-to-text", model=model_id)
return self._models[model_id]

def make_request(self, request: Request) -> RequestResult:
assert request.multimodal_prompt is not None, "Multimodal prompt is required"

# Build the prompt
prompt: str = ""
image: Optional[Image.Image] = None
for media_object in request.multimodal_prompt.media_objects:
if media_object.is_type("image") and media_object.location:
# TODO #2235: Figure out if some HuggingFace models support multiple images
if image is not None:
raise ValueError("Only one image is supported in the multimodal prompt")
image = open_image(media_object.location)
elif media_object.is_type(TEXT_TYPE):
if media_object.text is None:
raise ValueError("MediaObject of text type has missing text field value")
prompt += f"\n{media_object.text}"
else:
raise ValueError(f"Unsupported media object type: {media_object.type}")

# Generate
try:
generation_args = {
"max_new_tokens": request.max_tokens,
}

def do_it():
model: ImageToTextPipeline = self._get_model(request.model_deployment)
outputs = model(image, prompt=prompt, generate_kwargs=generation_args)
return outputs[0]

cache_key = CachingClient.make_cache_key(
raw_request={
"model": request.model,
"prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
**generation_args,
},
request=request,
)
result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
except RuntimeError as e:
return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])

tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
TokenizationRequest(result["generated_text"], tokenizer=self.tokenizer_name)
)
tokens: List[Token] = [
Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
]
completions: List[Sequence] = [Sequence(text=result["generated_text"], logprob=0, tokens=tokens)]
return RequestResult(
success=True,
cached=cached,
request_time=result["request_time"],
completions=completions,
embedding=[],
)
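Outside of HELM, the core call this client wraps can be reproduced directly with `transformers`. A standalone sketch, assuming a local image file (`example.jpg` is a placeholder) and enough memory for the 7B checkpoint:

```python
# Standalone sketch of the image-to-text pipeline call that
# HuggingFaceVLMClient.make_request wraps; not HELM code.
from PIL import Image
from transformers import pipeline

pipe = pipeline("image-to-text", model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")
# Same prompt shape that LlavaRunExpander sets up.
prompt = "USER: <image>\nWhat is shown in this image?\nASSISTANT: "
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 50})
print(outputs[0]["generated_text"])
```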